feat: add dynamic model registration support to TGI inference

add new overwrite_completion_id feature to OpenAIMixin to deal with TGI always returning id=""

test with -

tgi: `docker run --gpus all --shm-size 1g -p 8080:80 -v /data:/data ghcr.io/huggingface/text-generation-inference --model-id Qwen/Qwen3-0.6B`

stack: `TGI_URL=http://localhost:8080 uv run llama stack build --image-type venv --distro ci-tests --run`

test: `./scripts/integration-tests.sh --stack-config http://localhost:8321 --setup tgi --subdirs inference --pattern openai`
This commit is contained in:
Matthew Farrellee 2025-09-11 02:02:02 -04:00
parent d15368a302
commit c3fc859257
14 changed files with 12218 additions and 20 deletions

View file

@ -8,6 +8,7 @@
from collections.abc import AsyncGenerator
from huggingface_hub import AsyncInferenceClient, HfApi
from pydantic import SecretStr
from llama_stack.apis.common.content_types import (
InterleavedContent,
@ -33,6 +34,7 @@ from llama_stack.apis.inference import (
ToolPromptFormat,
)
from llama_stack.apis.models import Model
from llama_stack.apis.models.models import ModelType
from llama_stack.log import get_logger
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.datatypes import ModelsProtocolPrivate
@ -41,16 +43,15 @@ from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
OpenAIChatCompletionToLlamaStackMixin,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
OpenAICompletionToLlamaStackMixin,
get_sampling_options,
process_chat_completion_response,
process_chat_completion_stream_response,
process_completion_response,
process_completion_stream_response,
)
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_model_input_info,
completion_request_to_prompt_model_input_info,
@ -73,26 +74,49 @@ def build_hf_repo_model_entries():
class _HfAdapter(
OpenAIMixin,
Inference,
OpenAIChatCompletionToLlamaStackMixin,
OpenAICompletionToLlamaStackMixin,
ModelsProtocolPrivate,
):
client: AsyncInferenceClient
url: str
api_key: SecretStr
hf_client: AsyncInferenceClient
max_tokens: int
model_id: str
overwrite_completion_id = True # TGI always returns id=""
def __init__(self) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.huggingface_repo_to_llama_model_id = {
model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
}
def get_api_key(self) -> str:
    """Return the API key for the OpenAI-compatible client.

    For the TGI adapter this is a placeholder ("NO_KEY" is assigned in
    TGIAdapter.initialize), since a local TGI server needs no auth.
    """
    return self.api_key.get_secret_value()
def get_base_url(self) -> str:
    """Return the OpenAI-compatible base URL.

    Set by TGIAdapter.initialize to ``<config.url>/v1``; other adapters
    currently leave this unset (see the TODO notes in their initialize
    methods) — confirm before relying on it there.
    """
    return self.url
async def shutdown(self) -> None:
    """No-op shutdown hook; this adapter holds no resources requiring explicit teardown here."""
    pass
async def list_models(self) -> list[Model] | None:
    """List the models served by the endpoint.

    Iterates ``self.client.models.list()`` (the OpenAI-compatible client;
    for TGI this is typically a single model) and wraps each entry as a
    llama-stack ``Model``. The server-side id is used for both
    ``identifier`` and ``provider_resource_id``; every entry is assumed to
    be an LLM (``ModelType.llm``) — TGI does not serve embedding models
    through this endpoint, presumably.
    """
    models = []
    async for model in self.client.models.list():
        models.append(
            Model(
                identifier=model.id,
                provider_resource_id=model.id,
                provider_id=self.__provider_id__,
                metadata={},
                model_type=ModelType.llm,
            )
        )
    return models
async def register_model(self, model: Model) -> Model:
model = await self.register_helper.register_model(model)
if model.provider_resource_id != self.model_id:
raise ValueError(
f"Model {model.provider_resource_id} does not match the model {self.model_id} served by TGI."
@ -176,7 +200,7 @@ class _HfAdapter(
params = await self._get_params_for_completion(request)
async def _generate_and_convert_to_openai_compat():
s = await self.client.text_generation(**params)
s = await self.hf_client.text_generation(**params)
async for chunk in s:
token_result = chunk.token
finish_reason = None
@ -194,7 +218,7 @@ class _HfAdapter(
async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
params = await self._get_params_for_completion(request)
r = await self.client.text_generation(**params)
r = await self.hf_client.text_generation(**params)
choice = OpenAICompatCompletionChoice(
finish_reason=r.details.finish_reason,
@ -241,7 +265,7 @@ class _HfAdapter(
async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
params = await self._get_params(request)
r = await self.client.text_generation(**params)
r = await self.hf_client.text_generation(**params)
choice = OpenAICompatCompletionChoice(
finish_reason=r.details.finish_reason,
@ -256,7 +280,7 @@ class _HfAdapter(
params = await self._get_params(request)
async def _generate_and_convert_to_openai_compat():
s = await self.client.text_generation(**params)
s = await self.hf_client.text_generation(**params)
async for chunk in s:
token_result = chunk.token
@ -308,18 +332,21 @@ class TGIAdapter(_HfAdapter):
if not config.url:
raise ValueError("You must provide a URL in run.yaml (or via the TGI_URL environment variable) to use TGI.")
log.info(f"Initializing TGI client with url={config.url}")
self.client = AsyncInferenceClient(model=config.url, provider="hf-inference")
endpoint_info = await self.client.get_endpoint_info()
self.hf_client = AsyncInferenceClient(model=config.url, provider="hf-inference")
endpoint_info = await self.hf_client.get_endpoint_info()
self.max_tokens = endpoint_info["max_total_tokens"]
self.model_id = endpoint_info["model_id"]
self.url = f"{config.url.rstrip('/')}/v1"
self.api_key = SecretStr("NO_KEY")
class InferenceAPIAdapter(_HfAdapter):
async def initialize(self, config: InferenceAPIImplConfig) -> None:
self.client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
endpoint_info = await self.client.get_endpoint_info()
self.hf_client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
endpoint_info = await self.hf_client.get_endpoint_info()
self.max_tokens = endpoint_info["max_total_tokens"]
self.model_id = endpoint_info["model_id"]
# TODO: how do we set url for this?
class InferenceEndpointAdapter(_HfAdapter):
@ -331,6 +358,7 @@ class InferenceEndpointAdapter(_HfAdapter):
endpoint.wait(timeout=60)
# Initialize the adapter
self.client = endpoint.async_client
self.hf_client = endpoint.async_client
self.model_id = endpoint.repository
self.max_tokens = int(endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"])
# TODO: how do we set url for this?

View file

@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import uuid
from abc import ABC, abstractmethod
from collections.abc import AsyncIterator
from typing import Any
@ -43,6 +44,12 @@ class OpenAIMixin(ABC):
The model_store is set in routing_tables/common.py during provider initialization.
"""
# Allow subclasses to control whether the 'id' field in OpenAI responses
# is overwritten with a client-side generated id.
#
# This is useful for providers that do not return a unique id in the response.
overwrite_completion_id: bool = False
@abstractmethod
def get_api_key(self) -> str:
"""
@ -98,6 +105,23 @@ class OpenAIMixin(ABC):
raise ValueError(f"Model {model} has no provider_resource_id")
return model_obj.provider_resource_id
async def _maybe_overwrite_id(self, resp: Any, stream: bool | None) -> Any:
    """Replace the response ``id`` with a client-generated one when enabled.

    Providers that return an empty or non-unique ``id`` (e.g. TGI always
    returns ``id=""``) opt in by setting ``overwrite_completion_id = True``;
    otherwise the response is returned untouched.

    :param resp: the completion response, or an async iterator of chunks
        when ``stream`` is truthy
    :param stream: whether ``resp`` is a streaming response
    :returns: ``resp`` with its id rewritten (for streams, a wrapping
        async generator that rewrites each chunk's id)
    """
    if not self.overwrite_completion_id:
        return resp

    # One id per request: every chunk of a stream shares the same new_id.
    new_id = f"cltsd-{uuid.uuid4()}"
    if stream:

        async def _gen():
            async for chunk in resp:
                chunk.id = new_id
                yield chunk

        return _gen()
    else:
        resp.id = new_id
        return resp
async def openai_completion(
self,
model: str,
@ -130,7 +154,7 @@ class OpenAIMixin(ABC):
logger.warning("prompt_logprobs is not supported by the OpenAI API. Ignoring.")
# TODO: fix openai_completion to return type compatible with OpenAI's API response
return await self.client.completions.create( # type: ignore[no-any-return]
resp = await self.client.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
prompt=prompt,
@ -153,6 +177,8 @@ class OpenAIMixin(ABC):
)
)
return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return]
async def openai_chat_completion(
self,
model: str,
@ -182,8 +208,7 @@ class OpenAIMixin(ABC):
"""
Direct OpenAI chat completion API call.
"""
# Type ignore because return types are compatible
return await self.client.chat.completions.create( # type: ignore[no-any-return]
resp = await self.client.chat.completions.create(
**await prepare_openai_completion_params(
model=await self._get_provider_model_id(model),
messages=messages,
@ -211,6 +236,8 @@ class OpenAIMixin(ABC):
)
)
return await self._maybe_overwrite_id(resp, stream) # type: ignore[no-any-return]
async def openai_embeddings(
self,
model: str,

View file

@ -48,7 +48,6 @@ def skip_if_model_doesnt_support_openai_completion(client_with_models, model_id)
"remote::nvidia",
"remote::runpod",
"remote::sambanova",
"remote::tgi",
"remote::vertexai",
# {"error":{"message":"Unknown request URL: GET /openai/v1/completions. Please check the URL for typos,
# or see the docs at https://console.groq.com/docs/","type":"invalid_request_error","code":"unknown_url"}}
@ -96,6 +95,7 @@ def skip_if_doesnt_support_n(client_with_models, model_id):
"remote::vertexai",
# Error code: 400 - [{'error': {'code': 400, 'message': 'Unable to submit request because candidateCount must be 1 but
# the entered value was 2. Update the candidateCount value and try again.', 'status': 'INVALID_ARGUMENT'}
"remote::tgi", # TGI ignores n param silently
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support n param.")
@ -110,7 +110,6 @@ def skip_if_model_doesnt_support_openai_chat_completion(client_with_models, mode
"remote::cerebras",
"remote::databricks",
"remote::runpod",
"remote::tgi",
):
pytest.skip(f"Model {model_id} hosted by {provider.provider_type} doesn't support OpenAI chat completions.")

View file

@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Hello, world!"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user just said \"Hello, world!\" so I need to respond in a friendly way. My prompt says to respond in the same style, so I should start with \"Hello, world!\" but maybe add some helpful information. Let me think. Since the user is probably testing or just sharing, a simple \"Hello, world!\" with a question would be best for user interaction. I'll make sure to keep it positive and open-ended.\n</think>\n\nHello, world! \ud83d\ude0a What do you need today?",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1757550395,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": {
"completion_tokens": 108,
"prompt_tokens": 12,
"total_tokens": 120,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Which planet has rings around it with a name starting with letter S?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, so the user is asking which planet has rings around it and its name starts with the letter S. Let me think... I know that the Sun is a star, not a planet. So the Moon is a natural satellite, which has the Moon's name and rings. But the Moon's name starts with M, not S. The Earth has the name Earth, but the rings aren't really around the Earth in any real sense. Mars has a thin ring of dust. Venus and Mercury don't have rings in the sense of planetary rings as we know. Wait, maybe the answer is the Moon, even though it's not the same as the name starting with S. But the question says a planet, so if there's a planet named S, that would be it. But actually, the only planet with rings is Jupiter. Wait, Jupiter has a famous system of rings. But why does the question mention a planet with a name starting with S? Maybe there's a trick. Let me double-check. Jupiter's name starts with J, so maybe the answer is Venus? But Venus doesn't have rings. Mercury, too, doesn't. The Moon, as a planet, a dwarf planet, and has rings. Despite the name, the rings are around it. So the answer would be the Moon. Therefore, the planet with rings and name starting with S is the Moon.\n</think>\n\nThe planet with rings around it and a name starting with the letter **S** is the **Moon**. Though its name doesn't start with an **S**, it is technically a dwarf planet and has the rings in its orbit. Oops Saturn!",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1757550394,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": {
"completion_tokens": 336,
"prompt_tokens": 22,
"total_tokens": 358,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,366 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": true,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": [
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "{",
"name": "get_weather"
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": " \"",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "c",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "ity",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "\":",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": " \"",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "Tok",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "yo",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.chat.chat_completion_chunk.ChatCompletionChunk",
"__data__": {
"id": "",
"choices": [
{
"delta": {
"content": null,
"function_call": null,
"refusal": null,
"role": "assistant",
"tool_calls": [
{
"index": 0,
"id": "0",
"function": {
"arguments": "\"}",
"name": null
},
"type": "function"
}
]
},
"finish_reason": null,
"index": 0,
"logprobs": null
}
],
"created": 1757550392,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion.chunk",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
}
],
"is_streaming": true
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,976 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"prompt": "Respond to this question and explain your answer. Complete the sentence using one word: Roses are red, violets are ",
"max_tokens": 50,
"stream": true
},
"endpoint": "/v1/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": [
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " several"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " several"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " times"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " more"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " popular"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " than"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " ____"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": ".\n"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "Answer"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": ":\n\n"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "The"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " roses"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " red"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": ","
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " v"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "io"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "lets"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " are"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " several"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " several"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " times"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " more"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " popular"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " than"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " **"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "numbers"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "**"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": ".\n\n"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "Explanation"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": ":"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " \""
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "se"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "veral"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " several"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " times"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " more"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " popular"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " than"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "\""
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " can"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " be"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " replaced"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " with"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " \""
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "numbers"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": "\""
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " as"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "",
"index": 0,
"logprobs": null,
"text": " the"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": null
}
},
{
"__type__": "openai.types.completion.Completion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "length",
"index": 0,
"logprobs": null,
"text": " number"
}
],
"created": 1757550367,
"model": "Qwen/Qwen3-0.6B",
"object": "text_completion",
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": {
"completion_tokens": 50,
"prompt_tokens": 25,
"total_tokens": 75,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
}
],
"is_streaming": true
}
}

View file

@ -0,0 +1,84 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "What's the weather in Tokyo? Use the get_weather function to get the weather."
}
],
"stream": false,
"tools": [
{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the weather in a given city",
"parameters": {
"type": "object",
"properties": {
"city": {
"type": "string",
"description": "The city to get the weather for"
}
}
}
}
}
]
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": null,
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": [
{
"id": "0",
"function": {
"arguments": "{\"city\":\"Tokyo\"}",
"name": "get_weather",
"description": null
},
"type": "function"
}
]
}
}
],
"created": 1757550396,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": {
"completion_tokens": 19,
"prompt_tokens": 239,
"total_tokens": 258,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

View file

@ -0,0 +1,56 @@
{
"request": {
"method": "POST",
"url": "http://localhost:8080/v1/v1/chat/completions",
"headers": {},
"body": {
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
"content": "Which planet do humans live on?"
}
],
"stream": false
},
"endpoint": "/v1/chat/completions",
"model": "Qwen/Qwen3-0.6B"
},
"response": {
"body": {
"__type__": "openai.types.chat.chat_completion.ChatCompletion",
"__data__": {
"id": "",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"logprobs": null,
"message": {
"content": "<think>\nOkay, the user is asking which planet humans live on. I need to make sure I answer this accurately. First, I should recall what I know about our solar system. The Earth is our home, and it's in our solar system. There are eight planets in total, right? Let me check that. Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, and Neptune. Yep, that's the list.\n\nBut wait, the user might be confusing Earth with Mars. I should clarify that Earth is the only planet known to support life. The other planets are mostly gas giants and have no liquid water, so they don't support life as Earth does. So the answer should be Earth. I should also mention that although there are other planets, none have liquid water, which makes the answer more complete.\n\nI need to make sure there are no alternatives. Maybe some people might think Mars, but I know that's not the case. Also, it's good to mention that life on Earth is closely linked to the presence of water, which is why Earth is our only planet with that characteristic. That way, the answer is not only accurate but also informative.\n</think>\n\nHumans live on **Earth**, the planet that supports life as we know it. The Earth is the only known planet in our solar system where liquid water exists and where life can occur. Other planets are considered \"gas giants\" or \"ice giants\" due to their extreme conditions and lack of liquid water, making them inhospitable for life.",
"refusal": null,
"role": "assistant",
"annotations": null,
"audio": null,
"function_call": null,
"tool_calls": null
}
}
],
"created": 1757550390,
"model": "Qwen/Qwen3-0.6B",
"object": "chat.completion",
"service_tier": null,
"system_fingerprint": "3.3.5-dev0-sha-1b90c50",
"usage": {
"completion_tokens": 312,
"prompt_tokens": 15,
"total_tokens": 327,
"completion_tokens_details": null,
"prompt_tokens_details": null
}
}
},
"is_streaming": false
}
}

File diff suppressed because it is too large Load diff

View file

@ -90,6 +90,16 @@ SETUP_DEFINITIONS: dict[str, Setup] = {
"embedding_model": "sentence-transformers/all-MiniLM-L6-v2",
},
),
"tgi": Setup(
name="tgi",
description="Text Generation Inference (TGI) provider with a text model",
env={
"TGI_URL": "http://localhost:8080",
},
defaults={
"text_model": "tgi/Qwen/Qwen3-0.6B",
},
),
}