fixes and linting

This commit is contained in:
Hardik Shah 2025-03-28 18:33:36 -07:00
parent 021dd0d35d
commit 5251d2422d
8 changed files with 149 additions and 345 deletions

View file

@ -56,9 +56,7 @@ from .models import MODEL_ENTRIES
logger = get_logger(name=__name__, category="inference")
class FireworksInferenceAdapter(
ModelRegistryHelper, Inference, NeedsRequestProviderData
):
class FireworksInferenceAdapter(ModelRegistryHelper, Inference, NeedsRequestProviderData):
def __init__(self, config: FireworksImplConfig) -> None:
ModelRegistryHelper.__init__(self, MODEL_ENTRIES)
self.config = config
@ -70,9 +68,7 @@ class FireworksInferenceAdapter(
pass
def _get_api_key(self) -> str:
config_api_key = (
self.config.api_key.get_secret_value() if self.config.api_key else None
)
config_api_key = self.config.api_key.get_secret_value() if self.config.api_key else None
if config_api_key:
return config_api_key
else:
@ -112,9 +108,7 @@ class FireworksInferenceAdapter(
else:
return await self._nonstream_completion(request)
async def _nonstream_completion(
self, request: CompletionRequest
) -> CompletionResponse:
async def _nonstream_completion(self, request: CompletionRequest) -> CompletionResponse:
params = await self._get_params(request)
r = await self._get_client().completion.acreate(**params)
return process_completion_response(r)
@ -194,9 +188,7 @@ class FireworksInferenceAdapter(
else:
return await self._nonstream_chat_completion(request)
async def _nonstream_chat_completion(
self, request: ChatCompletionRequest
) -> ChatCompletionResponse:
async def _nonstream_chat_completion(self, request: ChatCompletionRequest) -> ChatCompletionResponse:
params = await self._get_params(request)
if "messages" in params:
r = await self._get_client().chat.completions.acreate(**params)
@ -204,9 +196,7 @@ class FireworksInferenceAdapter(
r = await self._get_client().completion.acreate(**params)
return process_chat_completion_response(r, request)
async def _stream_chat_completion(
self, request: ChatCompletionRequest
) -> AsyncGenerator:
async def _stream_chat_completion(self, request: ChatCompletionRequest) -> AsyncGenerator:
params = await self._get_params(request)
async def _to_async_generator():
@ -221,9 +211,7 @@ class FireworksInferenceAdapter(
async for chunk in process_chat_completion_stream_response(stream, request):
yield chunk
async def _get_params(
self, request: Union[ChatCompletionRequest, CompletionRequest]
) -> dict:
async def _get_params(self, request: Union[ChatCompletionRequest, CompletionRequest]) -> dict:
input_dict = {}
media_present = request_has_media(request)
@ -231,17 +219,12 @@ class FireworksInferenceAdapter(
if isinstance(request, ChatCompletionRequest):
if media_present or not llama_model:
input_dict["messages"] = [
await convert_message_to_openai_dict(m, download=True)
for m in request.messages
await convert_message_to_openai_dict(m, download=True) for m in request.messages
]
else:
input_dict["prompt"] = await chat_completion_request_to_prompt(
request, llama_model
)
input_dict["prompt"] = await chat_completion_request_to_prompt(request, llama_model)
else:
assert (
not media_present
), "Fireworks does not support media for Completion requests"
assert not media_present, "Fireworks does not support media for Completion requests"
input_dict["prompt"] = await completion_request_to_prompt(request)
# Fireworks always prepends with BOS
@ -253,9 +236,7 @@ class FireworksInferenceAdapter(
"model": request.model,
**input_dict,
"stream": request.stream,
**self._build_options(
request.sampling_params, request.response_format, request.logprobs
),
**self._build_options(request.sampling_params, request.response_format, request.logprobs),
}
logger.debug(f"params to fireworks: {params}")
@ -274,9 +255,9 @@ class FireworksInferenceAdapter(
kwargs = {}
if model.metadata.get("embedding_dimension"):
kwargs["dimensions"] = model.metadata.get("embedding_dimension")
assert all(
not content_has_media(content) for content in contents
), "Fireworks does not support media for embeddings"
assert all(not content_has_media(content) for content in contents), (
"Fireworks does not support media for embeddings"
)
response = self._get_client().embeddings.create(
model=model.provider_resource_id,
input=[interleaved_content_as_str(content) for content in contents],

View file

@ -5,10 +5,9 @@
# the root directory of this source tree.
import logging
from typing import AsyncGenerator, List, Optional
from huggingface_hub import AsyncInferenceClient, HfApi, InferenceClient
from huggingface_hub import AsyncInferenceClient, HfApi
from llama_stack.apis.common.content_types import (
InterleavedContent,
@ -16,12 +15,10 @@ from llama_stack.apis.common.content_types import (
)
from llama_stack.apis.inference import (
ChatCompletionRequest,
ChatCompletionResponse,
CompletionRequest,
EmbeddingsResponse,
EmbeddingTaskType,
Inference,
JsonSchemaResponseFormat,
LogProbConfig,
Message,
ResponseFormat,
@ -38,26 +35,20 @@ from llama_stack.log import get_logger
from llama_stack.models.llama.sku_list import all_registered_models
from llama_stack.providers.datatypes import ModelsProtocolPrivate
from llama_stack.providers.utils.inference.model_registry import (
build_hf_repo_model_entry,
ModelRegistryHelper,
build_hf_repo_model_entry,
)
from llama_stack.providers.utils.inference.openai_compat import (
convert_chat_completion_request_to_openai_params,
convert_completion_request_to_openai_params,
convert_message_to_openai_dict_new,
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
convert_tooldef_to_openai_tool,
get_sampling_options,
OpenAICompatCompletionChoice,
OpenAICompatCompletionResponse,
process_chat_completion_response,
process_chat_completion_stream_response,
convert_chat_completion_request_to_openai_params,
convert_openai_chat_completion_choice,
convert_openai_chat_completion_stream,
get_sampling_options,
process_completion_response,
process_completion_stream_response,
)
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_model_input_info,
completion_request_to_prompt_model_input_info,
)
@ -85,9 +76,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
def __init__(self) -> None:
self.register_helper = ModelRegistryHelper(build_hf_repo_model_entries())
self.huggingface_repo_to_llama_model_id = {
model.huggingface_repo: model.descriptor()
for model in all_registered_models()
if model.huggingface_repo
model.huggingface_repo: model.descriptor() for model in all_registered_models() if model.huggingface_repo
}
async def shutdown(self) -> None:
@ -114,7 +103,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
logprobs: Optional[LogProbConfig] = None,
) -> AsyncGenerator:
if response_format:
raise ValueError(f"TGI does not support Response Format for completions.")
raise ValueError("TGI does not support Response Format for completions.")
if sampling_params is None:
sampling_params = SamplingParams()
@ -166,17 +155,13 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
return options
async def _get_params_for_completion(self, request: CompletionRequest) -> dict:
prompt, input_tokens = await completion_request_to_prompt_model_input_info(
request
)
prompt, input_tokens = await completion_request_to_prompt_model_input_info(request)
return dict(
prompt=prompt,
stream=request.stream,
details=True,
max_new_tokens=self._get_max_new_tokens(
request.sampling_params, input_tokens
),
max_new_tokens=self._get_max_new_tokens(request.sampling_params, input_tokens),
stop_sequences=["<|eom_id|>", "<|eot_id|>"],
**self._build_options(request.sampling_params, request.response_format),
)
@ -185,16 +170,14 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
params = await self._get_params_for_completion(request)
async def _generate_and_convert_to_openai_compat():
s = self.client.text_generation(**params)
for chunk in s:
s = await self.client.text_generation(**params)
async for chunk in s:
token_result = chunk.token
finish_reason = None
if chunk.details:
finish_reason = chunk.details.finish_reason
choice = OpenAICompatCompletionChoice(
text=token_result.text, finish_reason=finish_reason
)
choice = OpenAICompatCompletionChoice(text=token_result.text, finish_reason=finish_reason)
yield OpenAICompatCompletionResponse(
choices=[choice],
)
@ -205,7 +188,7 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
async def _nonstream_completion(self, request: CompletionRequest) -> AsyncGenerator:
params = await self._get_params_for_completion(request)
r = self.client.text_generation(**params)
r = await self.client.text_generation(**params)
choice = OpenAICompatCompletionChoice(
finish_reason=r.details.finish_reason,
@ -234,9 +217,6 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
if sampling_params is None:
sampling_params = SamplingParams()
model = await self.model_store.get_model(model_id)
from rich.pretty import pprint
pprint(messages)
request = ChatCompletionRequest(
model=model.provider_resource_id,
messages=messages,
@ -250,18 +230,9 @@ class _HfAdapter(Inference, ModelsProtocolPrivate):
params = await convert_chat_completion_request_to_openai_params(request)
import json
# print(json.dumps(params, indent=2))
pprint(params)
response = self.client.chat.completions.create(**params)
response = await self.client.chat.completions.create(**params)
if stream:
return convert_openai_chat_completion_stream(
response, enable_incremental_tool_calls=True
)
return convert_openai_chat_completion_stream(response, enable_incremental_tool_calls=True)
else:
return convert_openai_chat_completion_choice(response.choices[0])
@ -281,18 +252,16 @@ class TGIAdapter(_HfAdapter):
logger.info(f"Initializing TGI client with url={config.url}")
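# NOTE(review): the sync client was originally used here because the TGI async
# client did not work well with proxies; this now uses AsyncInferenceClient, so
# confirm the proxy problem no longer applies (or restore the sync client)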
self.client = InferenceClient(model=f"{config.url}")
self.client = AsyncInferenceClient(model=f"{config.url}")
endpoint_info = self.client.get_endpoint_info()
endpoint_info = await self.client.get_endpoint_info()
self.max_tokens = endpoint_info["max_total_tokens"]
self.model_id = endpoint_info["model_id"]
class InferenceAPIAdapter(_HfAdapter):
async def initialize(self, config: InferenceAPIImplConfig) -> None:
self.client = AsyncInferenceClient(
model=config.huggingface_repo, token=config.api_token.get_secret_value()
)
self.client = AsyncInferenceClient(model=config.huggingface_repo, token=config.api_token.get_secret_value())
endpoint_info = await self.client.get_endpoint_info()
self.max_tokens = endpoint_info["max_total_tokens"]
self.model_id = endpoint_info["model_id"]
@ -310,6 +279,4 @@ class InferenceEndpointAdapter(_HfAdapter):
# Initialize the adapter
self.client = endpoint.async_client
self.model_id = endpoint.repository
self.max_tokens = int(
endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"]
)
self.max_tokens = int(endpoint.raw["model"]["image"]["custom"]["env"]["MAX_TOTAL_TOKENS"])