chore: turn OpenAIMixin into a pydantic.BaseModel (#3671)

# What does this PR do?

- implement get_api_key instead of relying on
LiteLLMOpenAIMixin.get_api_key
 - remove use of LiteLLMOpenAIMixin
 - add default initialize/shutdown methods to OpenAIMixin
 - remove __init__s to allow proper pydantic construction
- remove dead code from vllm adapter and associated / duplicate unit
tests
 - update vllm adapter to use openaimixin for model registration
 - remove ModelRegistryHelper from fireworks & together adapters
 - remove Inference from nvidia adapter
 - complete type hints on embedding_model_metadata
- allow extra fields on OpenAIMixin, for model_store, __provider_id__,
etc
 - new recordings for ollama
 - enhance the list models error handling
- update cerebras (remove cerebras-cloud-sdk) and anthropic (custom
model listing) inference adapters
 - parametrized test_inference_client_caching
- remove cerebras, databricks, fireworks, together from blanket mypy
exclude
 - removed unnecessary litellm deps

## Test Plan

ci
This commit is contained in:
Matthew Farrellee 2025-10-06 11:33:19 -04:00 committed by GitHub
parent 724dac498c
commit d23ed26238
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
131 changed files with 83634 additions and 1760 deletions

View file

@ -6,39 +6,14 @@
from urllib.parse import urljoin
from cerebras.cloud.sdk import AsyncCerebras
from llama_stack.apis.inference import (
ChatCompletionRequest,
CompletionRequest,
Inference,
OpenAIEmbeddingsResponse,
TopKSamplingStrategy,
)
from llama_stack.providers.utils.inference.openai_compat import (
get_sampling_options,
)
from llama_stack.apis.inference import OpenAIEmbeddingsResponse
from llama_stack.providers.utils.inference.openai_mixin import OpenAIMixin
from llama_stack.providers.utils.inference.prompt_adapter import (
chat_completion_request_to_prompt,
completion_request_to_prompt,
)
from .config import CerebrasImplConfig
class CerebrasInferenceAdapter(
OpenAIMixin,
Inference,
):
def __init__(self, config: CerebrasImplConfig) -> None:
self.config = config
# TODO: make this use provider data, etc. like other providers
self._cerebras_client = AsyncCerebras(
base_url=self.config.base_url,
api_key=self.config.api_key.get_secret_value(),
)
class CerebrasInferenceAdapter(OpenAIMixin):
config: CerebrasImplConfig
def get_api_key(self) -> str:
return self.config.api_key.get_secret_value()
@ -46,31 +21,6 @@ class CerebrasInferenceAdapter(
def get_base_url(self) -> str:
return urljoin(self.config.base_url, "v1")
async def initialize(self) -> None:
return
async def shutdown(self) -> None:
pass
async def _get_params(self, request: ChatCompletionRequest | CompletionRequest) -> dict:
if request.sampling_params and isinstance(request.sampling_params.strategy, TopKSamplingStrategy):
raise ValueError("`top_k` not supported by Cerebras")
prompt = ""
if isinstance(request, ChatCompletionRequest):
prompt = await chat_completion_request_to_prompt(request, self.get_llama_model(request.model))
elif isinstance(request, CompletionRequest):
prompt = await completion_request_to_prompt(request)
else:
raise ValueError(f"Unknown request type {type(request)}")
return {
"model": request.model,
"prompt": prompt,
"stream": request.stream,
**get_sampling_options(request.sampling_params),
}
async def openai_embeddings(
self,
model: str,