Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-06-28 02:53:30 +00:00)
chore: Add OpenAI compatibility for vLLM embeddings
- Implement OpenAI-compatible embeddings endpoint in vLLM provider
- Support both float and base64 encoding formats
- Add proper error handling and response formatting

Signed-off-by: Varsha Prasad Narsing <varshaprasad96@gmail.com>
This commit is contained in:
parent 40e2c97915
commit e35e6eebfe
1 changed file with 35 additions and 1 deletion
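For context, this change lets clients request embeddings from a vLLM-backed model through the stack's OpenAI-compatible API. A minimal usage sketch with the openai Python client follows; the base URL, API path, and model id are illustrative assumptions, not values taken from this commit:

from openai import OpenAI

# Assumed Llama Stack server address, OpenAI-compatible path, and embedding
# model id; adjust all of these to your own deployment.
client = OpenAI(base_url="http://localhost:8321/v1/openai/v1", api_key="none")

# Default float encoding: each embedding is returned as a list of floats.
resp = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input=["first sentence", "second sentence"],
)
print(len(resp.data), len(resp.data[0].embedding))

# Explicitly request base64 encoding: the server returns each embedding as a
# base64-encoded packed payload instead of a float list.
resp_b64 = client.embeddings.create(
    model="all-MiniLM-L6-v2",
    input="first sentence",
    encoding_format="base64",
)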
@@ -38,7 +38,9 @@ from llama_stack.apis.inference import (
     JsonSchemaResponseFormat,
     LogProbConfig,
     Message,
+    OpenAIEmbeddingData,
     OpenAIEmbeddingsResponse,
+    OpenAIEmbeddingUsage,
     ResponseFormat,
     SamplingParams,
     TextTruncation,
@@ -536,7 +538,39 @@ class VLLMInferenceAdapter(Inference, ModelsProtocolPrivate):
         dimensions: int | None = None,
         user: str | None = None,
     ) -> OpenAIEmbeddingsResponse:
-        raise NotImplementedError()
+        self._lazy_initialize_client()
+        assert self.client is not None
+        model_obj = await self._get_model(model)
+        assert model_obj.model_type == ModelType.embedding
+
+        # Convert input to list if it's a string
+        input_list = [input] if isinstance(input, str) else input
+
+        # Call vLLM embeddings endpoint with encoding_format
+        response = await self.client.embeddings.create(
+            model=model_obj.provider_resource_id,
+            input=input_list,
+            dimensions=dimensions,
+            encoding_format=encoding_format,
+        )
+
+        # Convert response to OpenAI format
+        data = [
+            OpenAIEmbeddingData(
+                embedding=embedding_data.embedding,
+                index=i,
+            )
+            for i, embedding_data in enumerate(response.data)
+        ]
+
+        # Not returning actual token usage since vLLM doesn't provide it
+        usage = OpenAIEmbeddingUsage(prompt_tokens=-1, total_tokens=-1)
+
+        return OpenAIEmbeddingsResponse(
+            data=data,
+            model=model_obj.provider_resource_id,
+            usage=usage,
+        )

     async def openai_completion(
         self,
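The adapter above forwards encoding_format to vLLM and passes each returned embedding through unchanged, so with encoding_format="base64" the embedding field carries a base64 string rather than a list of floats. A minimal decoding sketch, assuming the usual OpenAI convention of base64-encoding a packed little-endian float32 array (the helper name is illustrative):

import base64
import struct


def decode_base64_embedding(payload: str) -> list[float]:
    # Decode the base64 payload, then unpack little-endian float32 values.
    raw = base64.b64decode(payload)
    count = len(raw) // 4
    return list(struct.unpack(f"<{count}f", raw))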