LiteLLM Minor Fixes & Improvements (09/24/2024) (#5880)

* LiteLLM Minor Fixes & Improvements (09/23/2024)  (#5842)

* feat(auth_utils.py): enable admin to allow client-side credentials to be passed

Makes it easier for devs to experiment with fine-tuned Fireworks AI models

* feat(router.py): allow setting configurable_clientside_auth_params for a model (a config sketch follows at the end of this commit message)

Closes https://github.com/BerriAI/litellm/issues/5843

* build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit

Fixes https://github.com/BerriAI/litellm/issues/5850

* fix(azure_ai/): support content list for azure ai

Fixes https://github.com/BerriAI/litellm/issues/4237

* fix(litellm_logging.py): always set saved_cache_cost

Set to 0 by default

* fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing

Handles calling 405b+ size models

* fix(slack_alerting.py): fix error alerting for failed spend tracking

Fixes regression with slack alerting error monitoring

* fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error

* docs(bedrock.md): add llama3-1 models

* test: fix tests

* fix(azure_ai/chat): fix transformation for azure ai calls

* feat(azure_ai/embed): Add azure ai embeddings support

Closes https://github.com/BerriAI/litellm/issues/5861

* fix(azure_ai/embed): enable async embedding

* feat(azure_ai/embed): support azure ai multimodal embeddings

* fix(azure_ai/embed): support async multi modal embeddings

* feat(together_ai/embed): support together ai embedding calls

* feat(rerank/main.py): log source documents for rerank endpoints to langfuse

Improves rerank endpoint logging

* fix(langfuse.py): support logging `/audio/speech` input to langfuse

* test(test_embedding.py): fix test

* test(test_completion_cost.py): fix helper util
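
A hedged config sketch for the client-side credential items above: it assumes `configurable_clientside_auth_params` is accepted under a deployment's `litellm_params` (the exact key placement is an assumption), and the model/account names are placeholders.

```python
# Sketch only: allow clients to pass their own api_key/api_base for this deployment.
# Key placement under litellm_params is an assumption; names below are placeholders.
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fireworks-finetune",
            "litellm_params": {
                "model": "fireworks_ai/accounts/my-account/models/my-finetune",
                "configurable_clientside_auth_params": ["api_key", "api_base"],
            },
        }
    ]
)
```
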
Krish Dholakia 2024-09-25 22:11:57 -07:00 committed by GitHub
parent 5bc5eaff8a
commit 16c0307eab
25 changed files with 1675 additions and 340 deletions

View file

@ -901,7 +901,7 @@ from .llms.cohere.completion import CohereConfig
from .llms.clarifai import ClarifaiConfig
from .llms.AI21.completion import AI21Config
from .llms.AI21.chat import AI21ChatConfig
from .llms.together_ai import TogetherAIConfig
from .llms.together_ai.chat import TogetherAIConfig
from .llms.cloudflare import CloudflareConfig
from .llms.palm import PalmConfig
from .llms.gemini import GeminiConfig

View file

@ -28,6 +28,7 @@ from litellm.llms.databricks.cost_calculator import (
from litellm.llms.fireworks_ai.cost_calculator import (
cost_per_token as fireworks_ai_cost_per_token,
)
from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
from litellm.rerank_api.types import RerankResponse
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@ -395,48 +396,6 @@ def cost_per_token(
)
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
@ -477,7 +436,7 @@ def _select_model_name_for_cost_calc(
if isinstance(completion_response, str):
return return_model
elif return_model is None:
elif return_model is None and hasattr(completion_response, "get"):
return_model = completion_response.get("model", "") # type: ignore
hidden_params = getattr(completion_response, "_hidden_params", None)
@ -716,7 +675,9 @@ def completion_cost(
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
model = get_model_params_and_category(model, call_type=CallTypes(call_type))
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (

View file

@ -204,6 +204,11 @@ class LangFuseLogger:
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.HttpxBinaryResponseContent
):
input = prompt
output = "speech-output"
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
@ -549,7 +554,10 @@ class LangFuseLogger:
generation_id = None
usage = None
if response_obj is not None:
if response_obj.get("id", None) is not None:
if (
hasattr(response_obj, "id")
and response_obj.get("id", None) is not None
):
generation_id = litellm.utils.get_logging_id(
start_time, response_obj
)
@ -571,8 +579,8 @@ class LangFuseLogger:
if _user_api_key_alias is not None:
generation_name = f"litellm:{_user_api_key_alias}"
if response_obj is not None and "system_fingerprint" in response_obj:
system_fingerprint = response_obj.get("system_fingerprint", None)
if response_obj is not None:
system_fingerprint = getattr(response_obj, "system_fingerprint", None)
else:
system_fingerprint = None

View file

@ -1215,7 +1215,6 @@ class OpenAIChatCompletion(BaseLLM):
client: Optional[AsyncOpenAI] = None,
max_retries=None,
):
response = None
try:
openai_aclient: AsyncOpenAI = self._get_openai_client( # type: ignore
is_async=True,
@ -1237,12 +1236,15 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
return convert_to_model_response_object(
returned_response: (
litellm.EmbeddingResponse
) = convert_to_model_response_object(
response_object=stringified_response,
model_response_object=model_response,
response_type="embedding",
_response_headers=headers,
) # type: ignore
return returned_response
except OpenAIError as e:
## LOGGING
logging_obj.post_call(
@ -1284,7 +1286,6 @@ class OpenAIChatCompletion(BaseLLM):
aembedding=None,
):
super().embedding()
exception_mapping_worked = False
try:
model = model
data = {"model": model, "input": input, **optional_params}
@ -1299,7 +1300,7 @@ class OpenAIChatCompletion(BaseLLM):
)
if aembedding is True:
response = self.aembedding(
async_response = self.aembedding(
data=data,
input=input,
logging_obj=logging_obj,
@ -1310,7 +1311,7 @@ class OpenAIChatCompletion(BaseLLM):
client=client,
max_retries=max_retries,
)
return response
return async_response
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
@ -1335,12 +1336,13 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=sync_embedding_response,
)
return convert_to_model_response_object(
response: litellm.EmbeddingResponse = convert_to_model_response_object(
response_object=sync_embedding_response.model_dump(),
model_response_object=model_response,
_response_headers=headers,
response_type="embedding",
) # type: ignore
return response
except OpenAIError as e:
raise e
except Exception as e:

View file

@ -0,0 +1,3 @@
from .chat.handler import AzureAIChatCompletion
from .embed.handler import AzureAIEmbedding
from .rerank.handler import AzureAIRerank

View file

View file

@ -0,0 +1,98 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Azure AI Cohere's /v1/embed.
Why separate file? Make it easy to see how transformation works
Converts to:
- Cohere request format
Docs - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html
"""
from typing import List, Optional, Tuple, Union
from litellm.types.llms.azure_ai import ImageEmbeddingInput, ImageEmbeddingRequest
from litellm.types.llms.openai import EmbeddingCreateParams
from litellm.types.utils import Embedding, EmbeddingResponse, Usage
from litellm.utils import is_base64_encoded
class AzureAICohereConfig:
def __init__(self) -> None:
pass
def _map_azure_model_group(self, model: str) -> str:
if "model=offer-cohere-embed-multili-paygo":
return "Cohere-embed-v3-multilingual"
elif "model=offer-cohere-embed-english-paygo":
return "Cohere-embed-v3-english"
return model
def _transform_request_image_embeddings(
self, input: List[str], optional_params: dict
) -> ImageEmbeddingRequest:
"""
Assume all str in list is base64 encoded string
"""
image_input: List[ImageEmbeddingInput] = []
for i in input:
embedding_input = ImageEmbeddingInput(image=i)
image_input.append(embedding_input)
return ImageEmbeddingRequest(input=image_input, **optional_params)
def _transform_request(
self, input: List[str], optional_params: dict, model: str
) -> Tuple[ImageEmbeddingRequest, EmbeddingCreateParams, List[int]]:
"""
Return the list of input to `/image/embeddings`, `/v1/embeddings`, list of image_embedding_idx for recombination
"""
image_embeddings: List[str] = []
image_embedding_idx: List[int] = []
for idx, i in enumerate(input):
"""
- is base64 -> route to image embeddings
- is ImageEmbeddingInput -> route to image embeddings
- else -> route to `/v1/embeddings`
"""
if is_base64_encoded(i):
image_embeddings.append(i)
image_embedding_idx.append(idx)
## REMOVE IMAGE EMBEDDINGS FROM input list
filtered_input = [
item for idx, item in enumerate(input) if idx not in image_embedding_idx
]
v1_embeddings_request = EmbeddingCreateParams(
input=filtered_input, model=model, **optional_params
)
image_embeddings_request = self._transform_request_image_embeddings(
input=image_embeddings, optional_params=optional_params
)
return image_embeddings_request, v1_embeddings_request, image_embedding_idx
def _transform_response(self, response: EmbeddingResponse) -> EmbeddingResponse:
additional_headers: Optional[dict] = response._hidden_params.get(
"additional_headers"
)
if additional_headers:
# CALCULATE USAGE
input_tokens: Optional[str] = additional_headers.get(
"llm_provider-num_tokens"
)
if input_tokens:
if response.usage:
response.usage.prompt_tokens = int(input_tokens)
else:
response.usage = Usage(prompt_tokens=int(input_tokens))
# SET MODEL
base_model: Optional[str] = additional_headers.get(
"llm_provider-azureml-model-group"
)
if base_model:
response.model = self._map_azure_model_group(base_model)
return response
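
A minimal sketch of how the request split above behaves, assuming the module path `litellm.llms.azure_ai.embed.cohere_transformation` added in this commit; the input strings are placeholders.

```python
# Sketch: base64 "data:" inputs route to /images/embeddings, plain text to /v1/embeddings.
from litellm.llms.azure_ai.embed.cohere_transformation import AzureAICohereConfig

inputs = [
    "good morning from litellm",                       # text -> /v1/embeddings
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==",  # base64 image -> /images/embeddings
]
image_req, text_req, image_idx = AzureAICohereConfig()._transform_request(
    input=inputs, optional_params={}, model="Cohere-embed-v3-multilingual"
)
print(image_idx)          # [1] - position of the image input in the original list
print(text_req["input"])  # ['good morning from litellm']
```
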

View file

@ -0,0 +1,296 @@
import asyncio
import copy
import json
import os
from copy import deepcopy
from typing import Any, Callable, List, Literal, Optional, Tuple, Union
import httpx
from openai import OpenAI
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_httpx_client,
get_async_httpx_client,
)
from litellm.llms.OpenAI.openai import OpenAIChatCompletion
from litellm.types.llms.azure_ai import ImageEmbeddingRequest
from litellm.types.utils import Embedding, EmbeddingResponse
from litellm.utils import convert_to_model_response_object, is_base64_encoded
from .cohere_transformation import AzureAICohereConfig
class AzureAIEmbedding(OpenAIChatCompletion):
def _process_response(
self,
image_embedding_responses: Optional[List],
text_embedding_responses: Optional[List],
image_embeddings_idx: List[int],
model_response: EmbeddingResponse,
input: List,
):
combined_responses = []
if (
image_embedding_responses is not None
and text_embedding_responses is not None
):
# Combine and order the results
text_idx = 0
image_idx = 0
for idx in range(len(input)):
if idx in image_embeddings_idx:
combined_responses.append(image_embedding_responses[image_idx])
image_idx += 1
else:
combined_responses.append(text_embedding_responses[text_idx])
text_idx += 1
model_response.data = combined_responses
elif image_embedding_responses is not None:
model_response.data = image_embedding_responses
elif text_embedding_responses is not None:
model_response.data = text_embedding_responses
response = AzureAICohereConfig()._transform_response(response=model_response) # type: ignore
return response
async def async_image_embedding(
self,
model: str,
data: ImageEmbeddingRequest,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str],
api_base: Optional[str],
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> EmbeddingResponse:
if client is None or not isinstance(client, AsyncHTTPHandler):
client = AsyncHTTPHandler(timeout=timeout, concurrent_limit=1)
url = "{}/images/embeddings".format(api_base)
response = await client.post(
url=url,
json=data, # type: ignore
headers={"Authorization": "Bearer {}".format(api_key)},
)
embedding_response = response.json()
embedding_headers = dict(response.headers)
returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore
response_object=embedding_response,
model_response_object=model_response,
response_type="embedding",
stream=False,
_response_headers=embedding_headers,
)
return returned_response
def image_embedding(
self,
model: str,
data: ImageEmbeddingRequest,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str],
api_base: Optional[str],
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
if api_base is None:
raise ValueError(
"api_base is None. Please set AZURE_AI_API_BASE or dynamically via `api_base` param, to make the request."
)
if api_key is None:
raise ValueError(
"api_key is None. Please set AZURE_AI_API_KEY or dynamically via `api_key` param, to make the request."
)
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout, concurrent_limit=1)
url = "{}/images/embeddings".format(api_base)
response = client.post(
url=url,
json=data, # type: ignore
headers={"Authorization": "Bearer {}".format(api_key)},
)
embedding_response = response.json()
embedding_headers = dict(response.headers)
returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore
response_object=embedding_response,
model_response_object=model_response,
response_type="embedding",
stream=False,
_response_headers=embedding_headers,
)
return returned_response
async def async_embedding(
self,
model: str,
input: List,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
) -> EmbeddingResponse:
(
image_embeddings_request,
v1_embeddings_request,
image_embeddings_idx,
) = AzureAICohereConfig()._transform_request(
input=input, optional_params=optional_params, model=model
)
image_embedding_responses: Optional[List] = None
text_embedding_responses: Optional[List] = None
if image_embeddings_request["input"]:
image_response = await self.async_image_embedding(
model=model,
data=image_embeddings_request,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
)
image_embedding_responses = image_response.data
if image_embedding_responses is None:
raise Exception("/image/embeddings route returned None Embeddings.")
if v1_embeddings_request["input"]:
response: EmbeddingResponse = await super().embedding( # type: ignore
model=model,
input=input,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
aembedding=True,
)
text_embedding_responses = response.data
if text_embedding_responses is None:
raise Exception("/v1/embeddings route returned None Embeddings.")
return self._process_response(
image_embedding_responses=image_embedding_responses,
text_embedding_responses=text_embedding_responses,
image_embeddings_idx=image_embeddings_idx,
model_response=model_response,
input=input,
)
def embedding(
self,
model: str,
input: List,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
aembedding=None,
):
"""
- Separate image url from text
-> route image url call to `/image/embeddings`
-> route text call to `/v1/embeddings` (OpenAI route)
assemble result in-order, and return
"""
if aembedding is True:
return self.async_embedding(
model,
input,
timeout,
logging_obj,
model_response,
optional_params,
api_key,
api_base,
client,
)
(
image_embeddings_request,
v1_embeddings_request,
image_embeddings_idx,
) = AzureAICohereConfig()._transform_request(
input=input, optional_params=optional_params, model=model
)
image_embedding_responses: Optional[List] = None
text_embedding_responses: Optional[List] = None
if image_embeddings_request["input"]:
image_response = self.image_embedding(
model=model,
data=image_embeddings_request,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
)
image_embedding_responses = image_response.data
if image_embedding_responses is None:
raise Exception("/image/embeddings route returned None Embeddings.")
if v1_embeddings_request["input"]:
response: EmbeddingResponse = super().embedding( # type: ignore
model,
input,
timeout,
logging_obj,
model_response,
optional_params,
api_key,
api_base,
client=(
client
if client is not None and isinstance(client, OpenAI)
else None
),
aembedding=aembedding,
)
text_embedding_responses = response.data
if text_embedding_responses is None:
raise Exception("/v1/embeddings route returned None Embeddings.")
return self._process_response(
image_embedding_responses=image_embedding_responses,
text_embedding_responses=text_embedding_responses,
image_embeddings_idx=image_embeddings_idx,
model_response=model_response,
input=input,
)
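
A hedged end-to-end usage sketch for the handler above, mirroring `test_azure_ai_embedding_image`; the endpoint, key, and base64 payload are placeholders.

```python
# Sketch: one call mixing text and a base64 image; results come back in input order.
import litellm

response = litellm.embedding(
    model="azure_ai/Cohere-embed-v3-multilingual",
    input=[
        "good morning from litellm",                       # -> /v1/embeddings
        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==",  # -> /images/embeddings
    ],
    api_base="https://my-cohere-deployment.eastus2.models.ai.azure.com",  # placeholder
    api_key="my-azure-ai-key",                                            # placeholder
)
print(len(response.data))  # 2 - one embedding per input, recombined in order
```
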

View file

@ -1,239 +0,0 @@
"""
Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import json
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class TogetherAIError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
method="POST", url="https://api.together.xyz/inference"
)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class TogetherAIConfig:
"""
Reference: https://docs.together.ai/reference/inference
The class `TogetherAIConfig` provides configuration for the TogetherAI's API interface. Here are the parameters:
- `max_tokens` (int32, required): The maximum number of tokens to generate.
- `stop` (string, optional): A string sequence that will truncate (stop) the inference text output. For example, "\n\n" will stop generation as soon as the model generates two newlines.
- `temperature` (float, optional): A decimal number that determines the degree of randomness in the response. A value of 1 will always yield the same output. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value greater than 1 introduces more randomness in the output.
- `top_p` (float, optional): The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text.
- `top_k` (int32, optional): The `top_k` parameter is used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options.
- `repetition_penalty` (float, optional): A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition.
- `logprobs` (int32, optional): This parameter is not described in the prompt.
"""
max_tokens: Optional[int] = None
stop: Optional[str] = None
temperature: Optional[int] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
repetition_penalty: Optional[float] = None
logprobs: Optional[int] = None
def __init__(
self,
max_tokens: Optional[int] = None,
stop: Optional[str] = None,
temperature: Optional[int] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
repetition_penalty: Optional[float] = None,
logprobs: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
# def validate_environment(api_key):
# if api_key is None:
# raise ValueError(
# "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
# )
# headers = {
# "accept": "application/json",
# "content-type": "application/json",
# "Authorization": "Bearer " + api_key,
# }
# return headers
# def completion(
# model: str,
# messages: list,
# api_base: str,
# model_response: ModelResponse,
# print_verbose: Callable,
# encoding,
# api_key,
# logging_obj,
# custom_prompt_dict={},
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# headers = validate_environment(api_key)
# ## Load Config
# config = litellm.TogetherAIConfig.get_config()
# for k, v in config.items():
# if (
# k not in optional_params
# ): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
# optional_params[k] = v
# print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details.get("roles", {}),
# initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
# final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
# bos_token=model_prompt_details.get("bos_token", ""),
# eos_token=model_prompt_details.get("eos_token", ""),
# messages=messages,
# )
# else:
# prompt = prompt_factory(
# model=model,
# messages=messages,
# api_key=api_key,
# custom_llm_provider="together_ai",
# ) # api key required to query together ai model list
# data = {
# "model": model,
# "prompt": prompt,
# "request_type": "language-model-inference",
# **optional_params,
# }
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key=api_key,
# additional_args={
# "complete_input_dict": data,
# "headers": headers,
# "api_base": api_base,
# },
# )
# ## COMPLETION CALL
# if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
# response = requests.post(
# api_base,
# headers=headers,
# data=json.dumps(data),
# stream=optional_params["stream_tokens"],
# )
# return response.iter_lines()
# else:
# response = requests.post(api_base, headers=headers, data=json.dumps(data))
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# if response.status_code != 200:
# raise TogetherAIError(
# status_code=response.status_code, message=response.text
# )
# completion_response = response.json()
# if "error" in completion_response:
# raise TogetherAIError(
# message=json.dumps(completion_response),
# status_code=response.status_code,
# )
# elif "error" in completion_response["output"]:
# raise TogetherAIError(
# message=json.dumps(completion_response["output"]),
# status_code=response.status_code,
# )
# if len(completion_response["output"]["choices"][0]["text"]) >= 0:
# model_response.choices[0].message.content = completion_response["output"][
# "choices"
# ][0]["text"]
# ## CALCULATING USAGE
# print_verbose(
# f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
# )
# prompt_tokens = len(encoding.encode(prompt))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
# if "finish_reason" in completion_response["output"]["choices"][0]:
# model_response.choices[0].finish_reason = completion_response["output"][
# "choices"
# ][0]["finish_reason"]
# model_response["created"] = int(time.time())
# model_response["model"] = "together_ai/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass

View file

@ -0,0 +1,13 @@
"""
Support for OpenAI's `/v1/chat/completions` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""
from ..OpenAI.openai import OpenAIConfig
class TogetherAIConfig(OpenAIConfig):
pass
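
A hedged usage sketch for the OpenAI-compatible route above; it assumes `TOGETHERAI_API_KEY` is set in the environment.

```python
# Sketch: Together AI chat goes through the shared OpenAI handler.
import litellm

response = litellm.completion(
    model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```
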

View file

@ -0,0 +1,7 @@
"""
Support for OpenAI's `/v1/completions` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""

View file

@ -0,0 +1,79 @@
"""
Handles calculating cost for together ai models
"""
import re
from litellm.types.utils import CallTypes
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
if call_type == CallTypes.embedding or call_type == CallTypes.aembedding:
return get_model_params_and_category_embeddings(model_name=model_name)
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_model_params_and_category_embeddings(model_name) -> str:
"""
Helper function for calculating together ai embedding pricing.
Returns
- str - model pricing category if mapped else received model name
"""
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+m)", model_name
) # catch all decimals like 100m, 200m, etc.
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("m", "")
if params_match is not None:
params_million = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_million <= 150:
category = "together-ai-embedding-up-to-150m"
elif params_million <= 350:
category = "together-ai-embedding-151m-to-350m"
if category is not None:
return category
return model_name
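
A quick sketch of the category mapping above, using the module path shown in the `cost_calculator.py` import earlier in this commit.

```python
from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
from litellm.types.utils import CallTypes

# 72B chat model -> 41.1b-80b pricing bucket
print(get_model_params_and_category("qwen/Qwen2-72B-Instruct", call_type=CallTypes.completion))
# together-ai-41.1b-80b

# 80M embedding model -> up-to-150m embedding bucket
print(get_model_params_and_category(
    "togethercomputer/m2-bert-80M-8k-retrieval", call_type=CallTypes.aembedding
))
# together-ai-embedding-up-to-150m
```
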

View file

@ -0,0 +1,7 @@
"""
Support for OpenAI's `/v1/embeddings` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""

View file

@ -83,7 +83,7 @@ from .llms import (
from .llms.AI21 import completion as ai21
from .llms.anthropic.chat import AnthropicChatCompletion
from .llms.anthropic.completion import AnthropicTextCompletion
from .llms.azure_ai.chat.handler import AzureAIChatCompletion
from .llms.azure_ai import AzureAIChatCompletion, AzureAIEmbedding
from .llms.azure_text import AzureTextCompletion
from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
@ -168,6 +168,7 @@ openai_o1_chat_completions = OpenAIO1ChatCompletion()
openai_audio_transcriptions = OpenAIAudioTranscription()
databricks_chat_completions = DatabricksChatCompletion()
azure_ai_chat_completions = AzureAIChatCompletion()
azure_ai_embedding = AzureAIEmbedding()
anthropic_chat_completions = AnthropicChatCompletion()
anthropic_text_completions = AnthropicTextCompletion()
azure_chat_completions = AzureChatCompletion()
@ -3215,6 +3216,8 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse:
or custom_llm_provider == "cohere"
or custom_llm_provider == "huggingface"
or custom_llm_provider == "bedrock"
or custom_llm_provider == "azure_ai"
or custom_llm_provider == "together_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
@ -3385,6 +3388,9 @@ def embedding(
api_base=api_base,
api_key=api_key,
)
if dynamic_api_key is not None:
api_key = dynamic_api_key
optional_params = get_optional_params_embeddings(
model=model,
user=user,
@ -3481,7 +3487,9 @@ def embedding(
aembedding=aembedding,
)
elif (
model in litellm.open_ai_embedding_models or custom_llm_provider == "openai"
model in litellm.open_ai_embedding_models
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
):
api_base = (
api_base
@ -3832,6 +3840,33 @@ def embedding(
model_response=EmbeddingResponse(),
aembedding=aembedding,
)
elif custom_llm_provider == "azure_ai":
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("AZURE_AI_API_BASE")
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("AZURE_AI_API_KEY")
)
## EMBEDDING CALL
response = azure_ai_embedding.embedding(
model=model,
input=input,
api_base=api_base,
api_key=api_key,
logging_obj=logging,
timeout=timeout,
model_response=EmbeddingResponse(),
optional_params=optional_params,
client=client,
aembedding=aembedding,
)
else:
args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}")
@ -4901,7 +4936,11 @@ def speech(
aspeech: Optional[bool] = None,
**kwargs,
) -> HttpxBinaryResponseContent:
user = kwargs.get("user", None)
litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
tags = kwargs.pop("tags", [])
@ -4918,6 +4957,21 @@ def speech(
max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES
logging_obj = kwargs.get("litellm_logging_obj", None)
logging_obj.update_environment_variables(
model=model,
user=user,
optional_params={},
litellm_params={
"litellm_call_id": litellm_call_id,
"proxy_server_request": proxy_server_request,
"model_info": model_info,
"metadata": metadata,
"preset_cache_key": None,
"stream_response": {},
**kwargs,
},
custom_llm_provider=custom_llm_provider,
)
response: Optional[HttpxBinaryResponseContent] = None
if custom_llm_provider == "openai":
if voice is None or not (isinstance(voice, str)):
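
A hedged sketch of the `/audio/speech` path whose inputs are now logged (see the Langfuse change and the new `update_environment_variables` call above); `LANGFUSE_*` and `OPENAI_API_KEY` env vars are assumed to be set.

```python
import litellm

litellm.success_callback = ["langfuse"]  # speech input is now logged to Langfuse

audio = litellm.speech(
    model="openai/tts-1",
    voice="alloy",
    input="good morning from litellm",
)
with open("speech.mp3", "wb") as f:
    f.write(audio.content)  # HttpxBinaryResponseContent exposes the raw bytes
```
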

View file

@ -990,6 +990,26 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-english": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-multilingual": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -4953,50 +4973,71 @@
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-embedding-up-to-150m": {
"input_cost_per_token": 0.000000008,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together-ai-embedding-151m-to-350m": {
"input_cost_per_token": 0.000000016,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/togethercomputer/CodeLlama-34b-Instruct": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"ollama/codegemma": {
"max_tokens": 8192,

View file

@ -8,7 +8,7 @@ from litellm._logging import verbose_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.azure_ai.rerank import AzureAIRerank
from litellm.llms.cohere.rerank import CohereRerank
from litellm.llms.togetherai.rerank import TogetherAIRerank
from litellm.llms.together_ai.rerank import TogetherAIRerank
from litellm.secret_managers.main import get_secret
from litellm.types.router import *
from litellm.utils import client, exception_type, supports_httpx_timeout
@ -103,16 +103,14 @@ def rerank(
)
)
model_parameters = [
"top_n",
"rank_fields",
"return_documents",
"max_chunks_per_doc",
]
model_params_dict = {}
for k, v in optional_params.model_fields.items():
if k in model_parameters:
model_params_dict[k] = v
model_params_dict = {
"top_n": top_n,
"rank_fields": rank_fields,
"return_documents": return_documents,
"max_chunks_per_doc": max_chunks_per_doc,
"documents": documents,
}
litellm_logging_obj.update_environment_variables(
model=model,
user=user,

View file

@ -570,6 +570,9 @@ def test_groq_response_cost_tracking(is_streaming):
print(f"response_cost: {response_cost}")
from litellm.types.utils import CallTypes
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
@ -612,7 +615,7 @@ def test_together_ai_qwen_completion_cost():
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
model_name="qwen/Qwen2-72B-Instruct", call_type=CallTypes.completion
)
assert response == "together-ai-41.1b-80b"
@ -1323,3 +1326,802 @@ def test_completion_cost_vertex_llama3():
cost = completion_cost(model=model, completion_response=response)
assert cost == 0
def test_together_ai_embedding_completion_cost():
from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
response = EmbeddingResponse(
model="togethercomputer/m2-bert-80M-8k-retrieval",
data=[
{
"embedding": [
-0.18039076,
0.11614138,
0.37174946,
0.27238843,
-0.21933095,
-0.15207036,
0.17764972,
-0.08700938,
-0.23863377,
-0.24203257,
0.20441775,
0.04630023,
-0.07832973,
-0.193581,
0.2009999,
-0.30106494,
0.21179546,
-0.23836501,
-0.14919636,
-0.045276586,
0.08645845,
-0.027714893,
-0.009854938,
0.25298217,
-0.1081501,
-0.2383125,
0.23080236,
0.011114239,
0.06954927,
-0.21081704,
0.06937218,
-0.16756944,
-0.2030545,
-0.19809915,
-0.031914014,
-0.15959585,
0.17361341,
0.30239972,
-0.09923253,
0.12680714,
-0.13018028,
0.1302273,
0.19179879,
0.17068875,
0.065124996,
-0.15515316,
0.08250379,
0.07309733,
-0.07283606,
0.21411736,
0.15457751,
-0.08725933,
0.07227311,
0.056812778,
-0.077683985,
0.06833304,
0.0328722,
0.2719641,
-0.06989647,
0.22805125,
0.14953858,
0.0792393,
0.07793462,
0.16176109,
-0.15616545,
-0.25149494,
-0.065352336,
-0.38410214,
-0.27288514,
0.13946335,
-0.21873806,
0.1365704,
0.11738016,
-0.1141173,
0.022973377,
-0.16935326,
0.026940947,
-0.09990286,
-0.05157219,
0.21006724,
0.15897459,
0.011987913,
0.02576497,
-0.11819022,
-0.09184997,
-0.31881434,
-0.17055357,
-0.09523704,
0.008458802,
-0.015483258,
0.038404867,
0.014673892,
-0.041162584,
0.002691519,
0.04601874,
0.059108324,
0.007177156,
0.066804245,
0.038554087,
-0.038720075,
-0.2145991,
-0.15713418,
-0.03712905,
-0.066650696,
0.04227769,
0.018708894,
-0.26332214,
0.0012769096,
-0.13878848,
-0.33141217,
0.118736655,
0.03026654,
0.1017467,
-0.08000539,
0.00092649367,
0.13062756,
-0.03785864,
-0.2038575,
0.07655428,
-0.24818295,
-0.0600955,
0.114760056,
0.027571939,
-0.047068622,
-0.19806816,
0.0774084,
-0.05213658,
-0.042000014,
0.051924672,
-0.14131106,
-0.2309609,
0.20305444,
0.0700591,
0.13863273,
-0.06145084,
-0.039423797,
-0.055951696,
0.04732105,
0.078736484,
0.2566198,
0.054494765,
0.017602794,
-0.107575715,
-0.017887019,
-0.26046592,
-0.077659994,
-0.08430523,
0.18806657,
-0.12292346,
0.06288608,
-0.106739804,
-0.06600645,
-0.14719339,
-0.05070389,
0.23234129,
-0.034023043,
0.056019265,
-0.03627352,
0.11740493,
0.060294818,
-0.21726903,
-0.09775424,
0.27007395,
0.28328258,
0.022495652,
0.13218465,
0.07199022,
-0.15933248,
0.02381037,
-0.08288268,
0.020621575,
0.17395815,
0.06978612,
0.18418784,
-0.12663148,
-0.21287888,
0.21239495,
0.10222956,
0.03952703,
-0.066957936,
-0.035802357,
0.03683884,
0.22524163,
-0.029355489,
-0.11534147,
-0.041979663,
-0.012147716,
-0.07279564,
0.17417553,
0.05546745,
-0.1773277,
-0.26984993,
0.31703642,
0.05958132,
-0.14933203,
-0.084655434,
0.074604444,
-0.077568695,
0.25167143,
-0.17753932,
-0.006415411,
0.068613894,
-0.0031754146,
-0.0039771493,
0.015294107,
0.11839045,
-0.04570732,
0.103238374,
-0.09678329,
-0.21713412,
0.047976546,
-0.14346297,
0.17429878,
-0.31257913,
0.15445377,
-0.10576352,
-0.16792995,
-0.17988597,
-0.14238739,
-0.088244036,
0.2760547,
0.088823885,
-0.08074319,
-0.028918687,
0.107819095,
0.12004892,
0.13343112,
-0.1332874,
-0.0946055,
-0.20433402,
0.17760132,
0.11774745,
0.16756779,
-0.0937686,
0.23887308,
0.27315456,
0.08657822,
0.027402503,
-0.06605757,
0.29859266,
-0.21552202,
0.026192812,
0.1328459,
0.13072926,
0.19236198,
0.01760772,
-0.042355467,
0.08815041,
-0.013158761,
-0.23350924,
-0.043668386,
-0.15479062,
-0.024266671,
0.08113482,
0.14451654,
-0.29152337,
-0.028919466,
0.15022752,
-0.26923147,
0.23846954,
0.03292609,
-0.23572414,
-0.14883325,
-0.12743121,
-0.052229587,
-0.14230779,
0.284658,
0.36885592,
-0.13176951,
-0.16442224,
-0.20283924,
0.048434418,
-0.16231743,
-0.0010730615,
0.1408047,
0.09481033,
0.018139571,
-0.030843062,
0.13304341,
-0.1516288,
-0.051779557,
0.46940327,
-0.07969027,
-0.051570967,
-0.038892798,
0.11187677,
0.1703113,
-0.39926252,
0.06859773,
0.08364686,
0.14696898,
0.026642298,
0.13225247,
0.05730332,
0.35534015,
0.11189959,
0.039673142,
-0.056019083,
0.15707816,
-0.11053284,
0.12823457,
0.20075114,
0.040237684,
-0.19367051,
0.13039409,
-0.26038498,
-0.05770229,
-0.009781617,
0.15812513,
-0.10420735,
-0.020158196,
0.13160926,
-0.20823349,
-0.045596864,
-0.2074525,
0.1546387,
0.30158705,
0.13175933,
0.11967154,
-0.09094463,
0.0019428955,
-0.06745872,
0.02998099,
-0.18385777,
0.014330351,
0.07141392,
-0.17461702,
0.099743806,
-0.016181415,
0.1661396,
0.070834026,
0.110713825,
0.14590909,
0.15404254,
-0.21658006,
0.00715122,
-0.10229453,
-0.09980027,
-0.09406554,
-0.014849227,
-0.26285952,
0.069972225,
0.05732395,
-0.10685719,
0.037572138,
-0.18863359,
-0.00083297276,
-0.16088934,
-0.117982,
-0.16381365,
-0.008932539,
-0.06549256,
-0.08928683,
0.29934987,
0.16532114,
-0.27117223,
-0.12302226,
-0.28685933,
-0.14041144,
-0.0062569617,
-0.20768198,
-0.15385273,
0.20506454,
-0.21685128,
0.1081962,
-0.13133131,
0.18937315,
0.14751591,
0.2786974,
-0.060183275,
0.10365405,
0.109799005,
-0.044105034,
-0.04260162,
0.025758557,
0.07590695,
0.0726137,
-0.09882405,
0.26437432,
0.15884234,
0.115702584,
0.0015900572,
0.11673009,
-0.18648374,
0.3080215,
-0.26407364,
-0.15610488,
0.12658228,
-0.05672454,
0.016239772,
-0.092462406,
-0.36205122,
-0.2925843,
-0.104364775,
-0.2598659,
-0.14073578,
0.10225995,
-0.2612335,
-0.17479639,
0.17488293,
-0.2437756,
0.114384405,
-0.13196659,
-0.067482576,
0.024756929,
0.11779123,
0.2751749,
-0.13306957,
-0.034118645,
-0.14177705,
0.27164033,
0.06266008,
0.11199439,
-0.09814594,
0.13231735,
0.019105865,
-0.2652429,
-0.12924416,
0.0840029,
0.098754935,
0.025883028,
-0.33059177,
-0.10544467,
-0.14131607,
-0.09680401,
-0.047318626,
-0.08157771,
-0.11271855,
0.12637804,
0.11703408,
0.014556337,
0.22788583,
-0.05599293,
0.25811172,
0.22956331,
0.13004553,
0.15419081,
-0.07971162,
0.11692607,
-0.2859737,
0.059627946,
-0.02716421,
0.117603,
-0.061154094,
-0.13555732,
0.17092334,
-0.16639015,
0.2919375,
-0.020189757,
0.18548165,
-0.32514027,
0.19324942,
-0.117969565,
0.23577307,
-0.18052326,
-0.10520473,
-0.2647645,
-0.29393113,
0.052641366,
-0.07733946,
-0.10684275,
-0.15046178,
0.065737076,
-0.0022297644,
-0.010802031,
-0.115943395,
-0.11602136,
0.24265991,
-0.12240144,
0.11817584,
0.026270682,
-0.25762397,
-0.14545679,
0.014168602,
0.106698096,
0.12905516,
-0.12560321,
0.15034604,
0.071529925,
0.123048246,
-0.058863316,
-0.12251829,
0.20463347,
0.06841168,
0.13706751,
0.05893755,
-0.12269708,
0.096701816,
-0.3237337,
-0.2213742,
-0.073655166,
-0.12979327,
0.14173084,
0.19167605,
-0.14523135,
0.06963011,
-0.019228822,
-0.14134938,
0.22017507,
0.007933044,
-0.0065696104,
0.074060634,
-0.13231485,
0.1387053,
-0.14480218,
-0.007837481,
0.29880494,
0.101618655,
0.14514285,
-0.066113696,
-0.041709363,
0.21512671,
-0.090142876,
-0.010337287,
0.13212202,
0.08307805,
0.10144794,
-0.024808172,
0.21877879,
-0.071282186,
-8.786433e-05,
-0.014574037,
-0.11954953,
-0.096931055,
-0.2557228,
0.1090451,
0.15424186,
-0.029206438,
-0.2898023,
0.22510754,
-0.019507697,
0.1566895,
-0.24820097,
-0.012163554,
0.12401036,
0.024711533,
0.24737844,
-0.06311193,
0.0652544,
-0.067403205,
0.15362221,
-0.12093675,
0.096014425,
0.17337392,
-0.017509578,
0.015355054,
0.055885684,
-0.08358914,
-0.018012024,
0.069017515,
0.32854614,
0.0063175815,
-0.09058244,
0.000681382,
-0.10825181,
0.13190223,
0.009358909,
-0.12205342,
0.08268384,
-0.260608,
-0.11042252,
-0.022601532,
-0.080661446,
-0.035559367,
0.14736788,
0.061933476,
-0.07815901,
0.110823035,
-0.00875032,
-0.064237975,
-0.04546554,
-0.05909862,
0.23463917,
-0.20451859,
-0.16576467,
0.10957323,
-0.08632836,
-0.27395645,
0.0002913844,
0.13701706,
-0.058854006,
0.30768716,
-0.037643027,
-0.1365738,
0.095908396,
-0.05029932,
0.14793666,
0.30881998,
-0.018806668,
-0.15902956,
0.07953607,
-0.07259314,
0.17318867,
0.123503335,
-0.11327983,
-0.24497227,
-0.092871994,
0.31053993,
0.09460377,
-0.21152224,
-0.03127119,
-0.018713845,
-0.014523326,
-0.18656968,
0.2255386,
-0.1902719,
0.18821372,
-0.16890709,
-0.04607359,
0.13054903,
-0.05379203,
-0.051014878,
0.054293603,
-0.07299424,
-0.06728367,
-0.052388195,
-0.29960096,
-0.22351485,
-0.06481434,
-0.1619141,
0.24709718,
-0.1203425,
0.029514981,
-0.01951599,
-0.072677284,
-0.25097945,
0.03758907,
0.14380245,
-0.037721623,
-0.19958745,
0.2408246,
-0.13995907,
-0.028115002,
-0.14780775,
0.17445801,
0.11311988,
0.05306163,
0.0018454103,
0.00088805315,
-0.27949628,
-0.23556526,
-0.18175222,
-0.28372183,
-0.43095905,
0.22644317,
0.06072053,
0.02278773,
0.021752749,
0.053462002,
-0.30636713,
0.15607472,
-0.16657323,
-0.07240017,
0.1410017,
-0.026987495,
0.15029654,
0.03340291,
-0.2056912,
0.055395555,
0.11999902,
0.06368412,
-0.025476053,
-0.1702383,
-0.23432998,
0.14855467,
-0.07505147,
-0.030296376,
-0.07001051,
0.10510949,
0.10420236,
0.09809715,
0.17195594,
0.19430229,
-0.16121922,
-0.081139356,
0.15032287,
0.10385191,
-0.18741366,
0.008690719,
-0.12941097,
-0.027797364,
-0.2148853,
0.037788823,
0.16691138,
0.099181786,
-0.0955518,
-0.0074798446,
-0.17511943,
0.14543307,
-0.029364567,
-0.21223477,
-0.05881982,
0.11064195,
-0.2877007,
-0.023934823,
-0.15569815,
0.015789302,
-0.035767324,
-0.15110208,
0.07125638,
0.05703369,
-0.08454703,
-0.07080854,
0.025179204,
-0.10522502,
-0.03670824,
-0.11075579,
0.0681693,
-0.28287485,
0.2769406,
0.026260372,
0.07289979,
0.04669447,
-0.16541554,
0.040775143,
0.035916835,
0.03648039,
0.11299418,
0.14765884,
0.031163761,
0.0011800596,
-0.10715472,
0.02665826,
-0.06237457,
0.15672882,
0.09038829,
0.0061029866,
-0.2592228,
-0.21008603,
0.019810716,
-0.08721265,
0.107840165,
0.28438854,
-0.16649202,
0.19627784,
0.040611178,
0.16516201,
0.24990341,
-0.16222852,
-0.009037945,
0.053751092,
0.1647804,
-0.16184275,
-0.29710436,
0.043035872,
0.04667557,
0.14761224,
-0.09030331,
-0.024515491,
0.10857025,
0.19865094,
-0.07794062,
0.17942934,
0.13322048,
-0.16857187,
0.055713065,
0.18661156,
-0.07864222,
0.23296827,
0.10348465,
-0.11750994,
-0.065938555,
-0.04377608,
0.14903909,
0.019000417,
0.21033548,
0.12162547,
0.1273347,
],
"index": 0,
"object": "embedding",
}
],
object="list",
usage=Usage(
completion_tokens=0,
prompt_tokens=0,
total_tokens=0,
completion_tokens_details=None,
),
)
cost = completion_cost(
completion_response=response,
custom_llm_provider="together_ai",
call_type="embedding",
)

View file

@ -104,14 +104,131 @@ def test_openai_embedding_3():
pytest.fail(f"Error occurred: {e}")
def test_openai_azure_embedding_simple():
@pytest.mark.parametrize(
"model, api_base, api_key",
[
# ("azure/azure-embedding-model", None, None),
("together_ai/togethercomputer/m2-bert-80M-8k-retrieval", None, None),
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_openai_azure_embedding_simple(model, api_base, api_key, sync_mode):
try:
litellm.set_verbose = True
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
# litellm.set_verbose = True
if sync_mode:
response = embedding(
model="azure/azure-embedding-model",
model=model,
input=["good morning from litellm"],
api_base=api_base,
api_key=api_key,
)
else:
response = await litellm.aembedding(
model=model,
input=["good morning from litellm"],
api_base=api_base,
api_key=api_key,
)
# print(await response)
print(response)
print(response._hidden_params)
response_keys = set(dict(response).keys())
response_keys.discard("_response_ms")
assert set(["usage", "model", "object", "data"]) == set(
response_keys
) # assert litellm response has expected keys from OpenAI embedding response
request_cost = litellm.completion_cost(
completion_response=response, call_type="embedding"
)
print("Calculated request cost=", request_cost)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_openai_azure_embedding_simple()
import base64
import requests
litellm.set_verbose = True
url = "https://dummyimage.com/100/100/fff&text=Test+image"
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded_file}"
from openai.types.embedding import Embedding
def _azure_ai_image_mock_response(*args, **kwargs):
new_response = MagicMock()
new_response.headers = {"azureml-model-group": "offer-cohere-embed-multili-paygo"}
new_response.json.return_value = {
"data": [Embedding(embedding=[1234], index=0, object="embedding")],
"model": "",
"object": "list",
"usage": {"prompt_tokens": 1, "total_tokens": 2},
}
return new_response
@pytest.mark.parametrize(
"model, api_base, api_key",
[
(
"azure_ai/Cohere-embed-v3-multilingual-jzu",
"https://Cohere-embed-v3-multilingual-jzu.eastus2.models.ai.azure.com",
os.getenv("AZURE_AI_COHERE_API_KEY_2"),
)
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_azure_ai_embedding_image(model, api_base, api_key, sync_mode):
try:
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
input = base64_image
if sync_mode:
client = HTTPHandler()
else:
client = AsyncHTTPHandler()
with patch.object(
client, "post", side_effect=_azure_ai_image_mock_response
) as mock_client:
if sync_mode:
response = embedding(
model=model,
input=[input],
api_base=api_base,
api_key=api_key,
client=client,
)
else:
response = await litellm.aembedding(
model=model,
input=[input],
api_base=api_base,
api_key=api_key,
client=client,
)
print(response)
assert len(response.data) == 1
print(response._hidden_params)
response_keys = set(dict(response).keys())
response_keys.discard("_response_ms")
assert set(["usage", "model", "object", "data"]) == set(
@ -128,9 +245,6 @@ def test_openai_azure_embedding_simple():
pytest.fail(f"Error occurred: {e}")
# test_openai_azure_embedding_simple()
def test_openai_azure_embedding_timeouts():
try:
response = embedding(
@ -226,13 +340,16 @@ def test_openai_azure_embedding_with_oidc_and_cf():
os.environ["AZURE_API_KEY"] = old_key
from openai.types.embedding import Embedding
def _openai_mock_response(*args, **kwargs):
new_response = MagicMock()
new_response.headers = {"hello": "world"}
new_response.parse.return_value = (
openai.types.create_embedding_response.CreateEmbeddingResponse(
data=[],
data=[Embedding(embedding=[1234, 45667], index=0, object="embedding")],
model="azure/test",
object="list",
usage=openai.types.create_embedding_response.Usage(
@ -267,20 +384,28 @@ def test_openai_azure_embedding_optional_arg():
# test_openai_embedding()
@pytest.mark.parametrize(
"model, api_base",
[
("embed-english-v2.0", None),
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_cohere_embedding(sync_mode):
async def test_cohere_embedding(sync_mode, model, api_base):
try:
# litellm.set_verbose=True
data = {
"model": "embed-english-v2.0",
"model": model,
"input": ["good morning from litellm", "this is another item"],
"input_type": "search_query",
"api_base": api_base,
}
if sync_mode:
response = embedding(**data)
else:
response = await litellm.aembedding(**data)
print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)

View file

@ -774,3 +774,21 @@ def test_usage_object_null_tokens():
usage_obj = litellm.Usage(prompt_tokens=2, completion_tokens=None, total_tokens=2)
assert usage_obj.completion_tokens == 0
def test_is_base64_encoded():
import base64
import requests
litellm.set_verbose = True
url = "https://dummyimage.com/100/100/fff&text=Test+image"
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded_file}"
from litellm.utils import is_base64_encoded
assert is_base64_encoded(s=base64_image) is True

View file

@ -0,0 +1,17 @@
from typing import Any, Dict, Iterable, List, Literal, Optional, Union
from typing_extensions import Required, TypedDict
class ImageEmbeddingInput(TypedDict, total=False):
image: Required[str]
text: str
EncodingFormat = Literal["base64", "binary", "float", "int8", "ubinary", "uint8"]
class ImageEmbeddingRequest(TypedDict, total=False):
input: Required[List[ImageEmbeddingInput]]
dimensions: int
encoding_format: EncodingFormat
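
A minimal construction sketch for the new request types; the base64 string and text label are placeholders.

```python
from litellm.types.llms.azure_ai import ImageEmbeddingInput, ImageEmbeddingRequest

request = ImageEmbeddingRequest(
    input=[ImageEmbeddingInput(image="iVBORw0KGgoAAAANSUhEUg==", text="a test image")],
    encoding_format="float",
)
print(request["input"][0]["text"])  # a test image
```
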

View file

@ -9,7 +9,7 @@ from openai.lib.streaming._assistants import (
AsyncAssistantStreamManager,
)
from openai.pagination import AsyncCursorPage, SyncCursorPage
from openai.types import Batch, FileObject
from openai.types import Batch, EmbeddingCreateParams, FileObject
from openai.types.beta.assistant import Assistant
from openai.types.beta.assistant_tool_param import AssistantToolParam
from openai.types.beta.thread_create_params import (

View file

@ -766,7 +766,7 @@ class EmbeddingResponse(OpenAIObject):
"""The actual embedding value"""
object: Literal["list"]
"""The object type, which is always "embedding" """
"""The object type, which is always "list" """
usage: Optional[Usage] = None
"""Usage statistics for the embedding request."""

View file

@ -11118,6 +11118,10 @@ def is_cached_message(message: AllMessageValues) -> bool:
def is_base64_encoded(s: str) -> bool:
try:
# Strip out the prefix if it exists
if s.startswith("data:"):
s = s.split(",")[1]
# Try to decode the string
decoded_bytes = base64.b64decode(s, validate=True)
# Check if the original string can be re-encoded to the same string

View file

@ -990,6 +990,26 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-english": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-multilingual": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -4964,50 +4984,71 @@
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-embedding-up-to-150m": {
"input_cost_per_token": 0.000000008,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together-ai-embedding-151m-to-350m": {
"input_cost_per_token": 0.000000016,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/togethercomputer/CodeLlama-34b-Instruct": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"ollama/codegemma": {
"max_tokens": 8192,