Merge pull request #5414 from BerriAI/litellm_main_staging

fixes: minor litellm fixes

Commit 8ae0fc693f (22 changed files with 831 additions and 145 deletions)
@@ -274,6 +274,17 @@ curl --location 'http://0.0.0.0:4000/chat/completions' \

## Advanced

### Fallbacks + Retries + Timeouts + Cooldowns

To set fallbacks, just do:

```yaml
litellm_settings:
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}]
```

**Covers all errors (429, 500, etc.)**

[**See Code**]()

**Set via config**
```yaml
model_list:
@@ -302,10 +313,70 @@
  num_retries: 3 # retry call 3 times on each model_name (e.g. zephyr-beta)
  request_timeout: 10 # raise Timeout error if call takes longer than 10s. Sets litellm.request_timeout
  fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo"]}] # fallback to gpt-3.5-turbo if call fails num_retries
  context_window_fallbacks: [{"zephyr-beta": ["gpt-3.5-turbo-16k"]}, {"gpt-3.5-turbo": ["gpt-3.5-turbo-16k"]}] # fallback to gpt-3.5-turbo-16k if context window error
  allowed_fails: 3 # cooldown model if it fails more than 3 calls in a minute
  cooldown_time: 30 # how long (in seconds) to cooldown model if fails/min > allowed_fails
```
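For reference, the same knobs can be set from Python on the Router instead of the proxy config. A minimal sketch, assuming the two deployments below are valid in your environment:

```python
from litellm import Router

# Sketch only: deployment names/params are illustrative placeholders.
router = Router(
    model_list=[
        {"model_name": "zephyr-beta", "litellm_params": {"model": "huggingface/HuggingFaceH4/zephyr-7b-beta"}},
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}},
    ],
    num_retries=3,                                   # retry each deployment 3 times
    timeout=10,                                      # request timeout, in seconds
    fallbacks=[{"zephyr-beta": ["gpt-3.5-turbo"]}],  # covers all errors (429, 500, etc.)
    allowed_fails=3,                                 # failures/min before cooldown
    cooldown_time=30,                                # cooldown length, in seconds
)

response = router.completion(
    model="zephyr-beta",
    messages=[{"role": "user", "content": "ping"}],
)
```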
### Test Fallbacks!

Check if your fallbacks are working as expected.

#### **Regular Fallbacks**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "my-bad-model",
    "messages": [
        {
            "role": "user",
            "content": "ping"
        }
    ],
    "mock_testing_fallbacks": true # 👈 KEY CHANGE
}
'
```

#### **Content Policy Fallbacks**
```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "my-bad-model",
    "messages": [
        {
            "role": "user",
            "content": "ping"
        }
    ],
    "mock_testing_content_policy_fallbacks": true # 👈 KEY CHANGE
}
'
```

#### **Context Window Fallbacks**

```bash
curl -X POST 'http://0.0.0.0:4000/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
    "model": "my-bad-model",
    "messages": [
        {
            "role": "user",
            "content": "ping"
        }
    ],
    "mock_testing_context_window_fallbacks": true # 👈 KEY CHANGE
}
'
```
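The same mock tests can be driven from Python. A small sketch with the `requests` library, assuming the proxy is running locally with the placeholder key above:

```python
import requests

# Flip the mock_testing_* flag to exercise each fallback type.
resp = requests.post(
    "http://0.0.0.0:4000/chat/completions",
    headers={
        "Content-Type": "application/json",
        "Authorization": "Bearer sk-1234",
    },
    json={
        "model": "my-bad-model",
        "messages": [{"role": "user", "content": "ping"}],
        "mock_testing_fallbacks": True,  # 👈 KEY CHANGE
    },
)
print(resp.status_code, resp.json())
```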
### Context Window Fallbacks (Pre-Call Checks + Fallbacks)

**Before the call is made**, check that the request fits within the model's context window by setting **`enable_pre_call_checks: true`**.
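A minimal Router sketch of the same idea (the duplicate `model_name` pattern below is illustrative; the router routes to the deployment whose context window fits the prompt):

```python
from litellm import Router

router = Router(
    model_list=[
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo"}},
        {"model_name": "gpt-3.5-turbo", "litellm_params": {"model": "gpt-3.5-turbo-16k"}},
    ],
    enable_pre_call_checks=True,  # 👈 KEY CHANGE
)
```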
@@ -493,65 +564,6 @@ This will default to claude-opus in case any model fails.

A model-specific fallback (e.g. {"gpt-3.5-turbo-small": ["claude-opus"]}) overrides the default fallback.
### EU-Region Filtering (Pre-Call Checks)

**Before the call is made**, restrict routing to EU-region deployments by setting **`enable_pre_call_checks: true`**.
@@ -848,7 +848,7 @@ from .llms.gemini import GeminiConfig
from .llms.nlp_cloud import NLPCloudConfig
from .llms.aleph_alpha import AlephAlphaConfig
from .llms.petals import PetalsConfig
-from .llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+from .llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    VertexGeminiConfig,
    GoogleAIStudioGeminiConfig,
    VertexAIConfig,

@@ -865,6 +865,7 @@ from .llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.llama3.transf
from .llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.ai21.transformation import (
    VertexAIAi21Config,
)

from .llms.sagemaker.sagemaker import SagemakerConfig
from .llms.ollama import OllamaConfig
from .llms.ollama_chat import OllamaChatConfig
@@ -8,7 +8,7 @@ from openai.types.fine_tuning.fine_tuning_job import FineTuningJob, Hyperparamet
from litellm._logging import verbose_logger
from litellm.llms.base import BaseLLM
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
-from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    VertexLLM,
)
from litellm.types.llms.openai import FineTuningJobCreate
@@ -13,7 +13,7 @@ from litellm.llms.custom_httpx.http_handler import (
    _get_httpx_client,
)
from litellm.llms.openai import HttpxBinaryResponseContent
-from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    VertexLLM,
)
@@ -1,4 +1,4 @@
-from typing import Literal
+from typing import Literal, Tuple

import httpx
@@ -37,3 +37,74 @@ def get_supports_system_message(
        supports_system_message = False

    return supports_system_message


from typing import Literal, Optional

all_gemini_url_modes = Literal["chat", "embedding", "batch_embedding"]


def _get_vertex_url(
    mode: all_gemini_url_modes,
    model: str,
    stream: Optional[bool],
    vertex_project: Optional[str],
    vertex_location: Optional[str],
    vertex_api_version: Literal["v1", "v1beta1"],
) -> Tuple[str, str]:
    if mode == "chat":
        ### SET RUNTIME ENDPOINT ###
        endpoint = "generateContent"
        if stream is True:
            endpoint = "streamGenerateContent"
            url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}?alt=sse"
        else:
            url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}"

        # if model is only numeric chars then it's a fine tuned gemini model
        # model = 4965075652664360960
        # send to this url: url = f"https://{vertex_location}-aiplatform.googleapis.com/{version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
        if model.isdigit():
            # It's a fine-tuned Gemini model
            url = f"https://{vertex_location}-aiplatform.googleapis.com/{vertex_api_version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
            if stream is True:
                url += "?alt=sse"
    elif mode == "embedding":
        endpoint = "predict"
        url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}"

    return url, endpoint


def _get_gemini_url(
    mode: all_gemini_url_modes,
    model: str,
    stream: Optional[bool],
    gemini_api_key: Optional[str],
) -> Tuple[str, str]:
    _gemini_model_name = "models/{}".format(model)
    if mode == "chat":
        endpoint = "generateContent"
        if stream is True:
            endpoint = "streamGenerateContent"
            url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}&alt=sse".format(
                _gemini_model_name, endpoint, gemini_api_key
            )
        else:
            url = (
                "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
                    _gemini_model_name, endpoint, gemini_api_key
                )
            )
    elif mode == "embedding":
        endpoint = "embedContent"
        url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
            _gemini_model_name, endpoint, gemini_api_key
        )
    elif mode == "batch_embedding":
        endpoint = "batchEmbedContents"
        url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
            _gemini_model_name, endpoint, gemini_api_key
        )

    return url, endpoint
@@ -11,8 +11,10 @@ from litellm.types.llms.vertex_ai import CachedContentRequestBody, SystemInstruc
from litellm.utils import is_cached_message

from ..common_utils import VertexAIError, get_supports_system_message
-from ..gemini_transformation import transform_system_message
-from ..vertex_and_google_ai_studio_gemini import _gemini_convert_messages_with_history
+from ..gemini.transformation import transform_system_message
+from ..gemini.vertex_and_google_ai_studio_gemini import (
+    _gemini_convert_messages_with_history,
+)


def separate_cached_messages(
@@ -0,0 +1,167 @@
"""
Google AI Studio /batchEmbedContents Embeddings Endpoint
"""

import json
from typing import List, Literal, Optional, Union

import httpx

from litellm import EmbeddingResponse
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.openai import EmbeddingInput
from litellm.types.llms.vertex_ai import (
    VertexAIBatchEmbeddingsRequestBody,
    VertexAIBatchEmbeddingsResponseObject,
)

from ..gemini.vertex_and_google_ai_studio_gemini import VertexLLM
from .batch_embed_content_transformation import (
    process_response,
    transform_openai_input_gemini_content,
)


class GoogleBatchEmbeddings(VertexLLM):
    def batch_embeddings(
        self,
        model: str,
        input: EmbeddingInput,
        print_verbose,
        model_response: EmbeddingResponse,
        custom_llm_provider: Literal["gemini", "vertex_ai"],
        optional_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        logging_obj=None,
        encoding=None,
        vertex_project=None,
        vertex_location=None,
        vertex_credentials=None,
        aembedding=False,
        timeout=300,
        client=None,
    ) -> EmbeddingResponse:

        auth_header, url = self._get_token_and_url(
            model=model,
            gemini_api_key=api_key,
            vertex_project=vertex_project,
            vertex_location=vertex_location,
            vertex_credentials=vertex_credentials,
            stream=None,
            custom_llm_provider=custom_llm_provider,
            api_base=api_base,
            should_use_v1beta1_features=False,
            mode="batch_embedding",
        )

        if client is None:
            _params = {}
            if timeout is not None:
                if isinstance(timeout, float) or isinstance(timeout, int):
                    _httpx_timeout = httpx.Timeout(timeout)
                    _params["timeout"] = _httpx_timeout
            else:
                _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0)

            sync_handler: HTTPHandler = HTTPHandler(**_params)  # type: ignore
        else:
            sync_handler = client  # type: ignore

        optional_params = optional_params or {}

        ### TRANSFORMATION ###
        request_data = transform_openai_input_gemini_content(
            input=input, model=model, optional_params=optional_params
        )

        headers = {
            "Content-Type": "application/json; charset=utf-8",
        }

        ## LOGGING
        logging_obj.pre_call(
            input=input,
            api_key="",
            additional_args={
                "complete_input_dict": request_data,
                "api_base": url,
                "headers": headers,
            },
        )

        if aembedding is True:
            return self.async_batch_embeddings(  # type: ignore
                model=model,
                api_base=api_base,
                url=url,
                data=request_data,
                model_response=model_response,
                timeout=timeout,
                headers=headers,
                input=input,
            )

        response = sync_handler.post(
            url=url,
            headers=headers,
            data=json.dumps(request_data),
        )

        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code} {response.text}")

        _json_response = response.json()
        _predictions = VertexAIBatchEmbeddingsResponseObject(**_json_response)  # type: ignore

        return process_response(
            model=model,
            model_response=model_response,
            _predictions=_predictions,
            input=input,
        )

    async def async_batch_embeddings(
        self,
        model: str,
        api_base: Optional[str],
        url: str,
        data: VertexAIBatchEmbeddingsRequestBody,
        model_response: EmbeddingResponse,
        input: EmbeddingInput,
        timeout: Optional[Union[float, httpx.Timeout]],
        headers={},
        client: Optional[AsyncHTTPHandler] = None,
    ) -> EmbeddingResponse:
        if client is None:
            _params = {}
            if timeout is not None:
                if isinstance(timeout, float) or isinstance(timeout, int):
                    _httpx_timeout = httpx.Timeout(timeout)
                    _params["timeout"] = _httpx_timeout
            else:
                _params["timeout"] = httpx.Timeout(timeout=600.0, connect=5.0)

            async_handler: AsyncHTTPHandler = AsyncHTTPHandler(**_params)  # type: ignore
        else:
            async_handler = client  # type: ignore

        response = await async_handler.post(
            url=url,
            headers=headers,
            data=json.dumps(data),
        )

        if response.status_code != 200:
            raise Exception(f"Error: {response.status_code} {response.text}")

        _json_response = response.json()
        _predictions = VertexAIBatchEmbeddingsResponseObject(**_json_response)  # type: ignore

        return process_response(
            model=model,
            model_response=model_response,
            _predictions=_predictions,
            input=input,
        )
@@ -0,0 +1,76 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Google AI Studio /batchEmbedContents format.

Why separate file? Make it easy to see how transformation works
"""

from typing import List

from litellm import EmbeddingResponse
from litellm.types.llms.openai import EmbeddingInput
from litellm.types.llms.vertex_ai import (
    ContentType,
    EmbedContentRequest,
    PartType,
    VertexAIBatchEmbeddingsRequestBody,
    VertexAIBatchEmbeddingsResponseObject,
)
from litellm.types.utils import Embedding, Usage
from litellm.utils import get_formatted_prompt, token_counter

from ..common_utils import VertexAIError


def transform_openai_input_gemini_content(
    input: EmbeddingInput, model: str, optional_params: dict
) -> VertexAIBatchEmbeddingsRequestBody:
    """
    The content to embed. Only the parts.text fields will be counted.
    """
    gemini_model_name = "models/{}".format(model)
    requests: List[EmbedContentRequest] = []
    if isinstance(input, str):
        request = EmbedContentRequest(
            model=gemini_model_name,
            content=ContentType(parts=[PartType(text=input)]),
            **optional_params
        )
        requests.append(request)
    else:
        for i in input:
            request = EmbedContentRequest(
                model=gemini_model_name,
                content=ContentType(parts=[PartType(text=i)]),
                **optional_params
            )
            requests.append(request)

    return VertexAIBatchEmbeddingsRequestBody(requests=requests)


def process_response(
    input: EmbeddingInput,
    model_response: EmbeddingResponse,
    model: str,
    _predictions: VertexAIBatchEmbeddingsResponseObject,
) -> EmbeddingResponse:

    openai_embeddings: List[Embedding] = []
    for embedding in _predictions["embeddings"]:
        openai_embedding = Embedding(
            embedding=embedding["values"],
            index=0,
            object="embedding",
        )
        openai_embeddings.append(openai_embedding)

    model_response.data = openai_embeddings
    model_response.model = model

    input_text = get_formatted_prompt(data={"input": input}, call_type="embedding")
    prompt_tokens = token_counter(model=model, text=input_text)
    model_response.usage = Usage(
        prompt_tokens=prompt_tokens, total_tokens=prompt_tokens
    )

    return model_response
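For orientation, a sketch of what the transformation produces for a single string input (the expected value is shown as a plain dict in place of the TypedDicts; model name is a placeholder):

```python
body = transform_openai_input_gemini_content(
    input="good morning from litellm",
    model="text-embedding-004",  # placeholder model
    optional_params={},
)
# body == {
#     "requests": [
#         {
#             "model": "models/text-embedding-004",
#             "content": {"parts": [{"text": "good morning from litellm"}]},
#         }
#     ]
# }
```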
@@ -54,10 +54,16 @@ from litellm.types.llms.vertex_ai import (
from litellm.types.utils import GenericStreamingChunk
from litellm.utils import CustomStreamWrapper, ModelResponse, Usage

-from ..base import BaseLLM
-from .common_utils import VertexAIError, get_supports_system_message
-from .context_caching.vertex_ai_context_caching import ContextCachingEndpoints
-from .gemini_transformation import transform_system_message
+from ...base import BaseLLM
+from ..common_utils import (
+    VertexAIError,
+    _get_gemini_url,
+    _get_vertex_url,
+    all_gemini_url_modes,
+    get_supports_system_message,
+)
+from ..context_caching.vertex_ai_context_caching import ContextCachingEndpoints
+from .transformation import transform_system_message

context_caching_endpoints = ContextCachingEndpoints()
@@ -309,6 +315,7 @@ class GoogleAIStudioGeminiConfig: # key diff from VertexAI - 'frequency_penalty
            "n",
            "stop",
        ]

    def _map_function(self, value: List[dict]) -> List[Tools]:
        gtool_func_declarations = []
        googleSearchRetrieval: Optional[dict] = None

@@ -1164,6 +1171,7 @@ class VertexLLM(BaseLLM):
        custom_llm_provider: Literal["vertex_ai", "vertex_ai_beta", "gemini"],
        api_base: Optional[str],
        should_use_v1beta1_features: Optional[bool] = False,
+        mode: all_gemini_url_modes = "chat",
    ) -> Tuple[Optional[str], str]:
        """
        Internal function. Returns the token and url for the call.
@@ -1174,18 +1182,13 @@ class VertexLLM(BaseLLM):
        token, url
        """
        if custom_llm_provider == "gemini":
-            _gemini_model_name = "models/{}".format(model)
            auth_header = None
-            endpoint = "generateContent"
-            if stream is True:
-                endpoint = "streamGenerateContent"
-                url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}&alt=sse".format(
-                    _gemini_model_name, endpoint, gemini_api_key
-                )
-            else:
-                url = "https://generativelanguage.googleapis.com/v1beta/{}:{}?key={}".format(
-                    _gemini_model_name, endpoint, gemini_api_key
-                )
+            url, endpoint = _get_gemini_url(
+                mode=mode,
+                model=model,
+                stream=stream,
+                gemini_api_key=gemini_api_key,
+            )
        else:
            auth_header, vertex_project = self._ensure_access_token(
                credentials=vertex_credentials, project_id=vertex_project
@@ -1193,23 +1196,17 @@ class VertexLLM(BaseLLM):
            vertex_location = self.get_vertex_region(vertex_region=vertex_location)

            ### SET RUNTIME ENDPOINT ###
-            version = "v1beta1" if should_use_v1beta1_features is True else "v1"
-            endpoint = "generateContent"
-            litellm.utils.print_verbose("vertex_project - {}".format(vertex_project))
-            if stream is True:
-                endpoint = "streamGenerateContent"
-                url = f"https://{vertex_location}-aiplatform.googleapis.com/{version}/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}?alt=sse"
-            else:
-                url = f"https://{vertex_location}-aiplatform.googleapis.com/{version}/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:{endpoint}"
-
-            # if model is only numeric chars then it's a fine tuned gemini model
-            # model = 4965075652664360960
-            # send to this url: url = f"https://{vertex_location}-aiplatform.googleapis.com/{version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
-            if model.isdigit():
-                # It's a fine-tuned Gemini model
-                url = f"https://{vertex_location}-aiplatform.googleapis.com/{version}/projects/{vertex_project}/locations/{vertex_location}/endpoints/{model}:{endpoint}"
-                if stream is True:
-                    url += "?alt=sse"
+            version: Literal["v1beta1", "v1"] = (
+                "v1beta1" if should_use_v1beta1_features is True else "v1"
+            )
+            url, endpoint = _get_vertex_url(
+                mode=mode,
+                model=model,
+                stream=stream,
+                vertex_project=vertex_project,
+                vertex_location=vertex_location,
+                vertex_api_version=version,
+            )

        if (
            api_base is not None
@@ -1793,8 +1790,10 @@ class VertexLLM(BaseLLM):
        input: Union[list, str],
        print_verbose,
        model_response: litellm.EmbeddingResponse,
        custom_llm_provider: Literal["gemini", "vertex_ai"],
        optional_params: dict,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
        logging_obj=None,
        encoding=None,
        vertex_project=None,
|
|||
timeout=300,
|
||||
client=None,
|
||||
):
|
||||
auth_header, url = self._get_token_and_url(
|
||||
model=model,
|
||||
gemini_api_key=api_key,
|
||||
vertex_project=vertex_project,
|
||||
vertex_location=vertex_location,
|
||||
vertex_credentials=vertex_credentials,
|
||||
stream=None,
|
||||
custom_llm_provider=custom_llm_provider,
|
||||
api_base=api_base,
|
||||
should_use_v1beta1_features=False,
|
||||
mode="embedding",
|
||||
)
|
||||
|
||||
if client is None:
|
||||
_params = {}
|
||||
|
@@ -1818,11 +1829,6 @@ class VertexLLM(BaseLLM):
        else:
            sync_handler = client  # type: ignore

-        url = f"https://{vertex_location}-aiplatform.googleapis.com/v1/projects/{vertex_project}/locations/{vertex_location}/publishers/google/models/{model}:predict"
-
-        auth_header, _ = self._ensure_access_token(
-            credentials=vertex_credentials, project_id=vertex_project
-        )
        optional_params = optional_params or {}

        request_data = VertexMultimodalEmbeddingRequest()
@@ -1840,30 +1846,22 @@ class VertexLLM(BaseLLM):

        request_data["instances"] = [vertex_request_instance]

-        request_str = f"\n curl -X POST \\\n -H \"Authorization: Bearer {auth_header[:10] + 'XXXXXXXXXX'}\" \\\n -H \"Content-Type: application/json; charset=utf-8\" \\\n -d {request_data} \\\n \"{url}\""
-        logging_obj.pre_call(
-            input=[],
-            api_key=None,
-            additional_args={
-                "complete_input_dict": optional_params,
-                "request_str": request_str,
-            },
-        )
-
-        logging_obj.pre_call(
-            input=[],
-            api_key=None,
-            additional_args={
-                "complete_input_dict": optional_params,
-                "request_str": request_str,
-            },
-        )
-
+        headers = {
+            "Content-Type": "application/json; charset=utf-8",
+            "Authorization": f"Bearer {auth_header}",
+        }
+
+        ## LOGGING
+        logging_obj.pre_call(
+            input=input,
+            api_key="",
+            additional_args={
+                "complete_input_dict": request_data,
+                "api_base": url,
+                "headers": headers,
+            },
+        )

        if aembedding is True:
            return self.async_multimodal_embedding(
                model=model,
@@ -205,7 +205,7 @@ def get_vertex_client(
    vertex_credentials: Optional[str],
) -> Tuple[Any, Optional[str]]:
    args = locals()
-    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
        VertexLLM,
    )
@@ -270,7 +270,7 @@ def completion(
        from anthropic import AnthropicVertex

        from litellm.llms.anthropic import AnthropicChatCompletion
-        from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+        from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
            VertexLLM,
        )
    except:
@@ -83,7 +83,7 @@ class VertexAIPartnerModels(BaseLLM):
            from litellm.llms.databricks import DatabricksChatCompletion
            from litellm.llms.openai import OpenAIChatCompletion
            from litellm.llms.text_completion_codestral import CodestralTextCompletion
-            from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+            from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
                VertexLLM,
            )
        except Exception:
@@ -126,12 +126,15 @@ from .llms.vertex_ai_and_google_ai_studio import (
    vertex_ai_anthropic,
    vertex_ai_non_gemini,
)
+from .llms.vertex_ai_and_google_ai_studio.embeddings.batch_embed_content_handler import (
+    GoogleBatchEmbeddings,
+)
+from .llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
+    VertexLLM,
+)
from .llms.vertex_ai_and_google_ai_studio.vertex_ai_partner_models.main import (
    VertexAIPartnerModels,
)
-from .llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
-    VertexLLM,
-)
from .llms.watsonx import IBMWatsonXAI
from .types.llms.openai import HttpxBinaryResponseContent
from .types.utils import (

@@ -172,6 +175,7 @@ triton_chat_completions = TritonChatCompletion()
bedrock_chat_completion = BedrockLLM()
bedrock_converse_chat_completion = BedrockConverseLLM()
vertex_chat_completion = VertexLLM()
+google_batch_embeddings = GoogleBatchEmbeddings()
vertex_partner_models_chat_completion = VertexAIPartnerModels()
vertex_text_to_speech = VertexTextToSpeechAPI()
watsonxai = IBMWatsonXAI()
@@ -3134,6 +3138,7 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse:
        or custom_llm_provider == "fireworks_ai"
        or custom_llm_provider == "ollama"
        or custom_llm_provider == "vertex_ai"
+        or custom_llm_provider == "gemini"
        or custom_llm_provider == "databricks"
        or custom_llm_provider == "watsonx"
        or custom_llm_provider == "cohere"
@@ -3531,6 +3536,26 @@ def embedding(
            client=client,
            aembedding=aembedding,
        )
+    elif custom_llm_provider == "gemini":
+
+        gemini_api_key = api_key or get_secret("GEMINI_API_KEY") or litellm.api_key
+
+        response = google_batch_embeddings.batch_embeddings(  # type: ignore
+            model=model,
+            input=input,
+            encoding=encoding,
+            logging_obj=logging,
+            optional_params=optional_params,
+            model_response=EmbeddingResponse(),
+            vertex_project=None,
+            vertex_location=None,
+            vertex_credentials=None,
+            aembedding=aembedding,
+            print_verbose=print_verbose,
+            custom_llm_provider="gemini",
+            api_key=gemini_api_key,
+        )
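With this branch in place, Google AI Studio embeddings go through the standard litellm entrypoint. A minimal usage sketch, mirroring the new test added below, assuming GEMINI_API_KEY is set in the environment:

```python
import litellm

# Model name comes from this PR's test suite.
response = litellm.embedding(
    model="gemini/text-embedding-004",
    input=["good morning from litellm"],
)
print(response.data[0]["embedding"][:5], response.usage.prompt_tokens)
```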
    elif custom_llm_provider == "vertex_ai":
        vertex_ai_project = (
            optional_params.pop("vertex_project", None)

@@ -3571,6 +3596,7 @@ def embedding(
            vertex_credentials=vertex_credentials,
            aembedding=aembedding,
            print_verbose=print_verbose,
+            custom_llm_provider="vertex_ai",
        )
    else:
        response = vertex_ai_non_gemini.embedding(
@@ -4028,11 +4028,74 @@
        "max_tokens": 8192,
        "max_input_tokens": 8192,
        "max_output_tokens": 8192,
-        "input_cost_per_token": 0.0000004,
+        "input_cost_per_token": 0.0000003,
        "output_cost_per_token": 0.0000006,
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
+    "bedrock/us-east-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/us-west-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0000003,
+        "output_cost_per_token": 0.0000006,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/ap-south-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000036,
+        "output_cost_per_token": 0.00000072,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/ca-central-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000035,
+        "output_cost_per_token": 0.00000069,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/eu-west-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000032,
+        "output_cost_per_token": 0.00000065,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/eu-west-2/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000039,
+        "output_cost_per_token": 0.00000078,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/sa-east-1/meta.llama3-8b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.0000005,
+        "output_cost_per_token": 0.00000101,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
    "meta.llama3-70b-instruct-v1:0": {
        "max_tokens": 8192,
        "max_input_tokens": 8192,

@@ -4042,12 +4105,75 @@
        "litellm_provider": "bedrock",
        "mode": "chat"
    },
+    "bedrock/us-east-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/us-west-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000265,
+        "output_cost_per_token": 0.0000035,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/ap-south-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000318,
+        "output_cost_per_token": 0.0000042,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/ca-central-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000305,
+        "output_cost_per_token": 0.00000403,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/eu-west-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000286,
+        "output_cost_per_token": 0.00000378,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/eu-west-2/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000345,
+        "output_cost_per_token": 0.00000455,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
+    "bedrock/sa-east-1/meta.llama3-70b-instruct-v1:0": {
+        "max_tokens": 8192,
+        "max_input_tokens": 8192,
+        "max_output_tokens": 8192,
+        "input_cost_per_token": 0.00000445,
+        "output_cost_per_token": 0.00000588,
+        "litellm_provider": "bedrock",
+        "mode": "chat"
+    },
    "meta.llama3-1-8b-instruct-v1:0": {
        "max_tokens": 128000,
        "max_input_tokens": 128000,
        "max_output_tokens": 2048,
-        "input_cost_per_token": 0.0000004,
-        "output_cost_per_token": 0.0000006,
+        "input_cost_per_token": 0.00000022,
+        "output_cost_per_token": 0.00000022,
        "litellm_provider": "bedrock",
        "mode": "chat",
        "supports_function_calling": true,

@@ -4057,8 +4183,8 @@
        "max_tokens": 128000,
        "max_input_tokens": 128000,
        "max_output_tokens": 2048,
-        "input_cost_per_token": 0.00000265,
-        "output_cost_per_token": 0.0000035,
+        "input_cost_per_token": 0.00000099,
+        "output_cost_per_token": 0.00000099,
        "litellm_provider": "bedrock",
        "mode": "chat",
        "supports_function_calling": true,
@@ -587,6 +587,7 @@ class GenerateRequestBase(LiteLLMBase):

class GenerateKeyRequest(GenerateRequestBase):
    key_alias: Optional[str] = None
+    key: Optional[str] = None
    duration: Optional[str] = None
    aliases: Optional[dict] = {}
    config: Optional[dict] = {}
@@ -149,7 +149,7 @@ def init_rds_client(
    # boto3 automatically reads env variables

    client = boto3.client(
-        service_name="bedrock-runtime",
+        service_name="rds",
        region_name=region_name,
        config=config,
    )

@@ -168,8 +168,10 @@ def generate_iam_auth_token(db_host, db_port, db_user) -> str:
        aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
        aws_session_name=os.getenv("AWS_SESSION_NAME"),
        aws_profile_name=os.getenv("AWS_PROFILE_NAME"),
-        aws_role_name=os.getenv("AWS_ROLE_NAME"),
-        aws_web_identity_token=os.getenv("AWS_WEB_IDENTITY_TOKEN"),
+        aws_role_name=os.getenv("AWS_ROLE_NAME", os.getenv("AWS_ROLE_ARN")),
+        aws_web_identity_token=os.getenv(
+            "AWS_WEB_IDENTITY_TOKEN", os.getenv("AWS_WEB_IDENTITY_TOKEN_FILE")
+        ),
    )

    token = boto_client.generate_db_auth_token(
@@ -55,6 +55,7 @@ async def generate_key_fn(
    Parameters:
    - duration: Optional[str] - Specify the length of time the token is valid for. You can set duration as seconds ("30s"), minutes ("30m"), hours ("30h"), days ("30d").
    - key_alias: Optional[str] - User defined key alias
+    - key: Optional[str] - User defined key value. If not set, a 16-digit unique sk-key is created for you.
    - team_id: Optional[str] - The team id of the key
    - user_id: Optional[str] - The user id of the key
    - models: Optional[list] - Model_name's a user is allowed to call. (if empty, key is allowed to call all models)

@@ -728,6 +729,9 @@ async def generate_key_helper_fn(
    max_budget: Optional[float] = None,  # max_budget is used to Budget Per user
    budget_duration: Optional[str] = None,  # budget_duration is used to reset the budget per user
    token: Optional[str] = None,
+    key: Optional[
+        str
+    ] = None,  # dev-friendly alt param for 'token'. Exposed on `/key/generate` for setting key value yourself.
    user_id: Optional[str] = None,
    team_id: Optional[str] = None,
    user_email: Optional[str] = None,

@@ -763,7 +767,10 @@ async def generate_key_helper_fn(
    )

    if token is None:
-        token = f"sk-{secrets.token_urlsafe(16)}"
+        if key is not None:
+            token = key
+        else:
+            token = f"sk-{secrets.token_urlsafe(16)}"

    if duration is None:  # allow tokens that never expire
        expires = None
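The new `key` parameter is exposed on `/key/generate`. A hedged sketch of calling it against a local proxy, reusing the placeholder master key from the docs above:

```python
import requests

# Ask the proxy to mint a virtual key with a caller-chosen value.
resp = requests.post(
    "http://0.0.0.0:4000/key/generate",
    headers={"Authorization": "Bearer sk-1234"},
    json={"key": "sk-my-custom-key", "duration": "30d"},
)
print(resp.json())
```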
@@ -28,7 +28,7 @@ from litellm import (
    completion_cost,
    embedding,
)
-from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
    _gemini_convert_messages_with_history,
)
from litellm.tests.test_streaming import streaming_format_tests

@@ -447,7 +447,9 @@ async def test_async_vertexai_response():
    test_models = random.sample(test_models, 1)
    test_models += litellm.vertex_language_models  # always test gemini-pro
    for model in test_models:
-        print(f"model being tested in async call: {model}")
+        print(
+            f"model being tested in async call: {model}, litellm.vertex_language_models: {litellm.vertex_language_models}"
+        )
        if model in VERTEX_MODELS_TO_NOT_TEST or (
            "gecko" in model or "32k" in model or "ultra" in model or "002" in model
        ):

@@ -2088,7 +2090,7 @@ def test_prompt_factory_nested():


def test_get_token_url():
-    from litellm.llms.vertex_ai_and_google_ai_studio.vertex_and_google_ai_studio_gemini import (
+    from litellm.llms.vertex_ai_and_google_ai_studio.gemini.vertex_and_google_ai_studio_gemini import (
        VertexLLM,
    )
@@ -695,6 +695,33 @@ async def test_triton_embeddings():
        pytest.fail(f"Error occurred: {e}")


+@pytest.mark.parametrize("sync_mode", [True, False])
+@pytest.mark.parametrize(
+    "input", ["good morning from litellm", ["good morning from litellm"]]  #
+)
+@pytest.mark.asyncio
+async def test_gemini_embeddings(sync_mode, input):
+    try:
+        litellm.set_verbose = True
+        if sync_mode:
+            response = litellm.embedding(
+                model="gemini/text-embedding-004",
+                input=input,
+            )
+        else:
+            response = await litellm.aembedding(
+                model="gemini/text-embedding-004",
+                input=input,
+            )
+        print(f"response: {response}")
+
+        # stubbed endpoint is setup to return this
+        assert isinstance(response.data[0]["embedding"], list)
+        assert response.usage.prompt_tokens > 0
+    except Exception as e:
+        pytest.fail(f"Error occurred: {e}")


@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_databricks_embeddings(sync_mode):
@@ -30,6 +30,7 @@ from openai.types.beta.threads.message import Message as OpenAIMessage
from openai.types.beta.threads.message_content import MessageContent
from openai.types.beta.threads.run import Run
from openai.types.chat import ChatCompletionChunk
+from openai.types.embedding import Embedding as OpenAIEmbedding
from pydantic import BaseModel, Field
from typing_extensions import Dict, Required, TypedDict, override

@@ -47,6 +48,9 @@ FileTypes = Union[
]


+EmbeddingInput = Union[str, List[str]]


class NotGiven:
    """
    A sentinel singleton class used to distinguish omitted keyword arguments
@@ -336,3 +336,41 @@ class VertexMultimodalEmbeddingRequest(TypedDict, total=False):
class VertexAICachedContentResponseObject(TypedDict):
    name: str
    model: str


+class TaskTypeEnum(Enum):
+    TASK_TYPE_UNSPECIFIED = "TASK_TYPE_UNSPECIFIED"
+    RETRIEVAL_QUERY = "RETRIEVAL_QUERY"
+    RETRIEVAL_DOCUMENT = "RETRIEVAL_DOCUMENT"
+    SEMANTIC_SIMILARITY = "SEMANTIC_SIMILARITY"
+    CLASSIFICATION = "CLASSIFICATION"
+    CLUSTERING = "CLUSTERING"
+    QUESTION_ANSWERING = "QUESTION_ANSWERING"
+    FACT_VERIFICATION = "FACT_VERIFICATION"
+
+
+class VertexAITextEmbeddingsRequestBody(TypedDict, total=False):
+    content: Required[ContentType]
+    taskType: TaskTypeEnum
+    title: str
+    outputDimensionality: int
+
+
+class ContentEmbeddings(TypedDict):
+    values: List[int]
+
+
+class VertexAITextEmbeddingsResponseObject(TypedDict):
+    embedding: ContentEmbeddings
+
+
+class EmbedContentRequest(VertexAITextEmbeddingsRequestBody):
+    model: Required[str]
+
+
+class VertexAIBatchEmbeddingsRequestBody(TypedDict, total=False):
+    requests: List[EmbedContentRequest]
+
+
+class VertexAIBatchEmbeddingsResponseObject(TypedDict):
+    embeddings: List[ContentEmbeddings]
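For orientation, the new TypedDicts compose like this (a sketch only; the model name is a placeholder):

```python
request: EmbedContentRequest = {
    "model": "models/text-embedding-004",  # placeholder
    "content": {"parts": [{"text": "good morning from litellm"}]},
    "taskType": TaskTypeEnum.RETRIEVAL_DOCUMENT,
}
body: VertexAIBatchEmbeddingsRequestBody = {"requests": [request]}
```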