LiteLLM Minor Fixes & Improvements (10/24/2024) (#6421)
* fix(utils.py): support passing dynamic api base to validate_environment. Returns True if just the api base is required and an api base is passed.
* fix(litellm_pre_call_utils.py): feature flag sending client headers to llm api. Fixes https://github.com/BerriAI/litellm/issues/6410
* fix(anthropic/chat/transformation.py): return correct error message
* fix(http_handler.py): add error response text in places where we expect it
* fix(factory.py): handle base case of no non-system messages to bedrock. Fixes https://github.com/BerriAI/litellm/issues/6411
* feat(cohere/embed): support Cohere image embeddings. Closes https://github.com/BerriAI/litellm/issues/6413
* fix(__init__.py): fix linting error
* docs(supported_embedding.md): add image embedding example to docs
* feat(cohere/embed): use Cohere embedding returned usage for cost calc
* build(model_prices_and_context_window.json): add embed-english-v3.0 details (image cost + 'supports_image_input' flag)
* fix(cohere_transformation.py): fix linting error
* test(test_proxy_server.py): clean up test
* test: clean up test
* fix: fix linting errors
This commit is contained in:
parent 38708a355a
commit c03e5da41f

23 changed files with 417 additions and 150 deletions
@@ -84,6 +84,60 @@ print(query_result[:5])
</TabItem>
</Tabs>


## Image Embeddings

For models that support image embeddings, you can pass in a base64 encoded image string to the `input` param.

<Tabs>
<TabItem value="sdk" label="SDK">

```python
from litellm import embedding
import os

# set your api key
os.environ["COHERE_API_KEY"] = ""

response = embedding(model="cohere/embed-english-v3.0", input=["<base64 encoded image>"])
```

</TabItem>
<TabItem value="proxy" label="PROXY">

1. Setup config.yaml

```yaml
model_list:
  - model_name: cohere-embed
    litellm_params:
      model: cohere/embed-english-v3.0
      api_key: os.environ/COHERE_API_KEY
```


2. Start proxy

```bash
litellm --config /path/to/config.yaml

# RUNNING on http://0.0.0.0:4000
```

3. Test it!

```bash
curl -X POST 'http://0.0.0.0:4000/v1/embeddings' \
-H 'Authorization: Bearer sk-54d77cd67b9febbb' \
-H 'Content-Type: application/json' \
-d '{
  "model": "cohere/embed-english-v3.0",
  "input": ["<base64 encoded image>"]
}'
```
</TabItem>
</Tabs>
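If you need to produce that base64 string yourself, here is a minimal sketch (not part of the original docs; the `image.png` path and the `data:image/png;base64,` prefix are illustrative assumptions):

```python
import base64
import os

from litellm import embedding

os.environ["COHERE_API_KEY"] = ""

# Encode a local image as a base64 data URI (path and mime type are placeholders)
with open("image.png", "rb") as f:
    encoded = base64.b64encode(f.read()).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded}"

response = embedding(model="cohere/embed-english-v3.0", input=[base64_image])
print(response.usage)
```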

## Input Params for `litellm.embedding()`
@@ -814,6 +814,7 @@ general_settings:
| pass_through_endpoints | List[Dict[str, Any]] | Define the pass through endpoints. [Docs](./pass_through) |
| enable_oauth2_proxy_auth | boolean | (Enterprise Feature) If true, enables oauth2.0 authentication |
| forward_openai_org_id | boolean | If true, forwards the OpenAI Organization ID to the backend LLM call (if it's OpenAI). |
| forward_client_headers_to_llm_api | boolean | If true, forwards the client headers (any `x-` headers) to the backend LLM call |

### router_settings - Reference
@@ -8,6 +8,7 @@ import os
from typing import Callable, List, Optional, Dict, Union, Any, Literal, get_args
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.caching.caching import Cache, DualCache, RedisCache, InMemoryCache
from litellm.types.llms.bedrock import COHERE_EMBEDDING_INPUT_TYPES
from litellm._logging import (
    set_verbose,
    _turn_on_debug,

@@ -136,7 +137,7 @@ enable_azure_ad_token_refresh: Optional[bool] = False
### DEFAULT AZURE API VERSION ###
AZURE_DEFAULT_API_VERSION = "2024-08-01-preview" # this is updated to the latest
### COHERE EMBEDDINGS DEFAULT TYPE ###
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE = "search_document"
COHERE_DEFAULT_EMBEDDING_INPUT_TYPE: COHERE_EMBEDDING_INPUT_TYPES = "search_document"
### GUARDRAILS ###
llamaguard_model_name: Optional[str] = None
openai_moderations_model_name: Optional[str] = None
@@ -333,6 +333,14 @@ def _get_openai_compatible_provider_info( # noqa: PLR0915
    api_key: Optional[str],
    dynamic_api_key: Optional[str],
) -> Tuple[str, str, Optional[str], Optional[str]]:
    """
    Returns:
        Tuple[str, str, Optional[str], Optional[str]]:
            model: str
            custom_llm_provider: str
            dynamic_api_key: Optional[str]
            api_base: Optional[str]
    """
    custom_llm_provider = model.split("/", 1)[0]
    model = model.split("/", 1)[1]
@@ -398,6 +398,8 @@ class AnthropicChatCompletion(BaseLLM):
            error_response = getattr(e, "response", None)
            if error_headers is None and error_response:
                error_headers = getattr(error_response, "headers", None)
            if error_response and hasattr(error_response, "text"):
                error_text = getattr(error_response, "text", error_text)
            raise AnthropicError(
                message=error_text,
                status_code=status_code,
@@ -9,7 +9,7 @@ import httpx
from openai import OpenAI

import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,
@@ -7,6 +7,7 @@ Why separate file? Make it easy to see how transformation works
from typing import List

import litellm
from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig
from litellm.types.llms.bedrock import CohereEmbeddingRequest, CohereEmbeddingResponse
from litellm.types.utils import Embedding, EmbeddingResponse

@@ -26,15 +27,21 @@ class BedrockCohereEmbeddingConfig:
                optional_params["embedding_types"] = v
        return optional_params

    def _is_v3_model(self, model: str) -> bool:
        return "3" in model

    def _transform_request(
        self, input: List[str], inference_params: dict
        self, model: str, input: List[str], inference_params: dict
    ) -> CohereEmbeddingRequest:
        transformed_request = CohereEmbeddingRequest(
            texts=input,
            input_type=litellm.COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,  # type: ignore
        transformed_request = CohereEmbeddingConfig()._transform_request(
            model, input, inference_params
        )

        for k, v in inference_params.items():
            transformed_request[k] = v  # type: ignore
        new_transformed_request = CohereEmbeddingRequest(
            input_type=transformed_request["input_type"],
        )
        for k in CohereEmbeddingRequest.__annotations__.keys():
            if k in transformed_request:
                new_transformed_request[k] = transformed_request[k]  # type: ignore

        return transformed_request
        return new_transformed_request
@@ -11,7 +11,7 @@ from typing import Any, Callable, List, Literal, Optional, Tuple, Union
import httpx

import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.cohere.embed.handler import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
    AsyncHTTPHandler,
    HTTPHandler,

@@ -369,7 +369,7 @@ class BedrockEmbedding(BaseAWSLLM):
        batch_data: Optional[List] = None
        if provider == "cohere":
            data = BedrockCohereEmbeddingConfig()._transform_request(
                input=input, inference_params=inference_params
                model=model, input=input, inference_params=inference_params
            )
        elif provider == "amazon" and model in [
            "amazon.titan-embed-image-v1",
@@ -12,8 +12,11 @@ import requests # type: ignore
import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
from litellm.types.llms.bedrock import CohereEmbeddingRequest
from litellm.utils import Choices, Message, ModelResponse, Usage

from .transformation import CohereEmbeddingConfig


def validate_environment(api_key, headers: dict):
    headers.update(

@@ -41,39 +44,9 @@ class CohereError(Exception):
        ) # Call the base class constructor with the parameters it needs


def _process_embedding_response(
    embeddings: list,
    model_response: litellm.EmbeddingResponse,
    model: str,
    encoding: Any,
    input: list,
) -> litellm.EmbeddingResponse:
    output_data = []
    for idx, embedding in enumerate(embeddings):
        output_data.append(
            {"object": "embedding", "index": idx, "embedding": embedding}
        )
    model_response.object = "list"
    model_response.data = output_data
    model_response.model = model
    input_tokens = 0
    for text in input:
        input_tokens += len(encoding.encode(text))

    setattr(
        model_response,
        "usage",
        Usage(
            prompt_tokens=input_tokens, completion_tokens=0, total_tokens=input_tokens
        ),
    )

    return model_response


async def async_embedding(
    model: str,
    data: dict,
    data: Union[dict, CohereEmbeddingRequest],
    input: list,
    model_response: litellm.utils.EmbeddingResponse,
    timeout: Optional[Union[float, httpx.Timeout]],

@@ -121,19 +94,12 @@ async def async_embedding(
        )
        raise e

    ## LOGGING
    logging_obj.post_call(
        input=input,
        api_key=api_key,
        additional_args={"complete_input_dict": data},
        original_response=response.text,
    )

    embeddings = response.json()["embeddings"]

    ## PROCESS RESPONSE ##
    return _process_embedding_response(
        embeddings=embeddings,
    return CohereEmbeddingConfig()._transform_response(
        response=response,
        api_key=api_key,
        logging_obj=logging_obj,
        data=data,
        model_response=model_response,
        model=model,
        encoding=encoding,

@@ -149,7 +115,7 @@ def embedding(
    optional_params: dict,
    headers: dict,
    encoding: Any,
    data: Optional[dict] = None,
    data: Optional[Union[dict, CohereEmbeddingRequest]] = None,
    complete_api_base: Optional[str] = None,
    api_key: Optional[str] = None,
    aembedding: Optional[bool] = None,

@@ -159,11 +125,10 @@ def embedding(
    headers = validate_environment(api_key, headers=headers)
    embed_url = complete_api_base or "https://api.cohere.ai/v1/embed"
    model = model
    data = data or {"model": model, "texts": input, **optional_params}

    if "3" in model and "input_type" not in data:
        # cohere v3 embedding models require input_type, if no input_type is provided, default to "search_document"
        data["input_type"] = "search_document"
    data = data or CohereEmbeddingConfig()._transform_request(
        model=model, input=input, inference_params=optional_params
    )

    ## ROUTING
    if aembedding is True:

@@ -193,30 +158,12 @@ def embedding(
        client = HTTPHandler(concurrent_limit=1)

    response = client.post(embed_url, headers=headers, data=json.dumps(data))
    ## LOGGING
    logging_obj.post_call(
        input=input,
        api_key=api_key,
        additional_args={"complete_input_dict": data},
        original_response=response,
    )
    """
    response
    {
        'object': "list",
        'data': [

        ]
        'model',
        'usage'
    }
    """
    if response.status_code != 200:
        raise CohereError(message=response.text, status_code=response.status_code)
    embeddings = response.json()["embeddings"]

    return _process_embedding_response(
        embeddings=embeddings,
    return CohereEmbeddingConfig()._transform_response(
        response=response,
        api_key=api_key,
        logging_obj=logging_obj,
        data=data,
        model_response=model_response,
        model=model,
        encoding=encoding,
litellm/llms/cohere/embed/transformation.py (new file, 160 lines)
@@ -0,0 +1,160 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Cohere's /v1/embed format.

Why separate file? Make it easy to see how transformation works

Covers:
- v3 embedding models
- v2 embedding models

Docs - https://docs.cohere.com/v2/reference/embed
"""

import types
from typing import Any, List, Optional, Union

import httpx

from litellm import COHERE_DEFAULT_EMBEDDING_INPUT_TYPE
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.types.llms.bedrock import (
    COHERE_EMBEDDING_INPUT_TYPES,
    CohereEmbeddingRequest,
    CohereEmbeddingRequestWithModel,
)
from litellm.types.utils import (
    Embedding,
    EmbeddingResponse,
    PromptTokensDetailsWrapper,
    Usage,
)
from litellm.utils import is_base64_encoded


class CohereEmbeddingConfig:
    """
    Reference: https://docs.cohere.com/v2/reference/embed
    """

    def __init__(self) -> None:
        pass

    def get_supported_openai_params(self) -> List[str]:
        return ["encoding_format"]

    def map_openai_params(
        self, non_default_params: dict, optional_params: dict
    ) -> dict:
        for k, v in non_default_params.items():
            if k == "encoding_format":
                optional_params["embedding_types"] = v
        return optional_params

    def _is_v3_model(self, model: str) -> bool:
        return "3" in model

    def _transform_request(
        self, model: str, input: List[str], inference_params: dict
    ) -> CohereEmbeddingRequestWithModel:
        is_encoded = False
        for input_str in input:
            is_encoded = is_base64_encoded(input_str)

        if is_encoded:  # check if string is b64 encoded image or not
            transformed_request = CohereEmbeddingRequestWithModel(
                model=model,
                images=input,
                input_type="image",
            )
        else:
            transformed_request = CohereEmbeddingRequestWithModel(
                model=model,
                texts=input,
                input_type=COHERE_DEFAULT_EMBEDDING_INPUT_TYPE,
            )

        for k, v in inference_params.items():
            transformed_request[k] = v  # type: ignore

        return transformed_request

    def _calculate_usage(self, input: List[str], encoding: Any, meta: dict) -> Usage:

        input_tokens = 0

        text_tokens: Optional[int] = meta.get("billed_units", {}).get("input_tokens")

        image_tokens: Optional[int] = meta.get("billed_units", {}).get("images")

        prompt_tokens_details: Optional[PromptTokensDetailsWrapper] = None
        if image_tokens is None and text_tokens is None:
            for text in input:
                input_tokens += len(encoding.encode(text))
        else:
            prompt_tokens_details = PromptTokensDetailsWrapper(
                image_tokens=image_tokens,
                text_tokens=text_tokens,
            )
            if image_tokens:
                input_tokens += image_tokens
            if text_tokens:
                input_tokens += text_tokens

        return Usage(
            prompt_tokens=input_tokens,
            completion_tokens=0,
            total_tokens=input_tokens,
            prompt_tokens_details=prompt_tokens_details,
        )

    def _transform_response(
        self,
        response: httpx.Response,
        api_key: Optional[str],
        logging_obj: LiteLLMLoggingObj,
        data: Union[dict, CohereEmbeddingRequest],
        model_response: EmbeddingResponse,
        model: str,
        encoding: Any,
        input: list,
    ) -> EmbeddingResponse:

        response_json = response.json()
        ## LOGGING
        logging_obj.post_call(
            input=input,
            api_key=api_key,
            additional_args={"complete_input_dict": data},
            original_response=response_json,
        )
        """
        response
        {
            'object': "list",
            'data': [

            ]
            'model',
            'usage'
        }
        """
        embeddings = response_json["embeddings"]
        output_data = []
        for idx, embedding in enumerate(embeddings):
            output_data.append(
                {"object": "embedding", "index": idx, "embedding": embedding}
            )
        model_response.object = "list"
        model_response.data = output_data
        model_response.model = model
        input_tokens = 0
        for text in input:
            input_tokens += len(encoding.encode(text))

        setattr(
            model_response,
            "usage",
            self._calculate_usage(input, encoding, response_json.get("meta", {})),
        )

        return model_response
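For orientation, a rough usage sketch (mine, not part of the diff) of how `_transform_request` branches on text vs. base64-image input; the example strings and the commented request shapes are assumptions based on the code above:

```python
import base64

from litellm.llms.cohere.embed.transformation import CohereEmbeddingConfig

config = CohereEmbeddingConfig()

# Plain text -> "texts" payload with the default input_type ("search_document")
text_request = config._transform_request(
    model="embed-english-v3.0", input=["hello world"], inference_params={}
)

# Base64 data URI -> "images" payload with input_type="image"
# (the exact accepted format is decided by litellm.utils.is_base64_encoded)
image = "data:image/png;base64," + base64.b64encode(b"\x89PNG...").decode("utf-8")
image_request = config._transform_request(
    model="embed-english-v3.0", input=[image], inference_params={}
)

print(text_request)   # e.g. {'model': ..., 'texts': [...], 'input_type': 'search_document'}
print(image_request)  # e.g. {'model': ..., 'images': [...], 'input_type': 'image'}
```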
@@ -152,8 +152,10 @@ class AsyncHTTPHandler:
            setattr(e, "status_code", e.response.status_code)
            if stream is True:
                setattr(e, "message", await e.response.aread())
                setattr(e, "text", await e.response.aread())
            else:
                setattr(e, "message", e.response.text)
                setattr(e, "text", e.response.text)
            raise e
        except Exception as e:
            raise e
@@ -2429,6 +2429,15 @@ def _bedrock_converse_messages_pt( # noqa: PLR0915
    contents: List[BedrockMessageBlock] = []
    msg_i = 0

    ## BASE CASE ##
    if len(messages) == 0:
        raise litellm.BadRequestError(
            message=BAD_MESSAGE_ERROR_STR
            + "bedrock requires at least one non-system message",
            model=model,
            llm_provider=llm_provider,
        )

    # if initial message is assistant message
    if messages[0].get("role") is not None and messages[0]["role"] == "assistant":
        if user_continue_message is not None:
@@ -113,7 +113,7 @@ from .llms.bedrock.chat import BedrockConverseLLM, BedrockLLM
from .llms.bedrock.embed.embedding import BedrockEmbedding
from .llms.cohere import chat as cohere_chat
from .llms.cohere import completion as cohere_completion  # type: ignore
from .llms.cohere import embed as cohere_embed
from .llms.cohere.embed import handler as cohere_embed
from .llms.custom_llm import CustomLLM, custom_chat_llm_router
from .llms.databricks.chat import DatabricksChatCompletion
from .llms.groq.chat.handler import GroqChatCompletion
@@ -3364,54 +3364,56 @@
    "litellm_provider": "cohere",
    "mode": "rerank"
  },
  "embed-english-v3.0": {
    "max_tokens": 512,
    "max_input_tokens": 512,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-english-light-v3.0": {
    "max_tokens": 512,
    "max_input_tokens": 512,
    "max_tokens": 1024,
    "max_input_tokens": 1024,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-multilingual-v3.0": {
    "max_tokens": 512,
    "max_input_tokens": 512,
    "max_tokens": 1024,
    "max_input_tokens": 1024,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-english-v2.0": {
    "max_tokens": 512,
    "max_input_tokens": 512,
    "max_tokens": 4096,
    "max_input_tokens": 4096,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-english-light-v2.0": {
    "max_tokens": 512,
    "max_input_tokens": 512,
    "max_tokens": 1024,
    "max_input_tokens": 1024,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-multilingual-v2.0": {
    "max_tokens": 256,
    "max_input_tokens": 256,
    "max_tokens": 768,
    "max_input_tokens": 768,
    "input_cost_per_token": 0.00000010,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding"
  },
  "embed-english-v3.0": {
    "max_tokens": 1024,
    "max_input_tokens": 1024,
    "input_cost_per_token": 0.00000010,
    "input_cost_per_image": 0.0001,
    "output_cost_per_token": 0.00000,
    "litellm_provider": "cohere",
    "mode": "embedding",
    "supports_image_input": true
  },
  "replicate/meta/llama-2-13b": {
    "max_tokens": 4096,
    "max_input_tokens": 4096,
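To make the new pricing fields concrete, here is a back-of-the-envelope sketch (my assumption about how these fields combine, not code from the diff): text tokens are billed per token and images per image, using the billed units Cohere returns.

```python
# Pricing for embed-english-v3.0 as listed above
input_cost_per_token = 0.00000010
input_cost_per_image = 0.0001

# Hypothetical billed units, as surfaced in usage.prompt_tokens_details
text_tokens = 120
images = 2

estimated_cost = text_tokens * input_cost_per_token + images * input_cost_per_image
print(f"estimated cost: ${estimated_cost:.6f}")  # 0.000212
```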
@@ -238,11 +238,15 @@ class LiteLLMProxyRequestSetup:
        - Adds org id
        """
        data = LitellmDataForBackendLLMCall()
        _headers = LiteLLMProxyRequestSetup.add_headers_to_llm_call(
            headers, user_api_key_dict
        )
        if _headers != {}:
            data["headers"] = _headers
        if (
            general_settings
            and general_settings.get("forward_client_headers_to_llm_api") is True
        ):
            _headers = LiteLLMProxyRequestSetup.add_headers_to_llm_call(
                headers, user_api_key_dict
            )
            if _headers != {}:
                data["headers"] = _headers
        _organization = LiteLLMProxyRequestSetup.get_openai_org_id_from_headers(
            headers, general_settings
        )
@@ -210,15 +210,23 @@ class ServerSentEvent:
        return f"ServerSentEvent(event={self.event}, data={self.data}, id={self.id}, retry={self.retry})"


COHERE_EMBEDDING_INPUT_TYPES = Literal[
    "search_document", "search_query", "classification", "clustering", "image"
]


class CohereEmbeddingRequest(TypedDict, total=False):
    texts: Required[List[str]]
    input_type: Required[
        Literal["search_document", "search_query", "classification", "clustering"]
    ]
    texts: List[str]
    images: List[str]
    input_type: Required[COHERE_EMBEDDING_INPUT_TYPES]
    truncate: Literal["NONE", "START", "END"]
    embedding_types: Literal["float", "int8", "uint8", "binary", "ubinary"]


class CohereEmbeddingRequestWithModel(CohereEmbeddingRequest):
    model: Required[str]


class CohereEmbeddingResponse(TypedDict):
    embeddings: List[List[float]]
    id: str
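As a quick illustration (mine, not from the diff), the widened request type now admits either a `texts` or an `images` payload:

```python
from litellm.types.llms.bedrock import CohereEmbeddingRequestWithModel

# Text embedding request
text_req = CohereEmbeddingRequestWithModel(
    model="embed-english-v3.0",
    texts=["hello world"],
    input_type="search_document",
)

# Image embedding request (base64 strings go under "images")
image_req = CohereEmbeddingRequestWithModel(
    model="embed-english-v3.0",
    images=["<base64 encoded image>"],
    input_type="image",
)
```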
@@ -5197,7 +5197,9 @@ def create_proxy_transport_and_mounts():


def validate_environment(  # noqa: PLR0915
    model: Optional[str] = None, api_key: Optional[str] = None
    model: Optional[str] = None,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
) -> dict:
    """
    Checks if the environment variables are valid for the given model.

@@ -5224,11 +5226,6 @@ def validate_environment(  # noqa: PLR0915
            _, custom_llm_provider, _, _ = get_llm_provider(model=model)
        except Exception:
            custom_llm_provider = None
    # # check if llm provider part of model name
    # if model.split("/",1)[0] in litellm.provider_list:
    #     custom_llm_provider = model.split("/", 1)[0]
    #     model = model.split("/", 1)[1]
    #     custom_llm_provider_passed_in = True

    if custom_llm_provider:
        if custom_llm_provider == "openai":

@@ -5497,6 +5494,17 @@ def validate_environment(  # noqa: PLR0915
            if "api_key" not in key.lower():
                new_missing_keys.append(key)
        missing_keys = new_missing_keys

    if api_base is not None:
        new_missing_keys = []
        for key in missing_keys:
            if "api_base" not in key.lower():
                new_missing_keys.append(key)
        missing_keys = new_missing_keys

    if len(missing_keys) == 0:  # no missing keys
        keys_in_environment = True

    return {"keys_in_environment": keys_in_environment, "missing_keys": missing_keys}
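A minimal sketch of the new dynamic api_base behaviour (mirroring the test added further down; assumes the top-level `litellm.validate_environment` export):

```python
from litellm import validate_environment

# Ollama only requires an api base; passing it dynamically now satisfies the check
result = validate_environment(model="ollama/mistral", api_base="https://example.com")
print(result)  # expected: {'keys_in_environment': True, 'missing_keys': []}
```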
(The same @@ -3364,54 +3364,56 @@ pricing hunk appears a second time at this point in the diff, identical to the one shown above.)
File diff suppressed because one or more lines are too long
@@ -160,3 +160,12 @@ def test_get_llm_provider_jina_ai():
    assert custom_llm_provider == "openai_like"
    assert api_base == "https://api.jina.ai/v1"
    assert model == "jina-embeddings-v3"


def test_get_llm_provider_hosted_vllm():
    model, custom_llm_provider, dynamic_api_key, api_base = litellm.get_llm_provider(
        model="hosted_vllm/llama-3.1-70b-instruct",
    )
    assert custom_llm_provider == "hosted_vllm"
    assert model == "llama-3.1-70b-instruct"
    assert dynamic_api_key == ""
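For context, a hedged sketch of how a `hosted_vllm/` model is typically called (the api_base URL is a placeholder; this call is not part of the diff):

```python
import litellm

response = litellm.completion(
    model="hosted_vllm/llama-3.1-70b-instruct",
    messages=[{"role": "user", "content": "hello"}],
    api_base="https://my-vllm-server.example.com/v1",  # your OpenAI-compatible vLLM endpoint
)
print(response.choices[0].message.content)
```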
@@ -675,3 +675,15 @@ def test_alternating_roles_e2e():
            "stream": False,
        }
    )


def test_just_system_message():
    from litellm.llms.prompt_templates.factory import _bedrock_converse_messages_pt

    with pytest.raises(litellm.BadRequestError) as e:
        _bedrock_converse_messages_pt(
            messages=[],
            model="anthropic.claude-3-sonnet-20240229-v1:0",
            llm_provider="bedrock",
        )
    assert "bedrock requires at least one non-system message" in str(e.value)
@@ -225,12 +225,20 @@ def test_add_headers_to_request(litellm_key_header_name):
    "litellm_key_header_name",
    ["x-litellm-key", None],
)
@pytest.mark.parametrize(
    "forward_headers",
    [True, False],
)
@mock_patch_acompletion()
def test_chat_completion_forward_headers(
    mock_acompletion, client_no_auth, litellm_key_header_name
    mock_acompletion, client_no_auth, litellm_key_header_name, forward_headers
):
    global headers
    try:
        if forward_headers:
            gs = getattr(litellm.proxy.proxy_server, "general_settings")
            gs["forward_client_headers_to_llm_api"] = True
            setattr(litellm.proxy.proxy_server, "general_settings", gs)
        if litellm_key_header_name is not None:
            gs = getattr(litellm.proxy.proxy_server, "general_settings")
            gs["litellm_key_header_name"] = litellm_key_header_name

@@ -260,23 +268,14 @@ def test_chat_completion_forward_headers(
        response = client_no_auth.post(
            "/v1/chat/completions", json=test_data, headers=received_headers
        )
        mock_acompletion.assert_called_once_with(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "user", "content": "hi"},
            ],
            max_tokens=10,
            litellm_call_id=mock.ANY,
            litellm_logging_obj=mock.ANY,
            request_timeout=mock.ANY,
            specific_deployment=True,
            metadata=mock.ANY,
            proxy_server_request=mock.ANY,
            headers={
        if not forward_headers:
            assert "headers" not in mock_acompletion.call_args.kwargs
        else:
            assert mock_acompletion.call_args.kwargs["headers"] == {
                "x-custom-header": "Custom-Value",
                "x-another-header": "Another-Value",
            },
        )
            }

        print(f"response - {response.text}")
        assert response.status_code == 200
        result = response.json()
@@ -331,6 +331,13 @@ def test_validate_environment_api_key():
    ), f"Missing keys={response_obj['missing_keys']}"


def test_validate_environment_api_base_dynamic():
    for provider in ["ollama", "ollama_chat"]:
        kv = validate_environment(provider + "/mistral", api_base="https://example.com")
        assert kv["keys_in_environment"]
        assert kv["missing_keys"] == []


@mock.patch.dict(os.environ, {"OLLAMA_API_BASE": "foo"}, clear=True)
def test_validate_environment_ollama():
    for provider in ["ollama", "ollama_chat"]: