LiteLLM Minor Fixes & Improvements (09/24/2024) (#5880)

* LiteLLM Minor Fixes & Improvements (09/23/2024)  (#5842)

* feat(auth_utils.py): enable admin to allow client-side credentials to be passed

Makes it easier for devs to experiment with fine-tuned Fireworks AI models

* feat(router.py): allow setting configurable_clientside_auth_params for a model (a config sketch follows at the end of this commit message)

Closes https://github.com/BerriAI/litellm/issues/5843

* build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit

Fixes https://github.com/BerriAI/litellm/issues/5850

* fix(azure_ai/): support content list for azure ai

Fixes https://github.com/BerriAI/litellm/issues/4237

* fix(litellm_logging.py): always set saved_cache_cost

Set to 0 by default

* fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing

Handles calling 405b+ size models

* fix(slack_alerting.py): fix error alerting for failed spend tracking

Fixes regression with slack alerting error monitoring

* fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error

* docs(bedrock.md): add llama3-1 models

* test: fix tests

* fix(azure_ai/chat): fix transformation for azure ai calls

* feat(azure_ai/embed): Add azure ai embeddings support

Closes https://github.com/BerriAI/litellm/issues/5861

* fix(azure_ai/embed): enable async embedding

* feat(azure_ai/embed): support azure ai multimodal embeddings

* fix(azure_ai/embed): support async multi modal embeddings

* feat(together_ai/embed): support together ai embedding calls

* feat(rerank/main.py): log source documents for rerank endpoints to langfuse

Improves rerank endpoint logging

* fix(langfuse.py): support logging `/audio/speech` input to langfuse

* test(test_embedding.py): fix test

* test(test_completion_cost.py): fix helper util
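
A hedged config sketch for the client-side credential items above: it assumes `configurable_clientside_auth_params` is accepted under a deployment's `litellm_params` (the exact key placement is an assumption), and the model/account names are placeholders.

```python
# Sketch only: allow clients to pass their own api_key/api_base for this deployment.
# Key placement under litellm_params is an assumption; names below are placeholders.
from litellm import Router

router = Router(
    model_list=[
        {
            "model_name": "fireworks-finetune",
            "litellm_params": {
                "model": "fireworks_ai/accounts/my-account/models/my-finetune",
                "configurable_clientside_auth_params": ["api_key", "api_base"],
            },
        }
    ]
)
```
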
Krish Dholakia 2024-09-25 22:11:57 -07:00 committed by GitHub
parent 5bc5eaff8a
commit 16c0307eab
25 changed files with 1675 additions and 340 deletions

View file

@ -901,7 +901,7 @@ from .llms.cohere.completion import CohereConfig
from .llms.clarifai import ClarifaiConfig
from .llms.AI21.completion import AI21Config
from .llms.AI21.chat import AI21ChatConfig
from .llms.together_ai import TogetherAIConfig
from .llms.together_ai.chat import TogetherAIConfig
from .llms.cloudflare import CloudflareConfig
from .llms.palm import PalmConfig
from .llms.gemini import GeminiConfig

View file

@ -28,6 +28,7 @@ from litellm.llms.databricks.cost_calculator import (
from litellm.llms.fireworks_ai.cost_calculator import (
cost_per_token as fireworks_ai_cost_per_token,
)
from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
from litellm.rerank_api.types import RerankResponse
from litellm.types.llms.openai import HttpxBinaryResponseContent
from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS
@ -395,48 +396,6 @@ def cost_per_token(
)
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
import re
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_replicate_completion_pricing(completion_response: dict, total_time=0.0):
# see https://replicate.com/pricing
# for all litellm currently supported LLMs, almost all requests go to a100_80gb
@ -477,7 +436,7 @@ def _select_model_name_for_cost_calc(
if isinstance(completion_response, str):
return return_model
elif return_model is None:
elif return_model is None and hasattr(completion_response, "get"):
return_model = completion_response.get("model", "") # type: ignore
hidden_params = getattr(completion_response, "_hidden_params", None)
@ -716,7 +675,9 @@ def completion_cost(
):
# together ai prices based on size of llm
# get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json
model = get_model_params_and_category(model)
model = get_model_params_and_category(model, call_type=CallTypes(call_type))
# replicate llms are calculate based on time for request running
# see https://replicate.com/pricing
elif (

View file

@ -204,6 +204,11 @@ class LangFuseLogger:
):
input = prompt
output = response_obj["choices"][0]["message"].json()
elif response_obj is not None and isinstance(
response_obj, litellm.HttpxBinaryResponseContent
):
input = prompt
output = "speech-output"
elif response_obj is not None and isinstance(
response_obj, litellm.TextCompletionResponse
):
@ -549,7 +554,10 @@ class LangFuseLogger:
generation_id = None
usage = None
if response_obj is not None:
if response_obj.get("id", None) is not None:
if (
hasattr(response_obj, "id")
and response_obj.get("id", None) is not None
):
generation_id = litellm.utils.get_logging_id(
start_time, response_obj
)
@ -571,8 +579,8 @@ class LangFuseLogger:
if _user_api_key_alias is not None:
generation_name = f"litellm:{_user_api_key_alias}"
if response_obj is not None and "system_fingerprint" in response_obj:
system_fingerprint = response_obj.get("system_fingerprint", None)
if response_obj is not None:
system_fingerprint = getattr(response_obj, "system_fingerprint", None)
else:
system_fingerprint = None

View file

@ -1215,7 +1215,6 @@ class OpenAIChatCompletion(BaseLLM):
client: Optional[AsyncOpenAI] = None,
max_retries=None,
):
response = None
try:
openai_aclient: AsyncOpenAI = self._get_openai_client( # type: ignore
is_async=True,
@ -1237,12 +1236,15 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
return convert_to_model_response_object(
returned_response: (
litellm.EmbeddingResponse
) = convert_to_model_response_object(
response_object=stringified_response,
model_response_object=model_response,
response_type="embedding",
_response_headers=headers,
) # type: ignore
return returned_response
except OpenAIError as e:
## LOGGING
logging_obj.post_call(
@ -1284,7 +1286,6 @@ class OpenAIChatCompletion(BaseLLM):
aembedding=None,
):
super().embedding()
exception_mapping_worked = False
try:
model = model
data = {"model": model, "input": input, **optional_params}
@ -1299,7 +1300,7 @@ class OpenAIChatCompletion(BaseLLM):
)
if aembedding is True:
response = self.aembedding(
async_response = self.aembedding(
data=data,
input=input,
logging_obj=logging_obj,
@ -1310,7 +1311,7 @@ class OpenAIChatCompletion(BaseLLM):
client=client,
max_retries=max_retries,
)
return response
return async_response
openai_client: OpenAI = self._get_openai_client( # type: ignore
is_async=False,
@ -1335,12 +1336,13 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=sync_embedding_response,
)
return convert_to_model_response_object(
response: litellm.EmbeddingResponse = convert_to_model_response_object(
response_object=sync_embedding_response.model_dump(),
model_response_object=model_response,
_response_headers=headers,
response_type="embedding",
) # type: ignore
return response
except OpenAIError as e:
raise e
except Exception as e:

View file

@ -0,0 +1,3 @@
from .chat.handler import AzureAIChatCompletion
from .embed.handler import AzureAIEmbedding
from .rerank.handler import AzureAIRerank

View file

View file

@ -0,0 +1,98 @@
"""
Transformation logic from OpenAI /v1/embeddings format to Azure AI Cohere's /v1/embed.
Why separate file? Make it easy to see how transformation works
Converts to:
- Cohere request format
Docs - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html
"""
from typing import List, Optional, Tuple, Union
from litellm.types.llms.azure_ai import ImageEmbeddingInput, ImageEmbeddingRequest
from litellm.types.llms.openai import EmbeddingCreateParams
from litellm.types.utils import Embedding, EmbeddingResponse, Usage
from litellm.utils import is_base64_encoded
class AzureAICohereConfig:
def __init__(self) -> None:
pass
def _map_azure_model_group(self, model: str) -> str:
if "model=offer-cohere-embed-multili-paygo":
return "Cohere-embed-v3-multilingual"
elif "model=offer-cohere-embed-english-paygo":
return "Cohere-embed-v3-english"
return model
def _transform_request_image_embeddings(
self, input: List[str], optional_params: dict
) -> ImageEmbeddingRequest:
"""
Assume all str in list is base64 encoded string
"""
image_input: List[ImageEmbeddingInput] = []
for i in input:
embedding_input = ImageEmbeddingInput(image=i)
image_input.append(embedding_input)
return ImageEmbeddingRequest(input=image_input, **optional_params)
def _transform_request(
self, input: List[str], optional_params: dict, model: str
) -> Tuple[ImageEmbeddingRequest, EmbeddingCreateParams, List[int]]:
"""
Return the list of input to `/image/embeddings`, `/v1/embeddings`, list of image_embedding_idx for recombination
"""
image_embeddings: List[str] = []
image_embedding_idx: List[int] = []
for idx, i in enumerate(input):
"""
- is base64 -> route to image embeddings
- is ImageEmbeddingInput -> route to image embeddings
- else -> route to `/v1/embeddings`
"""
if is_base64_encoded(i):
image_embeddings.append(i)
image_embedding_idx.append(idx)
## REMOVE IMAGE EMBEDDINGS FROM input list
filtered_input = [
item for idx, item in enumerate(input) if idx not in image_embedding_idx
]
v1_embeddings_request = EmbeddingCreateParams(
input=filtered_input, model=model, **optional_params
)
image_embeddings_request = self._transform_request_image_embeddings(
input=image_embeddings, optional_params=optional_params
)
return image_embeddings_request, v1_embeddings_request, image_embedding_idx
def _transform_response(self, response: EmbeddingResponse) -> EmbeddingResponse:
additional_headers: Optional[dict] = response._hidden_params.get(
"additional_headers"
)
if additional_headers:
# CALCULATE USAGE
input_tokens: Optional[str] = additional_headers.get(
"llm_provider-num_tokens"
)
if input_tokens:
if response.usage:
response.usage.prompt_tokens = int(input_tokens)
else:
response.usage = Usage(prompt_tokens=int(input_tokens))
# SET MODEL
base_model: Optional[str] = additional_headers.get(
"llm_provider-azureml-model-group"
)
if base_model:
response.model = self._map_azure_model_group(base_model)
return response
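
A minimal sketch of how the request split above behaves, assuming the module path `litellm.llms.azure_ai.embed.cohere_transformation` added in this commit; the input strings are placeholders.

```python
# Sketch: base64 "data:" inputs route to /images/embeddings, plain text to /v1/embeddings.
from litellm.llms.azure_ai.embed.cohere_transformation import AzureAICohereConfig

inputs = [
    "good morning from litellm",                       # text -> /v1/embeddings
    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==",  # base64 image -> /images/embeddings
]
image_req, text_req, image_idx = AzureAICohereConfig()._transform_request(
    input=inputs, optional_params={}, model="Cohere-embed-v3-multilingual"
)
print(image_idx)          # [1] - position of the image input in the original list
print(text_req["input"])  # ['good morning from litellm']
```
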

View file

@ -0,0 +1,296 @@
import asyncio
import copy
import json
import os
from copy import deepcopy
from typing import Any, Callable, List, Literal, Optional, Tuple, Union
import httpx
from openai import OpenAI
import litellm
from litellm.llms.cohere.embed import embedding as cohere_embedding
from litellm.llms.custom_httpx.http_handler import (
AsyncHTTPHandler,
HTTPHandler,
_get_httpx_client,
get_async_httpx_client,
)
from litellm.llms.OpenAI.openai import OpenAIChatCompletion
from litellm.types.llms.azure_ai import ImageEmbeddingRequest
from litellm.types.utils import Embedding, EmbeddingResponse
from litellm.utils import convert_to_model_response_object, is_base64_encoded
from .cohere_transformation import AzureAICohereConfig
class AzureAIEmbedding(OpenAIChatCompletion):
def _process_response(
self,
image_embedding_responses: Optional[List],
text_embedding_responses: Optional[List],
image_embeddings_idx: List[int],
model_response: EmbeddingResponse,
input: List,
):
combined_responses = []
if (
image_embedding_responses is not None
and text_embedding_responses is not None
):
# Combine and order the results
text_idx = 0
image_idx = 0
for idx in range(len(input)):
if idx in image_embeddings_idx:
combined_responses.append(image_embedding_responses[image_idx])
image_idx += 1
else:
combined_responses.append(text_embedding_responses[text_idx])
text_idx += 1
model_response.data = combined_responses
elif image_embedding_responses is not None:
model_response.data = image_embedding_responses
elif text_embedding_responses is not None:
model_response.data = text_embedding_responses
response = AzureAICohereConfig()._transform_response(response=model_response) # type: ignore
return response
async def async_image_embedding(
self,
model: str,
data: ImageEmbeddingRequest,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str],
api_base: Optional[str],
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> EmbeddingResponse:
if client is None or not isinstance(client, AsyncHTTPHandler):
client = AsyncHTTPHandler(timeout=timeout, concurrent_limit=1)
url = "{}/images/embeddings".format(api_base)
response = await client.post(
url=url,
json=data, # type: ignore
headers={"Authorization": "Bearer {}".format(api_key)},
)
embedding_response = response.json()
embedding_headers = dict(response.headers)
returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore
response_object=embedding_response,
model_response_object=model_response,
response_type="embedding",
stream=False,
_response_headers=embedding_headers,
)
return returned_response
def image_embedding(
self,
model: str,
data: ImageEmbeddingRequest,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str],
api_base: Optional[str],
client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
):
if api_base is None:
raise ValueError(
"api_base is None. Please set AZURE_AI_API_BASE or dynamically via `api_base` param, to make the request."
)
if api_key is None:
raise ValueError(
"api_key is None. Please set AZURE_AI_API_KEY or dynamically via `api_key` param, to make the request."
)
if client is None or not isinstance(client, HTTPHandler):
client = HTTPHandler(timeout=timeout, concurrent_limit=1)
url = "{}/images/embeddings".format(api_base)
response = client.post(
url=url,
json=data, # type: ignore
headers={"Authorization": "Bearer {}".format(api_key)},
)
embedding_response = response.json()
embedding_headers = dict(response.headers)
returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore
response_object=embedding_response,
model_response_object=model_response,
response_type="embedding",
stream=False,
_response_headers=embedding_headers,
)
return returned_response
async def async_embedding(
self,
model: str,
input: List,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
) -> EmbeddingResponse:
(
image_embeddings_request,
v1_embeddings_request,
image_embeddings_idx,
) = AzureAICohereConfig()._transform_request(
input=input, optional_params=optional_params, model=model
)
image_embedding_responses: Optional[List] = None
text_embedding_responses: Optional[List] = None
if image_embeddings_request["input"]:
image_response = await self.async_image_embedding(
model=model,
data=image_embeddings_request,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
)
image_embedding_responses = image_response.data
if image_embedding_responses is None:
raise Exception("/image/embeddings route returned None Embeddings.")
if v1_embeddings_request["input"]:
response: EmbeddingResponse = await super().embedding( # type: ignore
model=model,
input=input,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
aembedding=True,
)
text_embedding_responses = response.data
if text_embedding_responses is None:
raise Exception("/v1/embeddings route returned None Embeddings.")
return self._process_response(
image_embedding_responses=image_embedding_responses,
text_embedding_responses=text_embedding_responses,
image_embeddings_idx=image_embeddings_idx,
model_response=model_response,
input=input,
)
def embedding(
self,
model: str,
input: List,
timeout: float,
logging_obj,
model_response: litellm.EmbeddingResponse,
optional_params: dict,
api_key: Optional[str] = None,
api_base: Optional[str] = None,
client=None,
aembedding=None,
):
"""
- Separate image url from text
-> route image url call to `/image/embeddings`
-> route text call to `/v1/embeddings` (OpenAI route)
assemble result in-order, and return
"""
if aembedding is True:
return self.async_embedding(
model,
input,
timeout,
logging_obj,
model_response,
optional_params,
api_key,
api_base,
client,
)
(
image_embeddings_request,
v1_embeddings_request,
image_embeddings_idx,
) = AzureAICohereConfig()._transform_request(
input=input, optional_params=optional_params, model=model
)
image_embedding_responses: Optional[List] = None
text_embedding_responses: Optional[List] = None
if image_embeddings_request["input"]:
image_response = self.image_embedding(
model=model,
data=image_embeddings_request,
timeout=timeout,
logging_obj=logging_obj,
model_response=model_response,
optional_params=optional_params,
api_key=api_key,
api_base=api_base,
client=client,
)
image_embedding_responses = image_response.data
if image_embedding_responses is None:
raise Exception("/image/embeddings route returned None Embeddings.")
if v1_embeddings_request["input"]:
response: EmbeddingResponse = super().embedding( # type: ignore
model,
input,
timeout,
logging_obj,
model_response,
optional_params,
api_key,
api_base,
client=(
client
if client is not None and isinstance(client, OpenAI)
else None
),
aembedding=aembedding,
)
text_embedding_responses = response.data
if text_embedding_responses is None:
raise Exception("/v1/embeddings route returned None Embeddings.")
return self._process_response(
image_embedding_responses=image_embedding_responses,
text_embedding_responses=text_embedding_responses,
image_embeddings_idx=image_embeddings_idx,
model_response=model_response,
input=input,
)
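
A hedged end-to-end usage sketch for the handler above, mirroring `test_azure_ai_embedding_image`; the endpoint, key, and base64 payload are placeholders.

```python
# Sketch: one call mixing text and a base64 image; results come back in input order.
import litellm

response = litellm.embedding(
    model="azure_ai/Cohere-embed-v3-multilingual",
    input=[
        "good morning from litellm",                       # -> /v1/embeddings
        "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg==",  # -> /images/embeddings
    ],
    api_base="https://my-cohere-deployment.eastus2.models.ai.azure.com",  # placeholder
    api_key="my-azure-ai-key",                                            # placeholder
)
print(len(response.data))  # 2 - one embedding per input, recombined in order
```
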

View file

@ -1,239 +0,0 @@
"""
Deprecated. We now do together ai calls via the openai client.
Reference: https://docs.together.ai/docs/openai-api-compatibility
"""
import json
import os
import time
import types
from enum import Enum
from typing import Callable, Optional
import httpx # type: ignore
import requests # type: ignore
import litellm
from litellm.utils import ModelResponse, Usage
from .prompt_templates.factory import custom_prompt, prompt_factory
class TogetherAIError(Exception):
def __init__(self, status_code, message):
self.status_code = status_code
self.message = message
self.request = httpx.Request(
method="POST", url="https://api.together.xyz/inference"
)
self.response = httpx.Response(status_code=status_code, request=self.request)
super().__init__(
self.message
) # Call the base class constructor with the parameters it needs
class TogetherAIConfig:
"""
Reference: https://docs.together.ai/reference/inference
The class `TogetherAIConfig` provides configuration for the TogetherAI's API interface. Here are the parameters:
- `max_tokens` (int32, required): The maximum number of tokens to generate.
- `stop` (string, optional): A string sequence that will truncate (stop) the inference text output. For example, "\n\n" will stop generation as soon as the model generates two newlines.
- `temperature` (float, optional): A decimal number that determines the degree of randomness in the response. A value of 1 will always yield the same output. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value greater than 1 introduces more randomness in the output.
- `top_p` (float, optional): The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text.
- `top_k` (int32, optional): The `top_k` parameter is used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options.
- `repetition_penalty` (float, optional): A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition.
- `logprobs` (int32, optional): This parameter is not described in the prompt.
"""
max_tokens: Optional[int] = None
stop: Optional[str] = None
temperature: Optional[int] = None
top_p: Optional[float] = None
top_k: Optional[int] = None
repetition_penalty: Optional[float] = None
logprobs: Optional[int] = None
def __init__(
self,
max_tokens: Optional[int] = None,
stop: Optional[str] = None,
temperature: Optional[int] = None,
top_p: Optional[float] = None,
top_k: Optional[int] = None,
repetition_penalty: Optional[float] = None,
logprobs: Optional[int] = None,
) -> None:
locals_ = locals()
for key, value in locals_.items():
if key != "self" and value is not None:
setattr(self.__class__, key, value)
@classmethod
def get_config(cls):
return {
k: v
for k, v in cls.__dict__.items()
if not k.startswith("__")
and not isinstance(
v,
(
types.FunctionType,
types.BuiltinFunctionType,
classmethod,
staticmethod,
),
)
and v is not None
}
# def validate_environment(api_key):
# if api_key is None:
# raise ValueError(
# "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params"
# )
# headers = {
# "accept": "application/json",
# "content-type": "application/json",
# "Authorization": "Bearer " + api_key,
# }
# return headers
# def completion(
# model: str,
# messages: list,
# api_base: str,
# model_response: ModelResponse,
# print_verbose: Callable,
# encoding,
# api_key,
# logging_obj,
# custom_prompt_dict={},
# optional_params=None,
# litellm_params=None,
# logger_fn=None,
# ):
# headers = validate_environment(api_key)
# ## Load Config
# config = litellm.TogetherAIConfig.get_config()
# for k, v in config.items():
# if (
# k not in optional_params
# ): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in
# optional_params[k] = v
# print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}")
# if model in custom_prompt_dict:
# # check if the model has a registered custom prompt
# model_prompt_details = custom_prompt_dict[model]
# prompt = custom_prompt(
# role_dict=model_prompt_details.get("roles", {}),
# initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
# final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
# bos_token=model_prompt_details.get("bos_token", ""),
# eos_token=model_prompt_details.get("eos_token", ""),
# messages=messages,
# )
# else:
# prompt = prompt_factory(
# model=model,
# messages=messages,
# api_key=api_key,
# custom_llm_provider="together_ai",
# ) # api key required to query together ai model list
# data = {
# "model": model,
# "prompt": prompt,
# "request_type": "language-model-inference",
# **optional_params,
# }
# ## LOGGING
# logging_obj.pre_call(
# input=prompt,
# api_key=api_key,
# additional_args={
# "complete_input_dict": data,
# "headers": headers,
# "api_base": api_base,
# },
# )
# ## COMPLETION CALL
# if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True:
# response = requests.post(
# api_base,
# headers=headers,
# data=json.dumps(data),
# stream=optional_params["stream_tokens"],
# )
# return response.iter_lines()
# else:
# response = requests.post(api_base, headers=headers, data=json.dumps(data))
# ## LOGGING
# logging_obj.post_call(
# input=prompt,
# api_key=api_key,
# original_response=response.text,
# additional_args={"complete_input_dict": data},
# )
# print_verbose(f"raw model_response: {response.text}")
# ## RESPONSE OBJECT
# if response.status_code != 200:
# raise TogetherAIError(
# status_code=response.status_code, message=response.text
# )
# completion_response = response.json()
# if "error" in completion_response:
# raise TogetherAIError(
# message=json.dumps(completion_response),
# status_code=response.status_code,
# )
# elif "error" in completion_response["output"]:
# raise TogetherAIError(
# message=json.dumps(completion_response["output"]),
# status_code=response.status_code,
# )
# if len(completion_response["output"]["choices"][0]["text"]) >= 0:
# model_response.choices[0].message.content = completion_response["output"][
# "choices"
# ][0]["text"]
# ## CALCULATING USAGE
# print_verbose(
# f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}"
# )
# prompt_tokens = len(encoding.encode(prompt))
# completion_tokens = len(
# encoding.encode(model_response["choices"][0]["message"].get("content", ""))
# )
# if "finish_reason" in completion_response["output"]["choices"][0]:
# model_response.choices[0].finish_reason = completion_response["output"][
# "choices"
# ][0]["finish_reason"]
# model_response["created"] = int(time.time())
# model_response["model"] = "together_ai/" + model
# usage = Usage(
# prompt_tokens=prompt_tokens,
# completion_tokens=completion_tokens,
# total_tokens=prompt_tokens + completion_tokens,
# )
# setattr(model_response, "usage", usage)
# return model_response
# def embedding():
# # logic for parsing in - calling - parsing out model embedding calls
# pass

View file

@ -0,0 +1,13 @@
"""
Support for OpenAI's `/v1/chat/completions` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""
from ..OpenAI.openai import OpenAIConfig
class TogetherAIConfig(OpenAIConfig):
pass
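
A hedged usage sketch for the OpenAI-compatible route above; it assumes `TOGETHERAI_API_KEY` is set in the environment.

```python
# Sketch: Together AI chat goes through the shared OpenAI handler.
import litellm

response = litellm.completion(
    model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response.choices[0].message.content)
```
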

View file

@ -0,0 +1,7 @@
"""
Support for OpenAI's `/v1/completions` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""

View file

@ -0,0 +1,79 @@
"""
Handles calculating cost for together ai models
"""
import re
from litellm.types.utils import CallTypes
# Extract the number of billion parameters from the model name
# only used for together_computer LLMs
def get_model_params_and_category(model_name, call_type: CallTypes) -> str:
"""
Helper function for calculating together ai pricing.
Returns
- str - model pricing category if mapped else received model name
"""
if call_type == CallTypes.embedding or call_type == CallTypes.aembedding:
return get_model_params_and_category_embeddings(model_name=model_name)
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+b)", model_name
) # catch all decimals like 3b, 70b, etc
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("b", "")
if params_match is not None:
params_billion = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_billion <= 4.0:
category = "together-ai-up-to-4b"
elif params_billion <= 8.0:
category = "together-ai-4.1b-8b"
elif params_billion <= 21.0:
category = "together-ai-8.1b-21b"
elif params_billion <= 41.0:
category = "together-ai-21.1b-41b"
elif params_billion <= 80.0:
category = "together-ai-41.1b-80b"
elif params_billion <= 110.0:
category = "together-ai-81.1b-110b"
if category is not None:
return category
return model_name
def get_model_params_and_category_embeddings(model_name) -> str:
"""
Helper function for calculating together ai embedding pricing.
Returns
- str - model pricing category if mapped else received model name
"""
model_name = model_name.lower()
re_params_match = re.search(
r"(\d+m)", model_name
) # catch all decimals like 100m, 200m, etc.
category = None
if re_params_match is not None:
params_match = str(re_params_match.group(1))
params_match = params_match.replace("m", "")
if params_match is not None:
params_million = float(params_match)
else:
return model_name
# Determine the category based on the number of parameters
if params_million <= 150:
category = "together-ai-embedding-up-to-150m"
elif params_million <= 350:
category = "together-ai-embedding-151m-to-350m"
if category is not None:
return category
return model_name
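
A quick sketch of the category mapping above, using the module path shown in the `cost_calculator.py` import earlier in this commit.

```python
from litellm.llms.together_ai.cost_calculator import get_model_params_and_category
from litellm.types.utils import CallTypes

# 72B chat model -> 41.1b-80b pricing bucket
print(get_model_params_and_category("qwen/Qwen2-72B-Instruct", call_type=CallTypes.completion))
# together-ai-41.1b-80b

# 80M embedding model -> up-to-150m embedding bucket
print(get_model_params_and_category(
    "togethercomputer/m2-bert-80M-8k-retrieval", call_type=CallTypes.aembedding
))
# together-ai-embedding-up-to-150m
```
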

View file

@ -0,0 +1,7 @@
"""
Support for OpenAI's `/v1/embeddings` endpoint.
Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
Docs: https://docs.together.ai/reference/completions-1
"""

View file

@ -83,7 +83,7 @@ from .llms import (
from .llms.AI21 import completion as ai21
from .llms.anthropic.chat import AnthropicChatCompletion
from .llms.anthropic.completion import AnthropicTextCompletion
from .llms.azure_ai.chat.handler import AzureAIChatCompletion
from .llms.azure_ai import AzureAIChatCompletion, AzureAIEmbedding
from .llms.azure_text import AzureTextCompletion
from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription
from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params
@ -168,6 +168,7 @@ openai_o1_chat_completions = OpenAIO1ChatCompletion()
openai_audio_transcriptions = OpenAIAudioTranscription()
databricks_chat_completions = DatabricksChatCompletion()
azure_ai_chat_completions = AzureAIChatCompletion()
azure_ai_embedding = AzureAIEmbedding()
anthropic_chat_completions = AnthropicChatCompletion()
anthropic_text_completions = AnthropicTextCompletion()
azure_chat_completions = AzureChatCompletion()
@ -3215,6 +3216,8 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse:
or custom_llm_provider == "cohere"
or custom_llm_provider == "huggingface"
or custom_llm_provider == "bedrock"
or custom_llm_provider == "azure_ai"
or custom_llm_provider == "together_ai"
): # currently implemented aiohttp calls for just azure and openai, soon all.
# Await normally
init_response = await loop.run_in_executor(None, func_with_context)
@ -3385,6 +3388,9 @@ def embedding(
api_base=api_base,
api_key=api_key,
)
if dynamic_api_key is not None:
api_key = dynamic_api_key
optional_params = get_optional_params_embeddings(
model=model,
user=user,
@ -3481,7 +3487,9 @@ def embedding(
aembedding=aembedding,
)
elif (
model in litellm.open_ai_embedding_models or custom_llm_provider == "openai"
model in litellm.open_ai_embedding_models
or custom_llm_provider == "openai"
or custom_llm_provider == "together_ai"
):
api_base = (
api_base
@ -3832,6 +3840,33 @@ def embedding(
model_response=EmbeddingResponse(),
aembedding=aembedding,
)
elif custom_llm_provider == "azure_ai":
api_base = (
api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there
or litellm.api_base
or get_secret("AZURE_AI_API_BASE")
)
# set API KEY
api_key = (
api_key
or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there
or litellm.openai_key
or get_secret("AZURE_AI_API_KEY")
)
## EMBEDDING CALL
response = azure_ai_embedding.embedding(
model=model,
input=input,
api_base=api_base,
api_key=api_key,
logging_obj=logging,
timeout=timeout,
model_response=EmbeddingResponse(),
optional_params=optional_params,
client=client,
aembedding=aembedding,
)
else:
args = locals()
raise ValueError(f"No valid embedding model args passed in - {args}")
@ -4901,7 +4936,11 @@ def speech(
aspeech: Optional[bool] = None,
**kwargs,
) -> HttpxBinaryResponseContent:
user = kwargs.get("user", None)
litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None)
proxy_server_request = kwargs.get("proxy_server_request", None)
model_info = kwargs.get("model_info", None)
metadata = kwargs.get("metadata", {})
model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore
tags = kwargs.pop("tags", [])
@ -4918,6 +4957,21 @@ def speech(
max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES
logging_obj = kwargs.get("litellm_logging_obj", None)
logging_obj.update_environment_variables(
model=model,
user=user,
optional_params={},
litellm_params={
"litellm_call_id": litellm_call_id,
"proxy_server_request": proxy_server_request,
"model_info": model_info,
"metadata": metadata,
"preset_cache_key": None,
"stream_response": {},
**kwargs,
},
custom_llm_provider=custom_llm_provider,
)
response: Optional[HttpxBinaryResponseContent] = None
if custom_llm_provider == "openai":
if voice is None or not (isinstance(voice, str)):
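
A hedged sketch of the `/audio/speech` path whose inputs are now logged (see the Langfuse change and the new `update_environment_variables` call above); `LANGFUSE_*` and `OPENAI_API_KEY` env vars are assumed to be set.

```python
import litellm

litellm.success_callback = ["langfuse"]  # speech input is now logged to Langfuse

audio = litellm.speech(
    model="openai/tts-1",
    voice="alloy",
    input="good morning from litellm",
)
with open("speech.mp3", "wb") as f:
    f.write(audio.content)  # HttpxBinaryResponseContent exposes the raw bytes
```
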

View file

@ -990,6 +990,26 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-english": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-multilingual": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -4953,50 +4973,71 @@
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-embedding-up-to-150m": {
"input_cost_per_token": 0.000000008,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together-ai-embedding-151m-to-350m": {
"input_cost_per_token": 0.000000016,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/togethercomputer/CodeLlama-34b-Instruct": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"ollama/codegemma": {
"max_tokens": 8192,

View file

@ -8,7 +8,7 @@ from litellm._logging import verbose_logger
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.azure_ai.rerank import AzureAIRerank
from litellm.llms.cohere.rerank import CohereRerank
from litellm.llms.togetherai.rerank import TogetherAIRerank
from litellm.llms.together_ai.rerank import TogetherAIRerank
from litellm.secret_managers.main import get_secret
from litellm.types.router import *
from litellm.utils import client, exception_type, supports_httpx_timeout
@ -103,16 +103,14 @@ def rerank(
)
)
model_parameters = [
"top_n",
"rank_fields",
"return_documents",
"max_chunks_per_doc",
]
model_params_dict = {}
for k, v in optional_params.model_fields.items():
if k in model_parameters:
model_params_dict[k] = v
model_params_dict = {
"top_n": top_n,
"rank_fields": rank_fields,
"return_documents": return_documents,
"max_chunks_per_doc": max_chunks_per_doc,
"documents": documents,
}
litellm_logging_obj.update_environment_variables(
model=model,
user=user,

View file

@ -570,6 +570,9 @@ def test_groq_response_cost_tracking(is_streaming):
print(f"response_cost: {response_cost}")
from litellm.types.utils import CallTypes
def test_together_ai_qwen_completion_cost():
input_kwargs = {
"completion_response": litellm.ModelResponse(
@ -612,7 +615,7 @@ def test_together_ai_qwen_completion_cost():
}
response = litellm.cost_calculator.get_model_params_and_category(
model_name="qwen/Qwen2-72B-Instruct"
model_name="qwen/Qwen2-72B-Instruct", call_type=CallTypes.completion
)
assert response == "together-ai-41.1b-80b"
@ -1323,3 +1326,802 @@ def test_completion_cost_vertex_llama3():
cost = completion_cost(model=model, completion_response=response)
assert cost == 0
def test_together_ai_embedding_completion_cost():
from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
response = EmbeddingResponse(
model="togethercomputer/m2-bert-80M-8k-retrieval",
data=[
{
"embedding": [
-0.18039076,
0.11614138,
0.37174946,
0.27238843,
-0.21933095,
-0.15207036,
0.17764972,
-0.08700938,
-0.23863377,
-0.24203257,
0.20441775,
0.04630023,
-0.07832973,
-0.193581,
0.2009999,
-0.30106494,
0.21179546,
-0.23836501,
-0.14919636,
-0.045276586,
0.08645845,
-0.027714893,
-0.009854938,
0.25298217,
-0.1081501,
-0.2383125,
0.23080236,
0.011114239,
0.06954927,
-0.21081704,
0.06937218,
-0.16756944,
-0.2030545,
-0.19809915,
-0.031914014,
-0.15959585,
0.17361341,
0.30239972,
-0.09923253,
0.12680714,
-0.13018028,
0.1302273,
0.19179879,
0.17068875,
0.065124996,
-0.15515316,
0.08250379,
0.07309733,
-0.07283606,
0.21411736,
0.15457751,
-0.08725933,
0.07227311,
0.056812778,
-0.077683985,
0.06833304,
0.0328722,
0.2719641,
-0.06989647,
0.22805125,
0.14953858,
0.0792393,
0.07793462,
0.16176109,
-0.15616545,
-0.25149494,
-0.065352336,
-0.38410214,
-0.27288514,
0.13946335,
-0.21873806,
0.1365704,
0.11738016,
-0.1141173,
0.022973377,
-0.16935326,
0.026940947,
-0.09990286,
-0.05157219,
0.21006724,
0.15897459,
0.011987913,
0.02576497,
-0.11819022,
-0.09184997,
-0.31881434,
-0.17055357,
-0.09523704,
0.008458802,
-0.015483258,
0.038404867,
0.014673892,
-0.041162584,
0.002691519,
0.04601874,
0.059108324,
0.007177156,
0.066804245,
0.038554087,
-0.038720075,
-0.2145991,
-0.15713418,
-0.03712905,
-0.066650696,
0.04227769,
0.018708894,
-0.26332214,
0.0012769096,
-0.13878848,
-0.33141217,
0.118736655,
0.03026654,
0.1017467,
-0.08000539,
0.00092649367,
0.13062756,
-0.03785864,
-0.2038575,
0.07655428,
-0.24818295,
-0.0600955,
0.114760056,
0.027571939,
-0.047068622,
-0.19806816,
0.0774084,
-0.05213658,
-0.042000014,
0.051924672,
-0.14131106,
-0.2309609,
0.20305444,
0.0700591,
0.13863273,
-0.06145084,
-0.039423797,
-0.055951696,
0.04732105,
0.078736484,
0.2566198,
0.054494765,
0.017602794,
-0.107575715,
-0.017887019,
-0.26046592,
-0.077659994,
-0.08430523,
0.18806657,
-0.12292346,
0.06288608,
-0.106739804,
-0.06600645,
-0.14719339,
-0.05070389,
0.23234129,
-0.034023043,
0.056019265,
-0.03627352,
0.11740493,
0.060294818,
-0.21726903,
-0.09775424,
0.27007395,
0.28328258,
0.022495652,
0.13218465,
0.07199022,
-0.15933248,
0.02381037,
-0.08288268,
0.020621575,
0.17395815,
0.06978612,
0.18418784,
-0.12663148,
-0.21287888,
0.21239495,
0.10222956,
0.03952703,
-0.066957936,
-0.035802357,
0.03683884,
0.22524163,
-0.029355489,
-0.11534147,
-0.041979663,
-0.012147716,
-0.07279564,
0.17417553,
0.05546745,
-0.1773277,
-0.26984993,
0.31703642,
0.05958132,
-0.14933203,
-0.084655434,
0.074604444,
-0.077568695,
0.25167143,
-0.17753932,
-0.006415411,
0.068613894,
-0.0031754146,
-0.0039771493,
0.015294107,
0.11839045,
-0.04570732,
0.103238374,
-0.09678329,
-0.21713412,
0.047976546,
-0.14346297,
0.17429878,
-0.31257913,
0.15445377,
-0.10576352,
-0.16792995,
-0.17988597,
-0.14238739,
-0.088244036,
0.2760547,
0.088823885,
-0.08074319,
-0.028918687,
0.107819095,
0.12004892,
0.13343112,
-0.1332874,
-0.0946055,
-0.20433402,
0.17760132,
0.11774745,
0.16756779,
-0.0937686,
0.23887308,
0.27315456,
0.08657822,
0.027402503,
-0.06605757,
0.29859266,
-0.21552202,
0.026192812,
0.1328459,
0.13072926,
0.19236198,
0.01760772,
-0.042355467,
0.08815041,
-0.013158761,
-0.23350924,
-0.043668386,
-0.15479062,
-0.024266671,
0.08113482,
0.14451654,
-0.29152337,
-0.028919466,
0.15022752,
-0.26923147,
0.23846954,
0.03292609,
-0.23572414,
-0.14883325,
-0.12743121,
-0.052229587,
-0.14230779,
0.284658,
0.36885592,
-0.13176951,
-0.16442224,
-0.20283924,
0.048434418,
-0.16231743,
-0.0010730615,
0.1408047,
0.09481033,
0.018139571,
-0.030843062,
0.13304341,
-0.1516288,
-0.051779557,
0.46940327,
-0.07969027,
-0.051570967,
-0.038892798,
0.11187677,
0.1703113,
-0.39926252,
0.06859773,
0.08364686,
0.14696898,
0.026642298,
0.13225247,
0.05730332,
0.35534015,
0.11189959,
0.039673142,
-0.056019083,
0.15707816,
-0.11053284,
0.12823457,
0.20075114,
0.040237684,
-0.19367051,
0.13039409,
-0.26038498,
-0.05770229,
-0.009781617,
0.15812513,
-0.10420735,
-0.020158196,
0.13160926,
-0.20823349,
-0.045596864,
-0.2074525,
0.1546387,
0.30158705,
0.13175933,
0.11967154,
-0.09094463,
0.0019428955,
-0.06745872,
0.02998099,
-0.18385777,
0.014330351,
0.07141392,
-0.17461702,
0.099743806,
-0.016181415,
0.1661396,
0.070834026,
0.110713825,
0.14590909,
0.15404254,
-0.21658006,
0.00715122,
-0.10229453,
-0.09980027,
-0.09406554,
-0.014849227,
-0.26285952,
0.069972225,
0.05732395,
-0.10685719,
0.037572138,
-0.18863359,
-0.00083297276,
-0.16088934,
-0.117982,
-0.16381365,
-0.008932539,
-0.06549256,
-0.08928683,
0.29934987,
0.16532114,
-0.27117223,
-0.12302226,
-0.28685933,
-0.14041144,
-0.0062569617,
-0.20768198,
-0.15385273,
0.20506454,
-0.21685128,
0.1081962,
-0.13133131,
0.18937315,
0.14751591,
0.2786974,
-0.060183275,
0.10365405,
0.109799005,
-0.044105034,
-0.04260162,
0.025758557,
0.07590695,
0.0726137,
-0.09882405,
0.26437432,
0.15884234,
0.115702584,
0.0015900572,
0.11673009,
-0.18648374,
0.3080215,
-0.26407364,
-0.15610488,
0.12658228,
-0.05672454,
0.016239772,
-0.092462406,
-0.36205122,
-0.2925843,
-0.104364775,
-0.2598659,
-0.14073578,
0.10225995,
-0.2612335,
-0.17479639,
0.17488293,
-0.2437756,
0.114384405,
-0.13196659,
-0.067482576,
0.024756929,
0.11779123,
0.2751749,
-0.13306957,
-0.034118645,
-0.14177705,
0.27164033,
0.06266008,
0.11199439,
-0.09814594,
0.13231735,
0.019105865,
-0.2652429,
-0.12924416,
0.0840029,
0.098754935,
0.025883028,
-0.33059177,
-0.10544467,
-0.14131607,
-0.09680401,
-0.047318626,
-0.08157771,
-0.11271855,
0.12637804,
0.11703408,
0.014556337,
0.22788583,
-0.05599293,
0.25811172,
0.22956331,
0.13004553,
0.15419081,
-0.07971162,
0.11692607,
-0.2859737,
0.059627946,
-0.02716421,
0.117603,
-0.061154094,
-0.13555732,
0.17092334,
-0.16639015,
0.2919375,
-0.020189757,
0.18548165,
-0.32514027,
0.19324942,
-0.117969565,
0.23577307,
-0.18052326,
-0.10520473,
-0.2647645,
-0.29393113,
0.052641366,
-0.07733946,
-0.10684275,
-0.15046178,
0.065737076,
-0.0022297644,
-0.010802031,
-0.115943395,
-0.11602136,
0.24265991,
-0.12240144,
0.11817584,
0.026270682,
-0.25762397,
-0.14545679,
0.014168602,
0.106698096,
0.12905516,
-0.12560321,
0.15034604,
0.071529925,
0.123048246,
-0.058863316,
-0.12251829,
0.20463347,
0.06841168,
0.13706751,
0.05893755,
-0.12269708,
0.096701816,
-0.3237337,
-0.2213742,
-0.073655166,
-0.12979327,
0.14173084,
0.19167605,
-0.14523135,
0.06963011,
-0.019228822,
-0.14134938,
0.22017507,
0.007933044,
-0.0065696104,
0.074060634,
-0.13231485,
0.1387053,
-0.14480218,
-0.007837481,
0.29880494,
0.101618655,
0.14514285,
-0.066113696,
-0.041709363,
0.21512671,
-0.090142876,
-0.010337287,
0.13212202,
0.08307805,
0.10144794,
-0.024808172,
0.21877879,
-0.071282186,
-8.786433e-05,
-0.014574037,
-0.11954953,
-0.096931055,
-0.2557228,
0.1090451,
0.15424186,
-0.029206438,
-0.2898023,
0.22510754,
-0.019507697,
0.1566895,
-0.24820097,
-0.012163554,
0.12401036,
0.024711533,
0.24737844,
-0.06311193,
0.0652544,
-0.067403205,
0.15362221,
-0.12093675,
0.096014425,
0.17337392,
-0.017509578,
0.015355054,
0.055885684,
-0.08358914,
-0.018012024,
0.069017515,
0.32854614,
0.0063175815,
-0.09058244,
0.000681382,
-0.10825181,
0.13190223,
0.009358909,
-0.12205342,
0.08268384,
-0.260608,
-0.11042252,
-0.022601532,
-0.080661446,
-0.035559367,
0.14736788,
0.061933476,
-0.07815901,
0.110823035,
-0.00875032,
-0.064237975,
-0.04546554,
-0.05909862,
0.23463917,
-0.20451859,
-0.16576467,
0.10957323,
-0.08632836,
-0.27395645,
0.0002913844,
0.13701706,
-0.058854006,
0.30768716,
-0.037643027,
-0.1365738,
0.095908396,
-0.05029932,
0.14793666,
0.30881998,
-0.018806668,
-0.15902956,
0.07953607,
-0.07259314,
0.17318867,
0.123503335,
-0.11327983,
-0.24497227,
-0.092871994,
0.31053993,
0.09460377,
-0.21152224,
-0.03127119,
-0.018713845,
-0.014523326,
-0.18656968,
0.2255386,
-0.1902719,
0.18821372,
-0.16890709,
-0.04607359,
0.13054903,
-0.05379203,
-0.051014878,
0.054293603,
-0.07299424,
-0.06728367,
-0.052388195,
-0.29960096,
-0.22351485,
-0.06481434,
-0.1619141,
0.24709718,
-0.1203425,
0.029514981,
-0.01951599,
-0.072677284,
-0.25097945,
0.03758907,
0.14380245,
-0.037721623,
-0.19958745,
0.2408246,
-0.13995907,
-0.028115002,
-0.14780775,
0.17445801,
0.11311988,
0.05306163,
0.0018454103,
0.00088805315,
-0.27949628,
-0.23556526,
-0.18175222,
-0.28372183,
-0.43095905,
0.22644317,
0.06072053,
0.02278773,
0.021752749,
0.053462002,
-0.30636713,
0.15607472,
-0.16657323,
-0.07240017,
0.1410017,
-0.026987495,
0.15029654,
0.03340291,
-0.2056912,
0.055395555,
0.11999902,
0.06368412,
-0.025476053,
-0.1702383,
-0.23432998,
0.14855467,
-0.07505147,
-0.030296376,
-0.07001051,
0.10510949,
0.10420236,
0.09809715,
0.17195594,
0.19430229,
-0.16121922,
-0.081139356,
0.15032287,
0.10385191,
-0.18741366,
0.008690719,
-0.12941097,
-0.027797364,
-0.2148853,
0.037788823,
0.16691138,
0.099181786,
-0.0955518,
-0.0074798446,
-0.17511943,
0.14543307,
-0.029364567,
-0.21223477,
-0.05881982,
0.11064195,
-0.2877007,
-0.023934823,
-0.15569815,
0.015789302,
-0.035767324,
-0.15110208,
0.07125638,
0.05703369,
-0.08454703,
-0.07080854,
0.025179204,
-0.10522502,
-0.03670824,
-0.11075579,
0.0681693,
-0.28287485,
0.2769406,
0.026260372,
0.07289979,
0.04669447,
-0.16541554,
0.040775143,
0.035916835,
0.03648039,
0.11299418,
0.14765884,
0.031163761,
0.0011800596,
-0.10715472,
0.02665826,
-0.06237457,
0.15672882,
0.09038829,
0.0061029866,
-0.2592228,
-0.21008603,
0.019810716,
-0.08721265,
0.107840165,
0.28438854,
-0.16649202,
0.19627784,
0.040611178,
0.16516201,
0.24990341,
-0.16222852,
-0.009037945,
0.053751092,
0.1647804,
-0.16184275,
-0.29710436,
0.043035872,
0.04667557,
0.14761224,
-0.09030331,
-0.024515491,
0.10857025,
0.19865094,
-0.07794062,
0.17942934,
0.13322048,
-0.16857187,
0.055713065,
0.18661156,
-0.07864222,
0.23296827,
0.10348465,
-0.11750994,
-0.065938555,
-0.04377608,
0.14903909,
0.019000417,
0.21033548,
0.12162547,
0.1273347,
],
"index": 0,
"object": "embedding",
}
],
object="list",
usage=Usage(
completion_tokens=0,
prompt_tokens=0,
total_tokens=0,
completion_tokens_details=None,
),
)
cost = completion_cost(
completion_response=response,
custom_llm_provider="together_ai",
call_type="embedding",
)

View file

@ -104,14 +104,131 @@ def test_openai_embedding_3():
pytest.fail(f"Error occurred: {e}")
def test_openai_azure_embedding_simple():
@pytest.mark.parametrize(
"model, api_base, api_key",
[
# ("azure/azure-embedding-model", None, None),
("together_ai/togethercomputer/m2-bert-80M-8k-retrieval", None, None),
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_openai_azure_embedding_simple(model, api_base, api_key, sync_mode):
try:
litellm.set_verbose = True
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
# litellm.set_verbose = True
if sync_mode:
response = embedding(
model="azure/azure-embedding-model",
model=model,
input=["good morning from litellm"],
api_base=api_base,
api_key=api_key,
)
else:
response = await litellm.aembedding(
model=model,
input=["good morning from litellm"],
api_base=api_base,
api_key=api_key,
)
# print(await response)
print(response)
print(response._hidden_params)
response_keys = set(dict(response).keys())
response_keys.discard("_response_ms")
assert set(["usage", "model", "object", "data"]) == set(
response_keys
) # assert litellm response has expected keys from OpenAI embedding response
request_cost = litellm.completion_cost(
completion_response=response, call_type="embedding"
)
print("Calculated request cost=", request_cost)
assert isinstance(response.usage, litellm.Usage)
except Exception as e:
pytest.fail(f"Error occurred: {e}")
# test_openai_azure_embedding_simple()
import base64
import requests
litellm.set_verbose = True
url = "https://dummyimage.com/100/100/fff&text=Test+image"
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded_file}"
from openai.types.embedding import Embedding
def _azure_ai_image_mock_response(*args, **kwargs):
new_response = MagicMock()
new_response.headers = {"azureml-model-group": "offer-cohere-embed-multili-paygo"}
new_response.json.return_value = {
"data": [Embedding(embedding=[1234], index=0, object="embedding")],
"model": "",
"object": "list",
"usage": {"prompt_tokens": 1, "total_tokens": 2},
}
return new_response
@pytest.mark.parametrize(
"model, api_base, api_key",
[
(
"azure_ai/Cohere-embed-v3-multilingual-jzu",
"https://Cohere-embed-v3-multilingual-jzu.eastus2.models.ai.azure.com",
os.getenv("AZURE_AI_COHERE_API_KEY_2"),
)
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_azure_ai_embedding_image(model, api_base, api_key, sync_mode):
try:
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
input = base64_image
if sync_mode:
client = HTTPHandler()
else:
client = AsyncHTTPHandler()
with patch.object(
client, "post", side_effect=_azure_ai_image_mock_response
) as mock_client:
if sync_mode:
response = embedding(
model=model,
input=[input],
api_base=api_base,
api_key=api_key,
client=client,
)
else:
response = await litellm.aembedding(
model=model,
input=[input],
api_base=api_base,
api_key=api_key,
client=client,
)
print(response)
assert len(response.data) == 1
print(response._hidden_params)
response_keys = set(dict(response).keys())
response_keys.discard("_response_ms")
assert set(["usage", "model", "object", "data"]) == set(
@ -128,9 +245,6 @@ def test_openai_azure_embedding_simple():
pytest.fail(f"Error occurred: {e}")
# test_openai_azure_embedding_simple()
def test_openai_azure_embedding_timeouts():
try:
response = embedding(
@ -226,13 +340,16 @@ def test_openai_azure_embedding_with_oidc_and_cf():
os.environ["AZURE_API_KEY"] = old_key
from openai.types.embedding import Embedding
def _openai_mock_response(*args, **kwargs):
new_response = MagicMock()
new_response.headers = {"hello": "world"}
new_response.parse.return_value = (
openai.types.create_embedding_response.CreateEmbeddingResponse(
data=[],
data=[Embedding(embedding=[1234, 45667], index=0, object="embedding")],
model="azure/test",
object="list",
usage=openai.types.create_embedding_response.Usage(
@ -267,20 +384,28 @@ def test_openai_azure_embedding_optional_arg():
# test_openai_embedding()
@pytest.mark.parametrize(
"model, api_base",
[
("embed-english-v2.0", None),
],
)
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
async def test_cohere_embedding(sync_mode):
async def test_cohere_embedding(sync_mode, model, api_base):
try:
# litellm.set_verbose=True
data = {
"model": "embed-english-v2.0",
"model": model,
"input": ["good morning from litellm", "this is another item"],
"input_type": "search_query",
"api_base": api_base,
}
if sync_mode:
response = embedding(**data)
else:
response = await litellm.aembedding(**data)
print(f"response:", response)
assert isinstance(response.usage, litellm.Usage)

View file

@ -774,3 +774,21 @@ def test_usage_object_null_tokens():
usage_obj = litellm.Usage(prompt_tokens=2, completion_tokens=None, total_tokens=2)
assert usage_obj.completion_tokens == 0
def test_is_base64_encoded():
import base64
import requests
litellm.set_verbose = True
url = "https://dummyimage.com/100/100/fff&text=Test+image"
response = requests.get(url)
file_data = response.content
encoded_file = base64.b64encode(file_data).decode("utf-8")
base64_image = f"data:image/png;base64,{encoded_file}"
from litellm.utils import is_base64_encoded
assert is_base64_encoded(s=base64_image) is True

View file

@ -0,0 +1,17 @@
from typing import Any, Dict, Iterable, List, Literal, Optional, Union
from typing_extensions import Required, TypedDict
class ImageEmbeddingInput(TypedDict, total=False):
image: Required[str]
text: str
EncodingFormat = Literal["base64", "binary", "float", "int8", "ubinary", "uint8"]
class ImageEmbeddingRequest(TypedDict, total=False):
input: Required[List[ImageEmbeddingInput]]
dimensions: int
encoding_format: EncodingFormat
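
A minimal construction sketch for the new request types; the base64 string and text label are placeholders.

```python
from litellm.types.llms.azure_ai import ImageEmbeddingInput, ImageEmbeddingRequest

request = ImageEmbeddingRequest(
    input=[ImageEmbeddingInput(image="iVBORw0KGgoAAAANSUhEUg==", text="a test image")],
    encoding_format="float",
)
print(request["input"][0]["text"])  # a test image
```
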

View file

@ -9,7 +9,7 @@ from openai.lib.streaming._assistants import (
AsyncAssistantStreamManager,
)
from openai.pagination import AsyncCursorPage, SyncCursorPage
from openai.types import Batch, FileObject
from openai.types import Batch, EmbeddingCreateParams, FileObject
from openai.types.beta.assistant import Assistant
from openai.types.beta.assistant_tool_param import AssistantToolParam
from openai.types.beta.thread_create_params import (

View file

@ -766,7 +766,7 @@ class EmbeddingResponse(OpenAIObject):
"""The actual embedding value"""
object: Literal["list"]
"""The object type, which is always "embedding" """
"""The object type, which is always "list" """
usage: Optional[Usage] = None
"""Usage statistics for the embedding request."""

View file

@ -11118,6 +11118,10 @@ def is_cached_message(message: AllMessageValues) -> bool:
def is_base64_encoded(s: str) -> bool:
try:
# Strip out the prefix if it exists
if s.startswith("data:"):
s = s.split(",")[1]
# Try to decode the string
decoded_bytes = base64.b64decode(s, validate=True)
# Check if the original string can be re-encoded to the same string

View file

@ -990,6 +990,26 @@
"mode": "chat",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-english": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"azure_ai/Cohere-embed-v3-multilingual": {
"max_tokens": 512,
"max_input_tokens": 512,
"output_vector_size": 1024,
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0,
"litellm_provider": "azure_ai",
"mode": "embedding",
"source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
},
"babbage-002": {
"max_tokens": 16384,
"max_input_tokens": 16384,
@ -4964,50 +4984,71 @@
"together-ai-up-to-4b": {
"input_cost_per_token": 0.0000001,
"output_cost_per_token": 0.0000001,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-4.1b-8b": {
"input_cost_per_token": 0.0000002,
"output_cost_per_token": 0.0000002,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-8.1b-21b": {
"max_tokens": 1000,
"input_cost_per_token": 0.0000003,
"output_cost_per_token": 0.0000003,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-21.1b-41b": {
"input_cost_per_token": 0.0000008,
"output_cost_per_token": 0.0000008,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-41.1b-80b": {
"input_cost_per_token": 0.0000009,
"output_cost_per_token": 0.0000009,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-81.1b-110b": {
"input_cost_per_token": 0.0000018,
"output_cost_per_token": 0.0000018,
"litellm_provider": "together_ai"
"litellm_provider": "together_ai",
"mode": "chat"
},
"together-ai-embedding-up-to-150m": {
"input_cost_per_token": 0.000000008,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together-ai-embedding-151m-to-350m": {
"input_cost_per_token": 0.000000016,
"output_cost_per_token": 0.0,
"litellm_provider": "together_ai",
"mode": "embedding"
},
"together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
"input_cost_per_token": 0.0000006,
"output_cost_per_token": 0.0000006,
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"together_ai/togethercomputer/CodeLlama-34b-Instruct": {
"litellm_provider": "together_ai",
"supports_function_calling": true,
"supports_parallel_function_calling": true
"supports_parallel_function_calling": true,
"mode": "chat"
},
"ollama/codegemma": {
"max_tokens": 8192,