From 16c0307eab601b238f21021b9cba04b6957ad34c Mon Sep 17 00:00:00 2001 From: Krish Dholakia Date: Wed, 25 Sep 2024 22:11:57 -0700 Subject: [PATCH] LiteLLM Minor Fixes & Improvements (09/24/2024) (#5880) * LiteLLM Minor Fixes & Improvements (09/23/2024) (#5842) * feat(auth_utils.py): enable admin to allow client-side credentials to be passed Makes it easier for devs to experiment with finetuned fireworks ai models * feat(router.py): allow setting configurable_clientside_auth_params for a model Closes https://github.com/BerriAI/litellm/issues/5843 * build(model_prices_and_context_window.json): fix anthropic claude-3-5-sonnet max output token limit Fixes https://github.com/BerriAI/litellm/issues/5850 * fix(azure_ai/): support content list for azure ai Fixes https://github.com/BerriAI/litellm/issues/4237 * fix(litellm_logging.py): always set saved_cache_cost Set to 0 by default * fix(fireworks_ai/cost_calculator.py): add fireworks ai default pricing handles calling 405b+ size models * fix(slack_alerting.py): fix error alerting for failed spend tracking Fixes regression with slack alerting error monitoring * fix(vertex_and_google_ai_studio_gemini.py): handle gemini no candidates in streaming chunk error * docs(bedrock.md): add llama3-1 models * test: fix tests * fix(azure_ai/chat): fix transformation for azure ai calls * feat(azure_ai/embed): Add azure ai embeddings support Closes https://github.com/BerriAI/litellm/issues/5861 * fix(azure_ai/embed): enable async embedding * feat(azure_ai/embed): support azure ai multimodal embeddings * fix(azure_ai/embed): support async multi modal embeddings * feat(together_ai/embed): support together ai embedding calls * feat(rerank/main.py): log source documents for rerank endpoints to langfuse improves rerank endpoint logging * fix(langfuse.py): support logging `/audio/speech` input to langfuse * test(test_embedding.py): fix test * test(test_completion_cost.py): fix helper util --- litellm/__init__.py | 2 +- litellm/cost_calculator.py | 49 +- litellm/integrations/langfuse.py | 14 +- litellm/llms/OpenAI/openai.py | 14 +- litellm/llms/azure_ai/__init__.py | 3 + litellm/llms/azure_ai/cost_calculator.py | 0 .../azure_ai/embed/cohere_transformation.py | 98 +++ litellm/llms/azure_ai/embed/handler.py | 296 +++++++ litellm/llms/together_ai.py | 239 ------ litellm/llms/together_ai/chat.py | 13 + litellm/llms/together_ai/completion.py | 7 + litellm/llms/together_ai/cost_calculator.py | 79 ++ litellm/llms/together_ai/embed.py | 7 + .../{togetherai => together_ai}/rerank.py | 0 litellm/main.py | 60 +- ...odel_prices_and_context_window_backup.json | 59 +- litellm/rerank_api/main.py | 20 +- litellm/tests/test_completion_cost.py | 804 +++++++++++++++++- litellm/tests/test_embedding.py | 149 +++- litellm/tests/test_utils.py | 18 + litellm/types/llms/azure_ai.py | 17 + litellm/types/llms/openai.py | 2 +- litellm/types/utils.py | 2 +- litellm/utils.py | 4 + model_prices_and_context_window.json | 59 +- 25 files changed, 1675 insertions(+), 340 deletions(-) create mode 100644 litellm/llms/azure_ai/__init__.py create mode 100644 litellm/llms/azure_ai/cost_calculator.py create mode 100644 litellm/llms/azure_ai/embed/cohere_transformation.py create mode 100644 litellm/llms/azure_ai/embed/handler.py delete mode 100644 litellm/llms/together_ai.py create mode 100644 litellm/llms/together_ai/chat.py create mode 100644 litellm/llms/together_ai/completion.py create mode 100644 litellm/llms/together_ai/cost_calculator.py create mode 100644 litellm/llms/together_ai/embed.py rename 
litellm/llms/{togetherai => together_ai}/rerank.py (100%) create mode 100644 litellm/types/llms/azure_ai.py diff --git a/litellm/__init__.py b/litellm/__init__.py index a94ba534b..f95640b58 100644 --- a/litellm/__init__.py +++ b/litellm/__init__.py @@ -901,7 +901,7 @@ from .llms.cohere.completion import CohereConfig from .llms.clarifai import ClarifaiConfig from .llms.AI21.completion import AI21Config from .llms.AI21.chat import AI21ChatConfig -from .llms.together_ai import TogetherAIConfig +from .llms.together_ai.chat import TogetherAIConfig from .llms.cloudflare import CloudflareConfig from .llms.palm import PalmConfig from .llms.gemini import GeminiConfig diff --git a/litellm/cost_calculator.py b/litellm/cost_calculator.py index b55862aaf..d2c5b2cf9 100644 --- a/litellm/cost_calculator.py +++ b/litellm/cost_calculator.py @@ -28,6 +28,7 @@ from litellm.llms.databricks.cost_calculator import ( from litellm.llms.fireworks_ai.cost_calculator import ( cost_per_token as fireworks_ai_cost_per_token, ) +from litellm.llms.together_ai.cost_calculator import get_model_params_and_category from litellm.rerank_api.types import RerankResponse from litellm.types.llms.openai import HttpxBinaryResponseContent from litellm.types.router import SPECIAL_MODEL_INFO_PARAMS @@ -395,48 +396,6 @@ def cost_per_token( ) -# Extract the number of billion parameters from the model name -# only used for together_computer LLMs -def get_model_params_and_category(model_name) -> str: - """ - Helper function for calculating together ai pricing. - - Returns - - str - model pricing category if mapped else received model name - """ - import re - - model_name = model_name.lower() - re_params_match = re.search( - r"(\d+b)", model_name - ) # catch all decimals like 3b, 70b, etc - category = None - if re_params_match is not None: - params_match = str(re_params_match.group(1)) - params_match = params_match.replace("b", "") - if params_match is not None: - params_billion = float(params_match) - else: - return model_name - # Determine the category based on the number of parameters - if params_billion <= 4.0: - category = "together-ai-up-to-4b" - elif params_billion <= 8.0: - category = "together-ai-4.1b-8b" - elif params_billion <= 21.0: - category = "together-ai-8.1b-21b" - elif params_billion <= 41.0: - category = "together-ai-21.1b-41b" - elif params_billion <= 80.0: - category = "together-ai-41.1b-80b" - elif params_billion <= 110.0: - category = "together-ai-81.1b-110b" - if category is not None: - return category - - return model_name - - def get_replicate_completion_pricing(completion_response: dict, total_time=0.0): # see https://replicate.com/pricing # for all litellm currently supported LLMs, almost all requests go to a100_80gb @@ -477,7 +436,7 @@ def _select_model_name_for_cost_calc( if isinstance(completion_response, str): return return_model - elif return_model is None: + elif return_model is None and hasattr(completion_response, "get"): return_model = completion_response.get("model", "") # type: ignore hidden_params = getattr(completion_response, "_hidden_params", None) @@ -716,7 +675,9 @@ def completion_cost( ): # together ai prices based on size of llm # get_model_params_and_category takes a model name and returns the category of LLM size it is in model_prices_and_context_window.json - model = get_model_params_and_category(model) + + model = get_model_params_and_category(model, call_type=CallTypes(call_type)) + # replicate llms are calculate based on time for request running # see https://replicate.com/pricing elif ( 
diff --git a/litellm/integrations/langfuse.py b/litellm/integrations/langfuse.py index 8c6879424..b2a084da3 100644 --- a/litellm/integrations/langfuse.py +++ b/litellm/integrations/langfuse.py @@ -204,6 +204,11 @@ class LangFuseLogger: ): input = prompt output = response_obj["choices"][0]["message"].json() + elif response_obj is not None and isinstance( + response_obj, litellm.HttpxBinaryResponseContent + ): + input = prompt + output = "speech-output" elif response_obj is not None and isinstance( response_obj, litellm.TextCompletionResponse ): @@ -549,7 +554,10 @@ class LangFuseLogger: generation_id = None usage = None if response_obj is not None: - if response_obj.get("id", None) is not None: + if ( + hasattr(response_obj, "id") + and response_obj.get("id", None) is not None + ): generation_id = litellm.utils.get_logging_id( start_time, response_obj ) @@ -571,8 +579,8 @@ class LangFuseLogger: if _user_api_key_alias is not None: generation_name = f"litellm:{_user_api_key_alias}" - if response_obj is not None and "system_fingerprint" in response_obj: - system_fingerprint = response_obj.get("system_fingerprint", None) + if response_obj is not None: + system_fingerprint = getattr(response_obj, "system_fingerprint", None) else: system_fingerprint = None diff --git a/litellm/llms/OpenAI/openai.py b/litellm/llms/OpenAI/openai.py index 87aa095ab..aafb14bd1 100644 --- a/litellm/llms/OpenAI/openai.py +++ b/litellm/llms/OpenAI/openai.py @@ -1215,7 +1215,6 @@ class OpenAIChatCompletion(BaseLLM): client: Optional[AsyncOpenAI] = None, max_retries=None, ): - response = None try: openai_aclient: AsyncOpenAI = self._get_openai_client( # type: ignore is_async=True, @@ -1237,12 +1236,15 @@ class OpenAIChatCompletion(BaseLLM): additional_args={"complete_input_dict": data}, original_response=stringified_response, ) - return convert_to_model_response_object( + returned_response: ( + litellm.EmbeddingResponse + ) = convert_to_model_response_object( response_object=stringified_response, model_response_object=model_response, response_type="embedding", _response_headers=headers, ) # type: ignore + return returned_response except OpenAIError as e: ## LOGGING logging_obj.post_call( @@ -1284,7 +1286,6 @@ class OpenAIChatCompletion(BaseLLM): aembedding=None, ): super().embedding() - exception_mapping_worked = False try: model = model data = {"model": model, "input": input, **optional_params} @@ -1299,7 +1300,7 @@ class OpenAIChatCompletion(BaseLLM): ) if aembedding is True: - response = self.aembedding( + async_response = self.aembedding( data=data, input=input, logging_obj=logging_obj, @@ -1310,7 +1311,7 @@ class OpenAIChatCompletion(BaseLLM): client=client, max_retries=max_retries, ) - return response + return async_response openai_client: OpenAI = self._get_openai_client( # type: ignore is_async=False, @@ -1335,12 +1336,13 @@ class OpenAIChatCompletion(BaseLLM): additional_args={"complete_input_dict": data}, original_response=sync_embedding_response, ) - return convert_to_model_response_object( + response: litellm.EmbeddingResponse = convert_to_model_response_object( response_object=sync_embedding_response.model_dump(), model_response_object=model_response, _response_headers=headers, response_type="embedding", ) # type: ignore + return response except OpenAIError as e: raise e except Exception as e: diff --git a/litellm/llms/azure_ai/__init__.py b/litellm/llms/azure_ai/__init__.py new file mode 100644 index 000000000..c3e4342ec --- /dev/null +++ b/litellm/llms/azure_ai/__init__.py @@ -0,0 +1,3 @@ +from 
.chat.handler import AzureAIChatCompletion +from .embed.handler import AzureAIEmbedding +from .rerank.handler import AzureAIRerank diff --git a/litellm/llms/azure_ai/cost_calculator.py b/litellm/llms/azure_ai/cost_calculator.py new file mode 100644 index 000000000..e69de29bb diff --git a/litellm/llms/azure_ai/embed/cohere_transformation.py b/litellm/llms/azure_ai/embed/cohere_transformation.py new file mode 100644 index 000000000..68e0b538a --- /dev/null +++ b/litellm/llms/azure_ai/embed/cohere_transformation.py @@ -0,0 +1,98 @@ +""" +Transformation logic from the OpenAI /v1/embeddings format to Azure AI Cohere's /v1/embed. + +Why a separate file? To make it easy to see how the transformation works. + +Covers: +- Cohere request format + +Docs - https://docs.aws.amazon.com/bedrock/latest/userguide/model-parameters-titan-embed-text.html +""" + +from typing import List, Optional, Tuple, Union + +from litellm.types.llms.azure_ai import ImageEmbeddingInput, ImageEmbeddingRequest +from litellm.types.llms.openai import EmbeddingCreateParams +from litellm.types.utils import Embedding, EmbeddingResponse, Usage +from litellm.utils import is_base64_encoded + + +class AzureAICohereConfig: + def __init__(self) -> None: + pass + + def _map_azure_model_group(self, model: str) -> str: + if "offer-cohere-embed-multili-paygo" in model: + return "Cohere-embed-v3-multilingual" + elif "offer-cohere-embed-english-paygo" in model: + return "Cohere-embed-v3-english" + + return model + + def _transform_request_image_embeddings( + self, input: List[str], optional_params: dict + ) -> ImageEmbeddingRequest: + """ + Assumes every str in the list is a base64-encoded string + """ + image_input: List[ImageEmbeddingInput] = [] + for i in input: + embedding_input = ImageEmbeddingInput(image=i) + image_input.append(embedding_input) + return ImageEmbeddingRequest(input=image_input, **optional_params) + + def _transform_request( + self, input: List[str], optional_params: dict, model: str + ) -> Tuple[ImageEmbeddingRequest, EmbeddingCreateParams, List[int]]: + """ + Returns the `/images/embeddings` request, the `/v1/embeddings` request, and the indices of image inputs for recombination + """ + image_embeddings: List[str] = [] + image_embedding_idx: List[int] = [] + for idx, i in enumerate(input): + """ + - is base64 -> route to image embeddings + - is ImageEmbeddingInput -> route to image embeddings + - else -> route to `/v1/embeddings` + """ + if is_base64_encoded(i): + image_embeddings.append(i) + image_embedding_idx.append(idx) + + ## REMOVE IMAGE EMBEDDINGS FROM input list + filtered_input = [ + item for idx, item in enumerate(input) if idx not in image_embedding_idx + ] + + v1_embeddings_request = EmbeddingCreateParams( + input=filtered_input, model=model, **optional_params + ) + image_embeddings_request = self._transform_request_image_embeddings( + input=image_embeddings, optional_params=optional_params + ) + + return image_embeddings_request, v1_embeddings_request, image_embedding_idx + + def _transform_response(self, response: EmbeddingResponse) -> EmbeddingResponse: + additional_headers: Optional[dict] = response._hidden_params.get( + "additional_headers" + ) + if additional_headers: + # CALCULATE USAGE + input_tokens: Optional[str] = additional_headers.get( + "llm_provider-num_tokens" + ) + if input_tokens: + if response.usage: + response.usage.prompt_tokens = int(input_tokens) + else: + response.usage = Usage(prompt_tokens=int(input_tokens)) + + # SET MODEL + base_model: Optional[str] = additional_headers.get( + "llm_provider-azureml-model-group" + ) +
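+        # Azure AI reports the deployed model via the 'azureml-model-group' response + # header (e.g. "offer-cohere-embed-multili-paygo"); mapping it back to the + # canonical Cohere model name lets cost tracking resolve the matching + # azure_ai/* entry in model_prices_and_context_window.json. +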
if base_model: + response.model = self._map_azure_model_group(base_model) + + return response diff --git a/litellm/llms/azure_ai/embed/handler.py b/litellm/llms/azure_ai/embed/handler.py new file mode 100644 index 000000000..2428119b7 --- /dev/null +++ b/litellm/llms/azure_ai/embed/handler.py @@ -0,0 +1,296 @@ +import asyncio +import copy +import json +import os +from copy import deepcopy +from typing import Any, Callable, List, Literal, Optional, Tuple, Union + +import httpx +from openai import OpenAI + +import litellm +from litellm.llms.cohere.embed import embedding as cohere_embedding +from litellm.llms.custom_httpx.http_handler import ( + AsyncHTTPHandler, + HTTPHandler, + _get_httpx_client, + get_async_httpx_client, +) +from litellm.llms.OpenAI.openai import OpenAIChatCompletion +from litellm.types.llms.azure_ai import ImageEmbeddingRequest +from litellm.types.utils import Embedding, EmbeddingResponse +from litellm.utils import convert_to_model_response_object, is_base64_encoded + +from .cohere_transformation import AzureAICohereConfig + + +class AzureAIEmbedding(OpenAIChatCompletion): + + def _process_response( + self, + image_embedding_responses: Optional[List], + text_embedding_responses: Optional[List], + image_embeddings_idx: List[int], + model_response: EmbeddingResponse, + input: List, + ): + combined_responses = [] + if ( + image_embedding_responses is not None + and text_embedding_responses is not None + ): + # Combine and order the results + text_idx = 0 + image_idx = 0 + + for idx in range(len(input)): + if idx in image_embeddings_idx: + combined_responses.append(image_embedding_responses[image_idx]) + image_idx += 1 + else: + combined_responses.append(text_embedding_responses[text_idx]) + text_idx += 1 + + model_response.data = combined_responses + elif image_embedding_responses is not None: + model_response.data = image_embedding_responses + elif text_embedding_responses is not None: + model_response.data = text_embedding_responses + + response = AzureAICohereConfig()._transform_response(response=model_response) # type: ignore + + return response + + async def async_image_embedding( + self, + model: str, + data: ImageEmbeddingRequest, + timeout: float, + logging_obj, + model_response: litellm.EmbeddingResponse, + optional_params: dict, + api_key: Optional[str], + api_base: Optional[str], + client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, + ) -> EmbeddingResponse: + if client is None or not isinstance(client, AsyncHTTPHandler): + client = AsyncHTTPHandler(timeout=timeout, concurrent_limit=1) + + url = "{}/images/embeddings".format(api_base) + + response = await client.post( + url=url, + json=data, # type: ignore + headers={"Authorization": "Bearer {}".format(api_key)}, + ) + + embedding_response = response.json() + embedding_headers = dict(response.headers) + returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore + response_object=embedding_response, + model_response_object=model_response, + response_type="embedding", + stream=False, + _response_headers=embedding_headers, + ) + return returned_response + + def image_embedding( + self, + model: str, + data: ImageEmbeddingRequest, + timeout: float, + logging_obj, + model_response: litellm.EmbeddingResponse, + optional_params: dict, + api_key: Optional[str], + api_base: Optional[str], + client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None, + ): + if api_base is None: + raise ValueError( + "api_base is None. 
Please set AZURE_AI_API_BASE or dynamically via `api_base` param, to make the request." + ) + if api_key is None: + raise ValueError( + "api_key is None. Please set AZURE_AI_API_KEY or dynamically via `api_key` param, to make the request." + ) + + if client is None or not isinstance(client, HTTPHandler): + client = HTTPHandler(timeout=timeout, concurrent_limit=1) + + url = "{}/images/embeddings".format(api_base) + + response = client.post( + url=url, + json=data, # type: ignore + headers={"Authorization": "Bearer {}".format(api_key)}, + ) + + embedding_response = response.json() + embedding_headers = dict(response.headers) + returned_response: litellm.EmbeddingResponse = convert_to_model_response_object( # type: ignore + response_object=embedding_response, + model_response_object=model_response, + response_type="embedding", + stream=False, + _response_headers=embedding_headers, + ) + return returned_response + + async def async_embedding( + self, + model: str, + input: List, + timeout: float, + logging_obj, + model_response: litellm.EmbeddingResponse, + optional_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + client=None, + ) -> EmbeddingResponse: + + ( + image_embeddings_request, + v1_embeddings_request, + image_embeddings_idx, + ) = AzureAICohereConfig()._transform_request( + input=input, optional_params=optional_params, model=model + ) + + image_embedding_responses: Optional[List] = None + text_embedding_responses: Optional[List] = None + + if image_embeddings_request["input"]: + image_response = await self.async_image_embedding( + model=model, + data=image_embeddings_request, + timeout=timeout, + logging_obj=logging_obj, + model_response=model_response, + optional_params=optional_params, + api_key=api_key, + api_base=api_base, + client=client, + ) + + image_embedding_responses = image_response.data + if image_embedding_responses is None: + raise Exception("/image/embeddings route returned None Embeddings.") + + if v1_embeddings_request["input"]: + response: EmbeddingResponse = await super().embedding( # type: ignore + model=model, + input=input, + timeout=timeout, + logging_obj=logging_obj, + model_response=model_response, + optional_params=optional_params, + api_key=api_key, + api_base=api_base, + client=client, + aembedding=True, + ) + text_embedding_responses = response.data + if text_embedding_responses is None: + raise Exception("/v1/embeddings route returned None Embeddings.") + + return self._process_response( + image_embedding_responses=image_embedding_responses, + text_embedding_responses=text_embedding_responses, + image_embeddings_idx=image_embeddings_idx, + model_response=model_response, + input=input, + ) + + def embedding( + self, + model: str, + input: List, + timeout: float, + logging_obj, + model_response: litellm.EmbeddingResponse, + optional_params: dict, + api_key: Optional[str] = None, + api_base: Optional[str] = None, + client=None, + aembedding=None, + ): + """ + - Separate image url from text + -> route image url call to `/image/embeddings` + -> route text call to `/v1/embeddings` (OpenAI route) + + assemble result in-order, and return + """ + if aembedding is True: + return self.async_embedding( + model, + input, + timeout, + logging_obj, + model_response, + optional_params, + api_key, + api_base, + client, + ) + + ( + image_embeddings_request, + v1_embeddings_request, + image_embeddings_idx, + ) = AzureAICohereConfig()._transform_request( + input=input, optional_params=optional_params, model=model + ) + + 
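+        # _transform_request splits the input: base64-encoded data URLs are routed + # to `/images/embeddings` and everything else to `/v1/embeddings`; + # image_embeddings_idx records the original positions so _process_response + # can re-interleave both result sets in input order. +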
image_embedding_responses: Optional[List] = None + text_embedding_responses: Optional[List] = None + + if image_embeddings_request["input"]: + image_response = self.image_embedding( + model=model, + data=image_embeddings_request, + timeout=timeout, + logging_obj=logging_obj, + model_response=model_response, + optional_params=optional_params, + api_key=api_key, + api_base=api_base, + client=client, + ) + + image_embedding_responses = image_response.data + if image_embedding_responses is None: + raise Exception("/image/embeddings route returned None Embeddings.") + + if v1_embeddings_request["input"]: + response: EmbeddingResponse = super().embedding( # type: ignore + model, + input, + timeout, + logging_obj, + model_response, + optional_params, + api_key, + api_base, + client=( + client + if client is not None and isinstance(client, OpenAI) + else None + ), + aembedding=aembedding, + ) + + text_embedding_responses = response.data + if text_embedding_responses is None: + raise Exception("/v1/embeddings route returned None Embeddings.") + + return self._process_response( + image_embedding_responses=image_embedding_responses, + text_embedding_responses=text_embedding_responses, + image_embeddings_idx=image_embeddings_idx, + model_response=model_response, + input=input, + ) diff --git a/litellm/llms/together_ai.py b/litellm/llms/together_ai.py deleted file mode 100644 index 3adbcae37..000000000 --- a/litellm/llms/together_ai.py +++ /dev/null @@ -1,239 +0,0 @@ -""" -Deprecated. We now do together ai calls via the openai client. -Reference: https://docs.together.ai/docs/openai-api-compatibility -""" - -import json -import os -import time -import types -from enum import Enum -from typing import Callable, Optional - -import httpx # type: ignore -import requests # type: ignore - -import litellm -from litellm.utils import ModelResponse, Usage - -from .prompt_templates.factory import custom_prompt, prompt_factory - - -class TogetherAIError(Exception): - def __init__(self, status_code, message): - self.status_code = status_code - self.message = message - self.request = httpx.Request( - method="POST", url="https://api.together.xyz/inference" - ) - self.response = httpx.Response(status_code=status_code, request=self.request) - super().__init__( - self.message - ) # Call the base class constructor with the parameters it needs - - -class TogetherAIConfig: - """ - Reference: https://docs.together.ai/reference/inference - - The class `TogetherAIConfig` provides configuration for the TogetherAI's API interface. Here are the parameters: - - - `max_tokens` (int32, required): The maximum number of tokens to generate. - - - `stop` (string, optional): A string sequence that will truncate (stop) the inference text output. For example, "\n\n" will stop generation as soon as the model generates two newlines. - - - `temperature` (float, optional): A decimal number that determines the degree of randomness in the response. A value of 1 will always yield the same output. A temperature less than 1 favors more correctness and is appropriate for question answering or summarization. A value greater than 1 introduces more randomness in the output. - - - `top_p` (float, optional): The `top_p` (nucleus) parameter is used to dynamically adjust the number of choices for each predicted token based on the cumulative probabilities. It specifies a probability threshold, below which all less likely tokens are filtered out. This technique helps to maintain diversity and generate more fluent and natural-sounding text. 
- - - `top_k` (int32, optional): The `top_k` parameter is used to limit the number of choices for the next predicted word or token. It specifies the maximum number of tokens to consider at each step, based on their probability of occurrence. This technique helps to speed up the generation process and can improve the quality of the generated text by focusing on the most likely options. - - - `repetition_penalty` (float, optional): A number that controls the diversity of generated text by reducing the likelihood of repeated sequences. Higher values decrease repetition. - - - `logprobs` (int32, optional): This parameter is not described in the prompt. - """ - - max_tokens: Optional[int] = None - stop: Optional[str] = None - temperature: Optional[int] = None - top_p: Optional[float] = None - top_k: Optional[int] = None - repetition_penalty: Optional[float] = None - logprobs: Optional[int] = None - - def __init__( - self, - max_tokens: Optional[int] = None, - stop: Optional[str] = None, - temperature: Optional[int] = None, - top_p: Optional[float] = None, - top_k: Optional[int] = None, - repetition_penalty: Optional[float] = None, - logprobs: Optional[int] = None, - ) -> None: - locals_ = locals() - for key, value in locals_.items(): - if key != "self" and value is not None: - setattr(self.__class__, key, value) - - @classmethod - def get_config(cls): - return { - k: v - for k, v in cls.__dict__.items() - if not k.startswith("__") - and not isinstance( - v, - ( - types.FunctionType, - types.BuiltinFunctionType, - classmethod, - staticmethod, - ), - ) - and v is not None - } - - -# def validate_environment(api_key): -# if api_key is None: -# raise ValueError( -# "Missing TogetherAI API Key - A call is being made to together_ai but no key is set either in the environment variables or via params" -# ) -# headers = { -# "accept": "application/json", -# "content-type": "application/json", -# "Authorization": "Bearer " + api_key, -# } -# return headers - - -# def completion( -# model: str, -# messages: list, -# api_base: str, -# model_response: ModelResponse, -# print_verbose: Callable, -# encoding, -# api_key, -# logging_obj, -# custom_prompt_dict={}, -# optional_params=None, -# litellm_params=None, -# logger_fn=None, -# ): -# headers = validate_environment(api_key) - -# ## Load Config -# config = litellm.TogetherAIConfig.get_config() -# for k, v in config.items(): -# if ( -# k not in optional_params -# ): # completion(top_k=3) > togetherai_config(top_k=3) <- allows for dynamic variables to be passed in -# optional_params[k] = v - -# print_verbose(f"CUSTOM PROMPT DICT: {custom_prompt_dict}; model: {model}") -# if model in custom_prompt_dict: -# # check if the model has a registered custom prompt -# model_prompt_details = custom_prompt_dict[model] -# prompt = custom_prompt( -# role_dict=model_prompt_details.get("roles", {}), -# initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""), -# final_prompt_value=model_prompt_details.get("final_prompt_value", ""), -# bos_token=model_prompt_details.get("bos_token", ""), -# eos_token=model_prompt_details.get("eos_token", ""), -# messages=messages, -# ) -# else: -# prompt = prompt_factory( -# model=model, -# messages=messages, -# api_key=api_key, -# custom_llm_provider="together_ai", -# ) # api key required to query together ai model list - -# data = { -# "model": model, -# "prompt": prompt, -# "request_type": "language-model-inference", -# **optional_params, -# } - -# ## LOGGING -# logging_obj.pre_call( -# input=prompt, -# api_key=api_key, -# 
additional_args={ -# "complete_input_dict": data, -# "headers": headers, -# "api_base": api_base, -# }, -# ) -# ## COMPLETION CALL -# if "stream_tokens" in optional_params and optional_params["stream_tokens"] == True: -# response = requests.post( -# api_base, -# headers=headers, -# data=json.dumps(data), -# stream=optional_params["stream_tokens"], -# ) -# return response.iter_lines() -# else: -# response = requests.post(api_base, headers=headers, data=json.dumps(data)) -# ## LOGGING -# logging_obj.post_call( -# input=prompt, -# api_key=api_key, -# original_response=response.text, -# additional_args={"complete_input_dict": data}, -# ) -# print_verbose(f"raw model_response: {response.text}") -# ## RESPONSE OBJECT -# if response.status_code != 200: -# raise TogetherAIError( -# status_code=response.status_code, message=response.text -# ) -# completion_response = response.json() - -# if "error" in completion_response: -# raise TogetherAIError( -# message=json.dumps(completion_response), -# status_code=response.status_code, -# ) -# elif "error" in completion_response["output"]: -# raise TogetherAIError( -# message=json.dumps(completion_response["output"]), -# status_code=response.status_code, -# ) - -# if len(completion_response["output"]["choices"][0]["text"]) >= 0: -# model_response.choices[0].message.content = completion_response["output"][ -# "choices" -# ][0]["text"] - -# ## CALCULATING USAGE -# print_verbose( -# f"CALCULATING TOGETHERAI TOKEN USAGE. Model Response: {model_response}; model_response['choices'][0]['message'].get('content', ''): {model_response['choices'][0]['message'].get('content', None)}" -# ) -# prompt_tokens = len(encoding.encode(prompt)) -# completion_tokens = len( -# encoding.encode(model_response["choices"][0]["message"].get("content", "")) -# ) -# if "finish_reason" in completion_response["output"]["choices"][0]: -# model_response.choices[0].finish_reason = completion_response["output"][ -# "choices" -# ][0]["finish_reason"] -# model_response["created"] = int(time.time()) -# model_response["model"] = "together_ai/" + model -# usage = Usage( -# prompt_tokens=prompt_tokens, -# completion_tokens=completion_tokens, -# total_tokens=prompt_tokens + completion_tokens, -# ) -# setattr(model_response, "usage", usage) -# return model_response - - -# def embedding(): -# # logic for parsing in - calling - parsing out model embedding calls -# pass diff --git a/litellm/llms/together_ai/chat.py b/litellm/llms/together_ai/chat.py new file mode 100644 index 000000000..398bc489c --- /dev/null +++ b/litellm/llms/together_ai/chat.py @@ -0,0 +1,13 @@ +""" +Support for OpenAI's `/v1/chat/completions` endpoint. + +Calls done in OpenAI/openai.py as TogetherAI is openai-compatible. + +Docs: https://docs.together.ai/reference/completions-1 +""" + +from ..OpenAI.openai import OpenAIConfig + + +class TogetherAIConfig(OpenAIConfig): + pass diff --git a/litellm/llms/together_ai/completion.py b/litellm/llms/together_ai/completion.py new file mode 100644 index 000000000..525c0411a --- /dev/null +++ b/litellm/llms/together_ai/completion.py @@ -0,0 +1,7 @@ +""" +Support for OpenAI's `/v1/completions` endpoint. + +Calls done in OpenAI/openai.py as TogetherAI is openai-compatible. 
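+ +Example (a sketch; assumes Together AI credentials are configured): + litellm.text_completion(model="together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1", prompt="Say hello")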
+ +Docs: https://docs.together.ai/reference/completions-1 +""" diff --git a/litellm/llms/together_ai/cost_calculator.py b/litellm/llms/together_ai/cost_calculator.py new file mode 100644 index 000000000..d3b0db8b8 --- /dev/null +++ b/litellm/llms/together_ai/cost_calculator.py @@ -0,0 +1,79 @@ +""" +Handles cost calculation for Together AI models +""" + +import re + +from litellm.types.utils import CallTypes + + +# Extract the number of billion parameters from the model name +# only used for together_computer LLMs +def get_model_params_and_category(model_name: str, call_type: CallTypes) -> str: + """ + Helper function for calculating together ai pricing. + + Returns + - str - model pricing category if mapped (e.g. "qwen/Qwen2-72B-Instruct" -> "together-ai-41.1b-80b"), else the received model name + """ + if call_type == CallTypes.embedding or call_type == CallTypes.aembedding: + return get_model_params_and_category_embeddings(model_name=model_name) + model_name = model_name.lower() + re_params_match = re.search( + r"(\d+b)", model_name + ) # catch parameter counts like 3b, 70b, etc. + category = None + if re_params_match is not None: + params_match = str(re_params_match.group(1)) + params_match = params_match.replace("b", "") + if params_match is not None: + params_billion = float(params_match) + else: + return model_name + # Determine the category based on the number of parameters + if params_billion <= 4.0: + category = "together-ai-up-to-4b" + elif params_billion <= 8.0: + category = "together-ai-4.1b-8b" + elif params_billion <= 21.0: + category = "together-ai-8.1b-21b" + elif params_billion <= 41.0: + category = "together-ai-21.1b-41b" + elif params_billion <= 80.0: + category = "together-ai-41.1b-80b" + elif params_billion <= 110.0: + category = "together-ai-81.1b-110b" + if category is not None: + return category + + return model_name + + +def get_model_params_and_category_embeddings(model_name: str) -> str: + """ + Helper function for calculating together ai embedding pricing. + + Returns + - str - model pricing category if mapped (e.g. "togethercomputer/m2-bert-80M-8k-retrieval" -> "together-ai-embedding-up-to-150m"), else the received model name + """ + model_name = model_name.lower() + re_params_match = re.search( + r"(\d+m)", model_name + ) # catch parameter counts like 100m, 200m, etc. + category = None + if re_params_match is not None: + params_match = str(re_params_match.group(1)) + params_match = params_match.replace("m", "") + if params_match is not None: + params_million = float(params_match) + else: + return model_name + # Determine the category based on the number of parameters + if params_million <= 150: + category = "together-ai-embedding-up-to-150m" + elif params_million <= 350: + category = "together-ai-embedding-151m-to-350m" + if category is not None: + return category + + return model_name diff --git a/litellm/llms/together_ai/embed.py b/litellm/llms/together_ai/embed.py new file mode 100644 index 000000000..577df0256 --- /dev/null +++ b/litellm/llms/together_ai/embed.py @@ -0,0 +1,7 @@ +""" +Support for OpenAI's `/v1/embeddings` endpoint. + +Calls done in OpenAI/openai.py as TogetherAI is openai-compatible.
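+ +Example (a sketch, mirroring the call exercised in tests): + litellm.embedding(model="together_ai/togethercomputer/m2-bert-80M-8k-retrieval", input=["good morning from litellm"])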
+ +Docs: https://docs.together.ai/reference/completions-1 +""" diff --git a/litellm/llms/togetherai/rerank.py b/litellm/llms/together_ai/rerank.py similarity index 100% rename from litellm/llms/togetherai/rerank.py rename to litellm/llms/together_ai/rerank.py diff --git a/litellm/main.py b/litellm/main.py index 7bb01f937..6ed5534fb 100644 --- a/litellm/main.py +++ b/litellm/main.py @@ -83,7 +83,7 @@ from .llms import ( from .llms.AI21 import completion as ai21 from .llms.anthropic.chat import AnthropicChatCompletion from .llms.anthropic.completion import AnthropicTextCompletion -from .llms.azure_ai.chat.handler import AzureAIChatCompletion +from .llms.azure_ai import AzureAIChatCompletion, AzureAIEmbedding from .llms.azure_text import AzureTextCompletion from .llms.AzureOpenAI.audio_transcriptions import AzureAudioTranscription from .llms.AzureOpenAI.azure import AzureChatCompletion, _check_dynamic_azure_params @@ -168,6 +168,7 @@ openai_o1_chat_completions = OpenAIO1ChatCompletion() openai_audio_transcriptions = OpenAIAudioTranscription() databricks_chat_completions = DatabricksChatCompletion() azure_ai_chat_completions = AzureAIChatCompletion() +azure_ai_embedding = AzureAIEmbedding() anthropic_chat_completions = AnthropicChatCompletion() anthropic_text_completions = AnthropicTextCompletion() azure_chat_completions = AzureChatCompletion() @@ -3215,6 +3216,8 @@ async def aembedding(*args, **kwargs) -> EmbeddingResponse: or custom_llm_provider == "cohere" or custom_llm_provider == "huggingface" or custom_llm_provider == "bedrock" + or custom_llm_provider == "azure_ai" + or custom_llm_provider == "together_ai" ): # currently implemented aiohttp calls for just azure and openai, soon all. # Await normally init_response = await loop.run_in_executor(None, func_with_context) @@ -3385,6 +3388,9 @@ def embedding( api_base=api_base, api_key=api_key, ) + if dynamic_api_key is not None: + api_key = dynamic_api_key + optional_params = get_optional_params_embeddings( model=model, user=user, @@ -3481,7 +3487,9 @@ def embedding( aembedding=aembedding, ) elif ( - model in litellm.open_ai_embedding_models or custom_llm_provider == "openai" + model in litellm.open_ai_embedding_models + or custom_llm_provider == "openai" + or custom_llm_provider == "together_ai" ): api_base = ( api_base @@ -3832,6 +3840,33 @@ def embedding( model_response=EmbeddingResponse(), aembedding=aembedding, ) + elif custom_llm_provider == "azure_ai": + api_base = ( + api_base # for deepinfra/perplexity/anyscale/groq/friendliai we check in get_llm_provider and pass in the api base from there + or litellm.api_base + or get_secret("AZURE_AI_API_BASE") + ) + # set API KEY + api_key = ( + api_key + or litellm.api_key # for deepinfra/perplexity/anyscale/friendliai we check in get_llm_provider and pass in the api key from there + or litellm.openai_key + or get_secret("AZURE_AI_API_KEY") + ) + + ## EMBEDDING CALL + response = azure_ai_embedding.embedding( + model=model, + input=input, + api_base=api_base, + api_key=api_key, + logging_obj=logging, + timeout=timeout, + model_response=EmbeddingResponse(), + optional_params=optional_params, + client=client, + aembedding=aembedding, + ) else: args = locals() raise ValueError(f"No valid embedding model args passed in - {args}") @@ -4901,7 +4936,11 @@ def speech( aspeech: Optional[bool] = None, **kwargs, ) -> HttpxBinaryResponseContent: - + user = kwargs.get("user", None) + litellm_call_id: Optional[str] = kwargs.get("litellm_call_id", None) + proxy_server_request = 
kwargs.get("proxy_server_request", None) + model_info = kwargs.get("model_info", None) + metadata = kwargs.get("metadata", {}) model, custom_llm_provider, dynamic_api_key, api_base = get_llm_provider(model=model, custom_llm_provider=custom_llm_provider, api_base=api_base) # type: ignore tags = kwargs.pop("tags", []) @@ -4918,6 +4957,21 @@ def speech( max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES logging_obj = kwargs.get("litellm_logging_obj", None) + logging_obj.update_environment_variables( + model=model, + user=user, + optional_params={}, + litellm_params={ + "litellm_call_id": litellm_call_id, + "proxy_server_request": proxy_server_request, + "model_info": model_info, + "metadata": metadata, + "preset_cache_key": None, + "stream_response": {}, + **kwargs, + }, + custom_llm_provider=custom_llm_provider, + ) response: Optional[HttpxBinaryResponseContent] = None if custom_llm_provider == "openai": if voice is None or not (isinstance(voice, str)): diff --git a/litellm/model_prices_and_context_window_backup.json b/litellm/model_prices_and_context_window_backup.json index 0cd996289..6804d677e 100644 --- a/litellm/model_prices_and_context_window_backup.json +++ b/litellm/model_prices_and_context_window_backup.json @@ -990,6 +990,26 @@ "mode": "chat", "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice" }, + "azure_ai/Cohere-embed-v3-english": { + "max_tokens": 512, + "max_input_tokens": 512, + "output_vector_size": 1024, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0, + "litellm_provider": "azure_ai", + "mode": "embedding", + "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice" + }, + "azure_ai/Cohere-embed-v3-multilingual": { + "max_tokens": 512, + "max_input_tokens": 512, + "output_vector_size": 1024, + "input_cost_per_token": 0.0000001, + "output_cost_per_token": 0.0, + "litellm_provider": "azure_ai", + "mode": "embedding", + "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice" + }, "babbage-002": { "max_tokens": 16384, "max_input_tokens": 16384, @@ -4953,50 +4973,71 @@ "together-ai-up-to-4b": { "input_cost_per_token": 0.0000001, "output_cost_per_token": 0.0000001, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" }, "together-ai-4.1b-8b": { "input_cost_per_token": 0.0000002, "output_cost_per_token": 0.0000002, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" }, "together-ai-8.1b-21b": { "max_tokens": 1000, "input_cost_per_token": 0.0000003, "output_cost_per_token": 0.0000003, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" }, "together-ai-21.1b-41b": { "input_cost_per_token": 0.0000008, "output_cost_per_token": 0.0000008, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" }, "together-ai-41.1b-80b": { "input_cost_per_token": 0.0000009, "output_cost_per_token": 0.0000009, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" }, "together-ai-81.1b-110b": { "input_cost_per_token": 0.0000018, "output_cost_per_token": 0.0000018, - "litellm_provider": "together_ai" + "litellm_provider": "together_ai", + "mode": "chat" + }, + "together-ai-embedding-up-to-150m": { + "input_cost_per_token": 0.000000008, + "output_cost_per_token": 0.0, + 
"litellm_provider": "together_ai", + "mode": "embedding" + }, + "together-ai-embedding-151m-to-350m": { + "input_cost_per_token": 0.000000016, + "output_cost_per_token": 0.0, + "litellm_provider": "together_ai", + "mode": "embedding" }, "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": { "input_cost_per_token": 0.0000006, "output_cost_per_token": 0.0000006, "litellm_provider": "together_ai", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "mode": "chat" }, "together_ai/mistralai/Mistral-7B-Instruct-v0.1": { "litellm_provider": "together_ai", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "mode": "chat" }, "together_ai/togethercomputer/CodeLlama-34b-Instruct": { "litellm_provider": "together_ai", "supports_function_calling": true, - "supports_parallel_function_calling": true + "supports_parallel_function_calling": true, + "mode": "chat" }, "ollama/codegemma": { "max_tokens": 8192, diff --git a/litellm/rerank_api/main.py b/litellm/rerank_api/main.py index 1498e8b76..db2217b1e 100644 --- a/litellm/rerank_api/main.py +++ b/litellm/rerank_api/main.py @@ -8,7 +8,7 @@ from litellm._logging import verbose_logger from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj from litellm.llms.azure_ai.rerank import AzureAIRerank from litellm.llms.cohere.rerank import CohereRerank -from litellm.llms.togetherai.rerank import TogetherAIRerank +from litellm.llms.together_ai.rerank import TogetherAIRerank from litellm.secret_managers.main import get_secret from litellm.types.router import * from litellm.utils import client, exception_type, supports_httpx_timeout @@ -103,16 +103,14 @@ def rerank( ) ) - model_parameters = [ - "top_n", - "rank_fields", - "return_documents", - "max_chunks_per_doc", - ] - model_params_dict = {} - for k, v in optional_params.model_fields.items(): - if k in model_parameters: - model_params_dict[k] = v + model_params_dict = { + "top_n": top_n, + "rank_fields": rank_fields, + "return_documents": return_documents, + "max_chunks_per_doc": max_chunks_per_doc, + "documents": documents, + } + litellm_logging_obj.update_environment_variables( model=model, user=user, diff --git a/litellm/tests/test_completion_cost.py b/litellm/tests/test_completion_cost.py index 259d6cccb..b5db9a77e 100644 --- a/litellm/tests/test_completion_cost.py +++ b/litellm/tests/test_completion_cost.py @@ -570,6 +570,9 @@ def test_groq_response_cost_tracking(is_streaming): print(f"response_cost: {response_cost}") +from litellm.types.utils import CallTypes + + def test_together_ai_qwen_completion_cost(): input_kwargs = { "completion_response": litellm.ModelResponse( @@ -612,7 +615,7 @@ def test_together_ai_qwen_completion_cost(): } response = litellm.cost_calculator.get_model_params_and_category( - model_name="qwen/Qwen2-72B-Instruct" + model_name="qwen/Qwen2-72B-Instruct", call_type=CallTypes.completion ) assert response == "together-ai-41.1b-80b" @@ -1323,3 +1326,802 @@ def test_completion_cost_vertex_llama3(): cost = completion_cost(model=model, completion_response=response) assert cost == 0 + + +def test_together_ai_embedding_completion_cost(): + from litellm.utils import Choices, EmbeddingResponse, Message, ModelResponse, Usage + + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + response = EmbeddingResponse( + model="togethercomputer/m2-bert-80M-8k-retrieval", + 
data=[ + { + "embedding": [ + -0.18039076, + 0.11614138, + 0.37174946, + 0.27238843, + -0.21933095, + -0.15207036, + 0.17764972, + -0.08700938, + -0.23863377, + -0.24203257, + 0.20441775, + 0.04630023, + -0.07832973, + -0.193581, + 0.2009999, + -0.30106494, + 0.21179546, + -0.23836501, + -0.14919636, + -0.045276586, + 0.08645845, + -0.027714893, + -0.009854938, + 0.25298217, + -0.1081501, + -0.2383125, + 0.23080236, + 0.011114239, + 0.06954927, + -0.21081704, + 0.06937218, + -0.16756944, + -0.2030545, + -0.19809915, + -0.031914014, + -0.15959585, + 0.17361341, + 0.30239972, + -0.09923253, + 0.12680714, + -0.13018028, + 0.1302273, + 0.19179879, + 0.17068875, + 0.065124996, + -0.15515316, + 0.08250379, + 0.07309733, + -0.07283606, + 0.21411736, + 0.15457751, + -0.08725933, + 0.07227311, + 0.056812778, + -0.077683985, + 0.06833304, + 0.0328722, + 0.2719641, + -0.06989647, + 0.22805125, + 0.14953858, + 0.0792393, + 0.07793462, + 0.16176109, + -0.15616545, + -0.25149494, + -0.065352336, + -0.38410214, + -0.27288514, + 0.13946335, + -0.21873806, + 0.1365704, + 0.11738016, + -0.1141173, + 0.022973377, + -0.16935326, + 0.026940947, + -0.09990286, + -0.05157219, + 0.21006724, + 0.15897459, + 0.011987913, + 0.02576497, + -0.11819022, + -0.09184997, + -0.31881434, + -0.17055357, + -0.09523704, + 0.008458802, + -0.015483258, + 0.038404867, + 0.014673892, + -0.041162584, + 0.002691519, + 0.04601874, + 0.059108324, + 0.007177156, + 0.066804245, + 0.038554087, + -0.038720075, + -0.2145991, + -0.15713418, + -0.03712905, + -0.066650696, + 0.04227769, + 0.018708894, + -0.26332214, + 0.0012769096, + -0.13878848, + -0.33141217, + 0.118736655, + 0.03026654, + 0.1017467, + -0.08000539, + 0.00092649367, + 0.13062756, + -0.03785864, + -0.2038575, + 0.07655428, + -0.24818295, + -0.0600955, + 0.114760056, + 0.027571939, + -0.047068622, + -0.19806816, + 0.0774084, + -0.05213658, + -0.042000014, + 0.051924672, + -0.14131106, + -0.2309609, + 0.20305444, + 0.0700591, + 0.13863273, + -0.06145084, + -0.039423797, + -0.055951696, + 0.04732105, + 0.078736484, + 0.2566198, + 0.054494765, + 0.017602794, + -0.107575715, + -0.017887019, + -0.26046592, + -0.077659994, + -0.08430523, + 0.18806657, + -0.12292346, + 0.06288608, + -0.106739804, + -0.06600645, + -0.14719339, + -0.05070389, + 0.23234129, + -0.034023043, + 0.056019265, + -0.03627352, + 0.11740493, + 0.060294818, + -0.21726903, + -0.09775424, + 0.27007395, + 0.28328258, + 0.022495652, + 0.13218465, + 0.07199022, + -0.15933248, + 0.02381037, + -0.08288268, + 0.020621575, + 0.17395815, + 0.06978612, + 0.18418784, + -0.12663148, + -0.21287888, + 0.21239495, + 0.10222956, + 0.03952703, + -0.066957936, + -0.035802357, + 0.03683884, + 0.22524163, + -0.029355489, + -0.11534147, + -0.041979663, + -0.012147716, + -0.07279564, + 0.17417553, + 0.05546745, + -0.1773277, + -0.26984993, + 0.31703642, + 0.05958132, + -0.14933203, + -0.084655434, + 0.074604444, + -0.077568695, + 0.25167143, + -0.17753932, + -0.006415411, + 0.068613894, + -0.0031754146, + -0.0039771493, + 0.015294107, + 0.11839045, + -0.04570732, + 0.103238374, + -0.09678329, + -0.21713412, + 0.047976546, + -0.14346297, + 0.17429878, + -0.31257913, + 0.15445377, + -0.10576352, + -0.16792995, + -0.17988597, + -0.14238739, + -0.088244036, + 0.2760547, + 0.088823885, + -0.08074319, + -0.028918687, + 0.107819095, + 0.12004892, + 0.13343112, + -0.1332874, + -0.0946055, + -0.20433402, + 0.17760132, + 0.11774745, + 0.16756779, + -0.0937686, + 0.23887308, + 0.27315456, + 0.08657822, + 0.027402503, + -0.06605757, + 0.29859266, 
+ -0.21552202, + 0.026192812, + 0.1328459, + 0.13072926, + 0.19236198, + 0.01760772, + -0.042355467, + 0.08815041, + -0.013158761, + -0.23350924, + -0.043668386, + -0.15479062, + -0.024266671, + 0.08113482, + 0.14451654, + -0.29152337, + -0.028919466, + 0.15022752, + -0.26923147, + 0.23846954, + 0.03292609, + -0.23572414, + -0.14883325, + -0.12743121, + -0.052229587, + -0.14230779, + 0.284658, + 0.36885592, + -0.13176951, + -0.16442224, + -0.20283924, + 0.048434418, + -0.16231743, + -0.0010730615, + 0.1408047, + 0.09481033, + 0.018139571, + -0.030843062, + 0.13304341, + -0.1516288, + -0.051779557, + 0.46940327, + -0.07969027, + -0.051570967, + -0.038892798, + 0.11187677, + 0.1703113, + -0.39926252, + 0.06859773, + 0.08364686, + 0.14696898, + 0.026642298, + 0.13225247, + 0.05730332, + 0.35534015, + 0.11189959, + 0.039673142, + -0.056019083, + 0.15707816, + -0.11053284, + 0.12823457, + 0.20075114, + 0.040237684, + -0.19367051, + 0.13039409, + -0.26038498, + -0.05770229, + -0.009781617, + 0.15812513, + -0.10420735, + -0.020158196, + 0.13160926, + -0.20823349, + -0.045596864, + -0.2074525, + 0.1546387, + 0.30158705, + 0.13175933, + 0.11967154, + -0.09094463, + 0.0019428955, + -0.06745872, + 0.02998099, + -0.18385777, + 0.014330351, + 0.07141392, + -0.17461702, + 0.099743806, + -0.016181415, + 0.1661396, + 0.070834026, + 0.110713825, + 0.14590909, + 0.15404254, + -0.21658006, + 0.00715122, + -0.10229453, + -0.09980027, + -0.09406554, + -0.014849227, + -0.26285952, + 0.069972225, + 0.05732395, + -0.10685719, + 0.037572138, + -0.18863359, + -0.00083297276, + -0.16088934, + -0.117982, + -0.16381365, + -0.008932539, + -0.06549256, + -0.08928683, + 0.29934987, + 0.16532114, + -0.27117223, + -0.12302226, + -0.28685933, + -0.14041144, + -0.0062569617, + -0.20768198, + -0.15385273, + 0.20506454, + -0.21685128, + 0.1081962, + -0.13133131, + 0.18937315, + 0.14751591, + 0.2786974, + -0.060183275, + 0.10365405, + 0.109799005, + -0.044105034, + -0.04260162, + 0.025758557, + 0.07590695, + 0.0726137, + -0.09882405, + 0.26437432, + 0.15884234, + 0.115702584, + 0.0015900572, + 0.11673009, + -0.18648374, + 0.3080215, + -0.26407364, + -0.15610488, + 0.12658228, + -0.05672454, + 0.016239772, + -0.092462406, + -0.36205122, + -0.2925843, + -0.104364775, + -0.2598659, + -0.14073578, + 0.10225995, + -0.2612335, + -0.17479639, + 0.17488293, + -0.2437756, + 0.114384405, + -0.13196659, + -0.067482576, + 0.024756929, + 0.11779123, + 0.2751749, + -0.13306957, + -0.034118645, + -0.14177705, + 0.27164033, + 0.06266008, + 0.11199439, + -0.09814594, + 0.13231735, + 0.019105865, + -0.2652429, + -0.12924416, + 0.0840029, + 0.098754935, + 0.025883028, + -0.33059177, + -0.10544467, + -0.14131607, + -0.09680401, + -0.047318626, + -0.08157771, + -0.11271855, + 0.12637804, + 0.11703408, + 0.014556337, + 0.22788583, + -0.05599293, + 0.25811172, + 0.22956331, + 0.13004553, + 0.15419081, + -0.07971162, + 0.11692607, + -0.2859737, + 0.059627946, + -0.02716421, + 0.117603, + -0.061154094, + -0.13555732, + 0.17092334, + -0.16639015, + 0.2919375, + -0.020189757, + 0.18548165, + -0.32514027, + 0.19324942, + -0.117969565, + 0.23577307, + -0.18052326, + -0.10520473, + -0.2647645, + -0.29393113, + 0.052641366, + -0.07733946, + -0.10684275, + -0.15046178, + 0.065737076, + -0.0022297644, + -0.010802031, + -0.115943395, + -0.11602136, + 0.24265991, + -0.12240144, + 0.11817584, + 0.026270682, + -0.25762397, + -0.14545679, + 0.014168602, + 0.106698096, + 0.12905516, + -0.12560321, + 0.15034604, + 0.071529925, + 0.123048246, + -0.058863316, + 
-0.12251829, + 0.20463347, + 0.06841168, + 0.13706751, + 0.05893755, + -0.12269708, + 0.096701816, + -0.3237337, + -0.2213742, + -0.073655166, + -0.12979327, + 0.14173084, + 0.19167605, + -0.14523135, + 0.06963011, + -0.019228822, + -0.14134938, + 0.22017507, + 0.007933044, + -0.0065696104, + 0.074060634, + -0.13231485, + 0.1387053, + -0.14480218, + -0.007837481, + 0.29880494, + 0.101618655, + 0.14514285, + -0.066113696, + -0.041709363, + 0.21512671, + -0.090142876, + -0.010337287, + 0.13212202, + 0.08307805, + 0.10144794, + -0.024808172, + 0.21877879, + -0.071282186, + -8.786433e-05, + -0.014574037, + -0.11954953, + -0.096931055, + -0.2557228, + 0.1090451, + 0.15424186, + -0.029206438, + -0.2898023, + 0.22510754, + -0.019507697, + 0.1566895, + -0.24820097, + -0.012163554, + 0.12401036, + 0.024711533, + 0.24737844, + -0.06311193, + 0.0652544, + -0.067403205, + 0.15362221, + -0.12093675, + 0.096014425, + 0.17337392, + -0.017509578, + 0.015355054, + 0.055885684, + -0.08358914, + -0.018012024, + 0.069017515, + 0.32854614, + 0.0063175815, + -0.09058244, + 0.000681382, + -0.10825181, + 0.13190223, + 0.009358909, + -0.12205342, + 0.08268384, + -0.260608, + -0.11042252, + -0.022601532, + -0.080661446, + -0.035559367, + 0.14736788, + 0.061933476, + -0.07815901, + 0.110823035, + -0.00875032, + -0.064237975, + -0.04546554, + -0.05909862, + 0.23463917, + -0.20451859, + -0.16576467, + 0.10957323, + -0.08632836, + -0.27395645, + 0.0002913844, + 0.13701706, + -0.058854006, + 0.30768716, + -0.037643027, + -0.1365738, + 0.095908396, + -0.05029932, + 0.14793666, + 0.30881998, + -0.018806668, + -0.15902956, + 0.07953607, + -0.07259314, + 0.17318867, + 0.123503335, + -0.11327983, + -0.24497227, + -0.092871994, + 0.31053993, + 0.09460377, + -0.21152224, + -0.03127119, + -0.018713845, + -0.014523326, + -0.18656968, + 0.2255386, + -0.1902719, + 0.18821372, + -0.16890709, + -0.04607359, + 0.13054903, + -0.05379203, + -0.051014878, + 0.054293603, + -0.07299424, + -0.06728367, + -0.052388195, + -0.29960096, + -0.22351485, + -0.06481434, + -0.1619141, + 0.24709718, + -0.1203425, + 0.029514981, + -0.01951599, + -0.072677284, + -0.25097945, + 0.03758907, + 0.14380245, + -0.037721623, + -0.19958745, + 0.2408246, + -0.13995907, + -0.028115002, + -0.14780775, + 0.17445801, + 0.11311988, + 0.05306163, + 0.0018454103, + 0.00088805315, + -0.27949628, + -0.23556526, + -0.18175222, + -0.28372183, + -0.43095905, + 0.22644317, + 0.06072053, + 0.02278773, + 0.021752749, + 0.053462002, + -0.30636713, + 0.15607472, + -0.16657323, + -0.07240017, + 0.1410017, + -0.026987495, + 0.15029654, + 0.03340291, + -0.2056912, + 0.055395555, + 0.11999902, + 0.06368412, + -0.025476053, + -0.1702383, + -0.23432998, + 0.14855467, + -0.07505147, + -0.030296376, + -0.07001051, + 0.10510949, + 0.10420236, + 0.09809715, + 0.17195594, + 0.19430229, + -0.16121922, + -0.081139356, + 0.15032287, + 0.10385191, + -0.18741366, + 0.008690719, + -0.12941097, + -0.027797364, + -0.2148853, + 0.037788823, + 0.16691138, + 0.099181786, + -0.0955518, + -0.0074798446, + -0.17511943, + 0.14543307, + -0.029364567, + -0.21223477, + -0.05881982, + 0.11064195, + -0.2877007, + -0.023934823, + -0.15569815, + 0.015789302, + -0.035767324, + -0.15110208, + 0.07125638, + 0.05703369, + -0.08454703, + -0.07080854, + 0.025179204, + -0.10522502, + -0.03670824, + -0.11075579, + 0.0681693, + -0.28287485, + 0.2769406, + 0.026260372, + 0.07289979, + 0.04669447, + -0.16541554, + 0.040775143, + 0.035916835, + 0.03648039, + 0.11299418, + 0.14765884, + 0.031163761, + 0.0011800596, + 
-0.10715472, + 0.02665826, + -0.06237457, + 0.15672882, + 0.09038829, + 0.0061029866, + -0.2592228, + -0.21008603, + 0.019810716, + -0.08721265, + 0.107840165, + 0.28438854, + -0.16649202, + 0.19627784, + 0.040611178, + 0.16516201, + 0.24990341, + -0.16222852, + -0.009037945, + 0.053751092, + 0.1647804, + -0.16184275, + -0.29710436, + 0.043035872, + 0.04667557, + 0.14761224, + -0.09030331, + -0.024515491, + 0.10857025, + 0.19865094, + -0.07794062, + 0.17942934, + 0.13322048, + -0.16857187, + 0.055713065, + 0.18661156, + -0.07864222, + 0.23296827, + 0.10348465, + -0.11750994, + -0.065938555, + -0.04377608, + 0.14903909, + 0.019000417, + 0.21033548, + 0.12162547, + 0.1273347, + ], + "index": 0, + "object": "embedding", + } + ], + object="list", + usage=Usage( + completion_tokens=0, + prompt_tokens=0, + total_tokens=0, + completion_tokens_details=None, + ), + ) + + cost = completion_cost( + completion_response=response, + custom_llm_provider="together_ai", + call_type="embedding", + ) diff --git a/litellm/tests/test_embedding.py b/litellm/tests/test_embedding.py index 9641f4aba..732772e76 100644 --- a/litellm/tests/test_embedding.py +++ b/litellm/tests/test_embedding.py @@ -104,14 +104,131 @@ def test_openai_embedding_3(): pytest.fail(f"Error occurred: {e}") -def test_openai_azure_embedding_simple(): +@pytest.mark.parametrize( + "model, api_base, api_key", + [ + # ("azure/azure-embedding-model", None, None), + ("together_ai/togethercomputer/m2-bert-80M-8k-retrieval", None, None), + ], +) +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_openai_azure_embedding_simple(model, api_base, api_key, sync_mode): try: - litellm.set_verbose = True - response = embedding( - model="azure/azure-embedding-model", - input=["good morning from litellm"], - ) + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + # litellm.set_verbose = True + if sync_mode: + response = embedding( + model=model, + input=["good morning from litellm"], + api_base=api_base, + api_key=api_key, + ) + else: + response = await litellm.aembedding( + model=model, + input=["good morning from litellm"], + api_base=api_base, + api_key=api_key, + ) + # print(await response) print(response) + print(response._hidden_params) + response_keys = set(dict(response).keys()) + response_keys.discard("_response_ms") + assert set(["usage", "model", "object", "data"]) == set( + response_keys + ) # assert litellm response has expected keys from OpenAI embedding response + + request_cost = litellm.completion_cost( + completion_response=response, call_type="embedding" + ) + + print("Calculated request cost=", request_cost) + + assert isinstance(response.usage, litellm.Usage) + + except Exception as e: + pytest.fail(f"Error occurred: {e}") + + +# test_openai_azure_embedding_simple() +import base64 + +import requests + +litellm.set_verbose = True +url = "https://dummyimage.com/100/100/fff&text=Test+image" +response = requests.get(url) +file_data = response.content + +encoded_file = base64.b64encode(file_data).decode("utf-8") +base64_image = f"data:image/png;base64,{encoded_file}" + + +from openai.types.embedding import Embedding + + +def _azure_ai_image_mock_response(*args, **kwargs): + new_response = MagicMock() + new_response.headers = {"azureml-model-group": "offer-cohere-embed-multili-paygo"} + + new_response.json.return_value = { + "data": [Embedding(embedding=[1234], index=0, object="embedding")], + "model": "", + "object": "list", + "usage": 
{"prompt_tokens": 1, "total_tokens": 2}, + } + + return new_response + + +@pytest.mark.parametrize( + "model, api_base, api_key", + [ + ( + "azure_ai/Cohere-embed-v3-multilingual-jzu", + "https://Cohere-embed-v3-multilingual-jzu.eastus2.models.ai.azure.com", + os.getenv("AZURE_AI_COHERE_API_KEY_2"), + ) + ], +) +@pytest.mark.parametrize("sync_mode", [True, False]) +@pytest.mark.asyncio +async def test_azure_ai_embedding_image(model, api_base, api_key, sync_mode): + try: + os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True" + litellm.model_cost = litellm.get_model_cost_map(url="") + input = base64_image + if sync_mode: + client = HTTPHandler() + else: + client = AsyncHTTPHandler() + with patch.object( + client, "post", side_effect=_azure_ai_image_mock_response + ) as mock_client: + if sync_mode: + response = embedding( + model=model, + input=[input], + api_base=api_base, + api_key=api_key, + client=client, + ) + else: + response = await litellm.aembedding( + model=model, + input=[input], + api_base=api_base, + api_key=api_key, + client=client, + ) + print(response) + + assert len(response.data) == 1 + + print(response._hidden_params) response_keys = set(dict(response).keys()) response_keys.discard("_response_ms") assert set(["usage", "model", "object", "data"]) == set( @@ -128,9 +245,6 @@ def test_openai_azure_embedding_simple(): pytest.fail(f"Error occurred: {e}") -# test_openai_azure_embedding_simple() - - def test_openai_azure_embedding_timeouts(): try: response = embedding( @@ -226,13 +340,16 @@ def test_openai_azure_embedding_with_oidc_and_cf(): os.environ["AZURE_API_KEY"] = old_key +from openai.types.embedding import Embedding + + def _openai_mock_response(*args, **kwargs): new_response = MagicMock() new_response.headers = {"hello": "world"} new_response.parse.return_value = ( openai.types.create_embedding_response.CreateEmbeddingResponse( - data=[], + data=[Embedding(embedding=[1234, 45667], index=0, object="embedding")], model="azure/test", object="list", usage=openai.types.create_embedding_response.Usage( @@ -267,20 +384,28 @@ def test_openai_azure_embedding_optional_arg(): # test_openai_embedding() +@pytest.mark.parametrize( + "model, api_base", + [ + ("embed-english-v2.0", None), + ], +) @pytest.mark.parametrize("sync_mode", [True, False]) @pytest.mark.asyncio -async def test_cohere_embedding(sync_mode): +async def test_cohere_embedding(sync_mode, model, api_base): try: # litellm.set_verbose=True data = { - "model": "embed-english-v2.0", + "model": model, "input": ["good morning from litellm", "this is another item"], "input_type": "search_query", + "api_base": api_base, } if sync_mode: response = embedding(**data) else: response = await litellm.aembedding(**data) + print(f"response:", response) assert isinstance(response.usage, litellm.Usage) diff --git a/litellm/tests/test_utils.py b/litellm/tests/test_utils.py index 75c31c302..9a03c857b 100644 --- a/litellm/tests/test_utils.py +++ b/litellm/tests/test_utils.py @@ -774,3 +774,21 @@ def test_usage_object_null_tokens(): usage_obj = litellm.Usage(prompt_tokens=2, completion_tokens=None, total_tokens=2) assert usage_obj.completion_tokens == 0 + + +def test_is_base64_encoded(): + import base64 + + import requests + + litellm.set_verbose = True + url = "https://dummyimage.com/100/100/fff&text=Test+image" + response = requests.get(url) + file_data = response.content + + encoded_file = base64.b64encode(file_data).decode("utf-8") + base64_image = f"data:image/png;base64,{encoded_file}" + + from litellm.utils import is_base64_encoded + + 
diff --git a/litellm/tests/test_utils.py b/litellm/tests/test_utils.py
index 75c31c302..9a03c857b 100644
--- a/litellm/tests/test_utils.py
+++ b/litellm/tests/test_utils.py
@@ -774,3 +774,21 @@ def test_usage_object_null_tokens():
     usage_obj = litellm.Usage(prompt_tokens=2, completion_tokens=None, total_tokens=2)
 
     assert usage_obj.completion_tokens == 0
+
+
+def test_is_base64_encoded():
+    import base64
+
+    import requests
+
+    litellm.set_verbose = True
+    url = "https://dummyimage.com/100/100/fff&text=Test+image"
+    response = requests.get(url)
+    file_data = response.content
+
+    encoded_file = base64.b64encode(file_data).decode("utf-8")
+    base64_image = f"data:image/png;base64,{encoded_file}"
+
+    from litellm.utils import is_base64_encoded
+
+    assert is_base64_encoded(s=base64_image) is True
diff --git a/litellm/types/llms/azure_ai.py b/litellm/types/llms/azure_ai.py
new file mode 100644
index 000000000..2d597aef9
--- /dev/null
+++ b/litellm/types/llms/azure_ai.py
@@ -0,0 +1,17 @@
+from typing import Any, Dict, Iterable, List, Literal, Optional, Union
+
+from typing_extensions import Required, TypedDict
+
+
+class ImageEmbeddingInput(TypedDict, total=False):
+    image: Required[str]
+    text: str
+
+
+EncodingFormat = Literal["base64", "binary", "float", "int8", "ubinary", "uint8"]
+
+
+class ImageEmbeddingRequest(TypedDict, total=False):
+    input: Required[List[ImageEmbeddingInput]]
+    dimensions: int
+    encoding_format: EncodingFormat
diff --git a/litellm/types/llms/openai.py b/litellm/types/llms/openai.py
index f2048cfea..b73b4bc3d 100644
--- a/litellm/types/llms/openai.py
+++ b/litellm/types/llms/openai.py
@@ -9,7 +9,7 @@ from openai.lib.streaming._assistants import (
     AsyncAssistantStreamManager,
 )
 from openai.pagination import AsyncCursorPage, SyncCursorPage
-from openai.types import Batch, FileObject
+from openai.types import Batch, EmbeddingCreateParams, FileObject
 from openai.types.beta.assistant import Assistant
 from openai.types.beta.assistant_tool_param import AssistantToolParam
 from openai.types.beta.thread_create_params import (
diff --git a/litellm/types/utils.py b/litellm/types/utils.py
index 618f9bf47..ab1ffe101 100644
--- a/litellm/types/utils.py
+++ b/litellm/types/utils.py
@@ -766,7 +766,7 @@ class EmbeddingResponse(OpenAIObject):
     """The actual embedding value"""
 
     object: Literal["list"]
-    """The object type, which is always "embedding" """
+    """The object type, which is always "list" """
 
     usage: Optional[Usage] = None
     """Usage statistics for the embedding request."""
diff --git a/litellm/utils.py b/litellm/utils.py
index 1c9d7bde7..31150111f 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -11118,6 +11118,10 @@ def is_cached_message(message: AllMessageValues) -> bool:
 
 def is_base64_encoded(s: str) -> bool:
     try:
+        # Strip out the prefix if it exists
+        if s.startswith("data:"):
+            s = s.split(",")[1]
+
         # Try to decode the string
         decoded_bytes = base64.b64decode(s, validate=True)
         # Check if the original string can be re-encoded to the same string
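The `is_base64_encoded` change above strips an optional `data:<mime>;base64,` prefix before validating, which is what lets the data-URL inputs in the embedding tests pass. A minimal behavior sketch, assuming (per the function's existing try/except) that it returns False when decoding fails:

import base64

from litellm.utils import is_base64_encoded

raw = base64.b64encode(b"hello world").decode("utf-8")

# Plain base64 still validates.
assert is_base64_encoded(s=raw) is True

# With the prefix-stripping above, a data URL now validates too.
assert is_base64_encoded(s=f"data:image/png;base64,{raw}") is True

# Garbage input fails validation and is rejected.
assert is_base64_encoded(s="definitely not base64!!") is False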
diff --git a/model_prices_and_context_window.json b/model_prices_and_context_window.json
index e801788ad..2831f1a5c 100644
--- a/model_prices_and_context_window.json
+++ b/model_prices_and_context_window.json
@@ -990,6 +990,26 @@
         "mode": "chat",
         "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/metagenai.meta-llama-3-1-405b-instruct-offer?tab=PlansAndPrice"
     },
+    "azure_ai/Cohere-embed-v3-english": {
+        "max_tokens": 512,
+        "max_input_tokens": 512,
+        "output_vector_size": 1024,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "azure_ai",
+        "mode": "embedding",
+        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
+    },
+    "azure_ai/Cohere-embed-v3-multilingual": {
+        "max_tokens": 512,
+        "max_input_tokens": 512,
+        "output_vector_size": 1024,
+        "input_cost_per_token": 0.0000001,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "azure_ai",
+        "mode": "embedding",
+        "source":"https://azuremarketplace.microsoft.com/en-us/marketplace/apps/cohere.cohere-embed-v3-english-offer?tab=PlansAndPrice"
+    },
     "babbage-002": {
         "max_tokens": 16384,
         "max_input_tokens": 16384,
@@ -4964,50 +4984,71 @@
     "together-ai-up-to-4b": {
         "input_cost_per_token": 0.0000001,
         "output_cost_per_token": 0.0000001,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
     },
     "together-ai-4.1b-8b": {
         "input_cost_per_token": 0.0000002,
         "output_cost_per_token": 0.0000002,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
     },
     "together-ai-8.1b-21b": {
         "max_tokens": 1000,
         "input_cost_per_token": 0.0000003,
         "output_cost_per_token": 0.0000003,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
     },
     "together-ai-21.1b-41b": {
         "input_cost_per_token": 0.0000008,
         "output_cost_per_token": 0.0000008,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
     },
     "together-ai-41.1b-80b": {
         "input_cost_per_token": 0.0000009,
         "output_cost_per_token": 0.0000009,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
     },
     "together-ai-81.1b-110b": {
         "input_cost_per_token": 0.0000018,
         "output_cost_per_token": 0.0000018,
-        "litellm_provider": "together_ai"
+        "litellm_provider": "together_ai",
+        "mode": "chat"
+    },
+    "together-ai-embedding-up-to-150m": {
+        "input_cost_per_token": 0.000000008,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "together_ai",
+        "mode": "embedding"
+    },
+    "together-ai-embedding-151m-to-350m": {
+        "input_cost_per_token": 0.000000016,
+        "output_cost_per_token": 0.0,
+        "litellm_provider": "together_ai",
+        "mode": "embedding"
     },
     "together_ai/mistralai/Mixtral-8x7B-Instruct-v0.1": {
         "input_cost_per_token": 0.0000006,
         "output_cost_per_token": 0.0000006,
         "litellm_provider": "together_ai",
         "supports_function_calling": true,
-        "supports_parallel_function_calling": true
+        "supports_parallel_function_calling": true,
+        "mode": "chat"
     },
     "together_ai/mistralai/Mistral-7B-Instruct-v0.1": {
         "litellm_provider": "together_ai",
         "supports_function_calling": true,
-        "supports_parallel_function_calling": true
+        "supports_parallel_function_calling": true,
+        "mode": "chat"
     },
     "together_ai/togethercomputer/CodeLlama-34b-Instruct": {
         "litellm_provider": "together_ai",
         "supports_function_calling": true,
-        "supports_parallel_function_calling": true
+        "supports_parallel_function_calling": true,
+        "mode": "chat"
     },
     "ollama/codegemma": {
         "max_tokens": 8192,