add bedrock llama vision support + cohere / infinity rerank - 'return_documents' support (#8684)

* build(model_prices_and_context_window.json): mark bedrock llama as supporting vision based on docs

* Add price for Cerebras llama3.3-70b (#8676)

* docs(readme.md): fix contributing docs

Point people to the new mock directory testing structure (s/o @vibhavbhat)

* build: update contributing readme

* docs(readme.md): improve docs

* docs(readme.md): cleanup readme on tests/

* docs(README.md): cleanup doc

* feat(infinity/): support returning documents when return_documents=True

* test(test_rerank.py): add e2e testing for cohere rerank

* fix: fix linting errors

* fix(together_ai/): fix together ai transformation

* fix: fix linting error

* fix: fix linting errors

* fix: fix linting errors

* test: mark cohere as flaky

* build: fix model supports check

* test: fix test

* test: mark flaky test

* fix: fix test

* test: fix test

---------

Co-authored-by: Yury Koleda <fut.wrk@gmail.com>
Krish Dholakia 2025-02-20 21:23:54 -08:00 committed by GitHub
parent b682dc4ec8
commit 251467a525
13 changed files with 206 additions and 31 deletions
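Illustration (not part of the commit): the `return_documents` support described above is exercised end-to-end by the new Cohere test in this diff; a minimal call looks roughly like the sketch below, assuming a Cohere API key is configured and reusing the same placeholder inputs as that test.

```
import litellm

response = litellm.rerank(
    model="cohere/rerank-english-v3.0",
    query="hello",
    documents=["hello", "world"],
    top_n=3,
    return_documents=True,  # new: results now carry the matched document text
)

# With return_documents=True, each result includes a {"text": ...} document.
print(response.results[0]["document"]["text"])  # -> "hello"
```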


@@ -343,25 +343,32 @@ curl 'http://0.0.0.0:4000/key/generate' \
To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.
Here's how to modify the repo locally:
Step 1: Clone the repo
```
git clone https://github.com/BerriAI/litellm.git
```
Step 2: Navigate into the project, and install dependencies:
Step 2: Install dependencies:
```
cd litellm
poetry install -E extra_proxy -E proxy
pip install -r requirements.txt
```
Step 3: Test your change:
a. Add a pytest test within `tests/litellm/`
This folder follows the same directory structure as `litellm/`.
If a corresponding test file does not exist, create one.
b. Run the test
```
cd tests # pwd: Documents/litellm/litellm/tests
poetry run flake8
poetry run pytest .
cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
pytest /path/to/test_file.py
```
Step 4: Submit a PR with your changes! 🚀
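Illustration (not part of the commit): a minimal mock test for the `tests/litellm/` structure described in step 3, following the same `AsyncHTTPHandler.post` patching pattern used by the rerank tests in this commit (the file name, model, and api_base are placeholders).

```
from unittest.mock import AsyncMock, patch

import pytest

import litellm


@pytest.mark.asyncio
async def test_arerank_mock():
    # Stub the shared async HTTP handler so no real network call is made.
    mock_response = AsyncMock()
    mock_response.status_code = 200
    mock_response.headers = {"key": "value"}
    mock_response.json = lambda: {
        "id": "mock-id",
        "results": [{"index": 0, "relevance_score": 0.9}],
        "usage": {"prompt_tokens": 10, "total_tokens": 12},
    }

    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        return_value=mock_response,
    ):
        response = await litellm.arerank(
            model="infinity/rerank-model",
            query="hello",
            documents=["hello", "world"],
            api_base="https://api.infinity.ai",
        )

    assert response.results[0]["index"] == 0
```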


@@ -13,8 +13,14 @@ import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.llms.cohere.rerank.transformation import CohereRerankConfig
from litellm.secret_managers.main import get_secret_str
from litellm.types.rerank import RerankBilledUnits, RerankResponseMeta, RerankTokens
from litellm.types.utils import RerankResponse
from litellm.types.rerank import (
RerankBilledUnits,
RerankResponse,
RerankResponseDocument,
RerankResponseMeta,
RerankResponseResult,
RerankTokens,
)
from .common_utils import InfinityError
@@ -88,13 +94,23 @@ class InfinityRerankConfig(CohereRerankConfig):
)
rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens)
_results: Optional[List[dict]] = raw_response_json.get("results")
if _results is None:
cohere_results: List[RerankResponseResult] = []
if raw_response_json.get("results"):
for result in raw_response_json.get("results"):
_rerank_response = RerankResponseResult(
index=result.get("index"),
relevance_score=result.get("relevance_score"),
)
if result.get("document"):
_rerank_response["document"] = RerankResponseDocument(
text=result.get("document")
)
cohere_results.append(_rerank_response)
if cohere_results is None:
raise ValueError(f"No results found in the response={raw_response_json}")
return RerankResponse(
id=raw_response_json.get("id") or str(uuid.uuid4()),
results=_results, # type: ignore
results=cohere_results,
meta=rerank_meta,
) # Return response
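Illustration (not part of the commit): a worked example of the wrapping performed above. Infinity returns the document as a bare string, and the transformation emits the normalized `{"text": ...}` shape; values are made up, and the types come from `litellm.types.rerank` as imported in this diff.

```
from typing import List

from litellm.types.rerank import RerankResponseDocument, RerankResponseResult

# Made-up raw payload in the shape Infinity returns when return_documents=True.
raw_response_json = {
    "results": [{"index": 0, "relevance_score": 0.95, "document": "hello"}]
}

results: List[RerankResponseResult] = []
for result in raw_response_json["results"]:
    typed_result = RerankResponseResult(
        index=result["index"], relevance_score=result["relevance_score"]
    )
    if result.get("document"):
        # The bare string is wrapped into the normalized document shape.
        typed_result["document"] = RerankResponseDocument(text=result["document"])
    results.append(typed_result)

assert results[0]["document"] == {"text": "hello"}
```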


@@ -10,7 +10,9 @@ from typing import List, Optional
from litellm.types.rerank import (
RerankBilledUnits,
RerankResponse,
RerankResponseDocument,
RerankResponseMeta,
RerankResponseResult,
RerankTokens,
)
@@ -27,8 +29,35 @@ class TogetherAIRerankConfig:
if _results is None:
raise ValueError(f"No results found in the response={response}")
rerank_results: List[RerankResponseResult] = []
for result in _results:
# Validate required fields exist
if not all(key in result for key in ["index", "relevance_score"]):
raise ValueError(f"Missing required fields in the result={result}")
# Get document data if it exists
document_data = result.get("document", {})
document = (
RerankResponseDocument(text=str(document_data.get("text", "")))
if document_data
else None
)
# Create typed result
rerank_result = RerankResponseResult(
index=int(result["index"]),
relevance_score=float(result["relevance_score"]),
)
# Only add document if it exists
if document:
rerank_result["document"] = document
rerank_results.append(rerank_result)
return RerankResponse(
id=response.get("id") or str(uuid.uuid4()),
results=_results, # type: ignore
results=rerank_results,
meta=rerank_meta,
) # Return response


@@ -2643,6 +2643,17 @@
"supports_function_calling": true,
"supports_tool_choice": true
},
"cerebras/llama3.3-70b": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000085,
"output_cost_per_token": 0.0000012,
"litellm_provider": "cerebras",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true
},
"friendliai/meta-llama-3.1-8b-instruct": {
"max_tokens": 8192,
"max_input_tokens": 8192,
@@ -7450,7 +7461,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-2-11b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7461,7 +7473,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"meta.llama3-2-90b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7472,7 +7485,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-2-90b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7483,7 +7497,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-3-70b-instruct-v1:0": {
"max_tokens": 4096,


@@ -0,0 +1,19 @@
import json
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
from typing_extensions import (
Protocol,
Required,
Self,
TypeGuard,
get_origin,
override,
runtime_checkable,
)
class InfinityRerankResult(TypedDict):
index: int
relevance_score: float
document: Optional[str]


@@ -7,7 +7,7 @@ https://docs.cohere.com/reference/rerank
from typing import List, Optional, Union
from pydantic import BaseModel, PrivateAttr
from typing_extensions import TypedDict
from typing_extensions import Required, TypedDict
class RerankRequest(BaseModel):
@@ -45,9 +45,14 @@ class RerankResponseMeta(TypedDict, total=False):
tokens: Optional[RerankTokens]
class RerankResponseResult(TypedDict):
index: int
relevance_score: float
class RerankResponseDocument(TypedDict):
text: str
class RerankResponseResult(TypedDict, total=False):
index: Required[int]
relevance_score: Required[float]
document: RerankResponseDocument
class RerankResponse(BaseModel):
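Illustration (not part of the commit): with `total=False` plus `Required`, `index` and `relevance_score` stay mandatory while `document` becomes an optional key, so both of the values below are valid `RerankResponseResult` instances.

```
from litellm.types.rerank import RerankResponseDocument, RerankResponseResult

# Required keys only -- valid because only index/relevance_score are Required.
minimal: RerankResponseResult = {"index": 0, "relevance_score": 0.99}

# When return_documents=True, the optional "document" key is populated.
with_doc: RerankResponseResult = {
    "index": 1,
    "relevance_score": 0.42,
    "document": RerankResponseDocument(text="hello"),
}
```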


@@ -2643,6 +2643,17 @@
"supports_function_calling": true,
"supports_tool_choice": true
},
"cerebras/llama3.3-70b": {
"max_tokens": 128000,
"max_input_tokens": 128000,
"max_output_tokens": 128000,
"input_cost_per_token": 0.00000085,
"output_cost_per_token": 0.0000012,
"litellm_provider": "cerebras",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": true
},
"friendliai/meta-llama-3.1-8b-instruct": {
"max_tokens": 8192,
"max_input_tokens": 8192,
@@ -7450,7 +7461,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-2-11b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7461,7 +7473,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"meta.llama3-2-90b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7472,7 +7485,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-2-90b-instruct-v1:0": {
"max_tokens": 128000,
@@ -7483,7 +7497,8 @@
"litellm_provider": "bedrock",
"mode": "chat",
"supports_function_calling": true,
"supports_tool_choice": false
"supports_tool_choice": false,
"supports_vision": true
},
"us.meta.llama3-3-70b-instruct-v1:0": {
"max_tokens": 4096,


@@ -1 +1,9 @@
**In total litellm runs 500+ tests** Most tests are in [/litellm/tests](https://github.com/BerriAI/litellm/tree/main/litellm/tests). These are just the tests for the proxy docker image, used for circle ci.
**In total litellm runs 1000+ tests**
[02/20/2025] Update:
To make it easier to contribute and map what behavior is tested,
we've started mapping the litellm directory in `tests/litellm`
This folder can only run mock tests.


@@ -1165,6 +1165,9 @@ def test_models_by_provider():
"""
Make sure all providers from model map are in the valid providers list
"""
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
from litellm import models_by_provider
providers = set()


@@ -87,6 +87,39 @@ async def test_infinity_rerank():
assert_response_shape(response, custom_llm_provider="infinity")
@pytest.mark.asyncio()
async def test_infinity_rerank_with_return_documents():
mock_response = AsyncMock()
def return_val():
return {
"id": "cmpl-mockid",
"results": [{"index": 0, "relevance_score": 0.95, "document": "hello"}],
"usage": {"prompt_tokens": 100, "total_tokens": 150},
}
mock_response.json = return_val
mock_response.headers = {"key": "value"}
mock_response.status_code = 200
with patch(
"litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
return_value=mock_response,
) as mock_post:
response = await litellm.arerank(
model="infinity/rerank-model",
query="hello",
documents=["hello", "world"],
top_n=3,
return_documents=True,
api_base="https://api.infinity.ai",
)
assert response.results[0]["document"] == {"text": "hello"}
assert_response_shape(response, custom_llm_provider="infinity")
@pytest.mark.asyncio()
async def test_infinity_rerank_with_env(monkeypatch):
# Set up mock response


@@ -9,6 +9,7 @@ from dotenv import load_dotenv
load_dotenv()
import io
import os
from typing import Optional, Dict
sys.path.insert(
0, os.path.abspath("../..")
@@ -29,7 +30,11 @@ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
def assert_response_shape(response, custom_llm_provider):
expected_response_shape = {"id": str, "results": list, "meta": dict}
expected_results_shape = {"index": int, "relevance_score": float}
expected_results_shape = {
"index": int,
"relevance_score": float,
"document": Optional[Dict[str, str]],
}
expected_meta_shape = {"api_version": dict, "billed_units": dict}
@@ -44,6 +49,9 @@ def assert_response_shape(response, custom_llm_provider):
assert isinstance(
result["relevance_score"], expected_results_shape["relevance_score"]
)
if "document" in result:
assert isinstance(result["document"], Dict)
assert isinstance(result["document"]["text"], str)
assert isinstance(response.meta, expected_response_shape["meta"])
if custom_llm_provider == "cohere":
@@ -364,17 +372,15 @@ def test_rerank_response_assertions():
**{
"id": "ab0fcca0-b617-11ef-b292-0242ac110002",
"results": [
{"index": 2, "relevance_score": 0.9958819150924683, "document": None},
{"index": 0, "relevance_score": 0.001293411129154265, "document": None},
{"index": 2, "relevance_score": 0.9958819150924683},
{"index": 0, "relevance_score": 0.001293411129154265},
{
"index": 1,
"relevance_score": 7.641685078851879e-05,
"document": None,
},
{
"index": 3,
"relevance_score": 7.621097756782547e-05,
"document": None,
},
],
"meta": {
@@ -387,3 +393,19 @@
)
assert_response_shape(r, custom_llm_provider="custom")
@pytest.mark.flaky(retries=3, delay=1)
def test_rerank_cohere_api():
response = litellm.rerank(
model="cohere/rerank-english-v3.0",
query="hello",
documents=["hello", "world"],
return_documents=True,
top_n=3,
)
print("rerank response", response)
assert response.results[0]["document"] is not None
assert response.results[0]["document"]["text"] is not None
assert response.results[0]["document"]["text"] == "hello"
assert response.results[1]["document"]["text"] == "world"


@@ -2775,6 +2775,8 @@ def test_bedrock_cost_calc_with_region():
os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
litellm.model_cost = litellm.get_model_cost_map(url="")
litellm.add_known_models()
hidden_params = {
"custom_llm_provider": "bedrock",
"region_name": "us-east-1",


@@ -961,6 +961,7 @@ async def test_gemini_embeddings(sync_mode, input):
@pytest.mark.parametrize("sync_mode", [True, False])
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_hf_embedddings_with_optional_params(sync_mode):
litellm.set_verbose = True