Mirror of https://github.com/BerriAI/litellm.git (synced 2025-04-26 03:04:13 +00:00)
add bedrock llama vision support + cohere / infinity rerank - 'return_documents' support (#8684)
* build(model_prices_and_context_window.json): mark bedrock llama as supporting vision based on docs
* Add price for Cerebras llama3.3-70b (#8676)
* docs(readme.md): fix contributing docs - point people to new mock directory testing structure (s/o @vibhavbhat)
* build: update contributing readme
* docs(readme.md): improve docs
* docs(readme.md): cleanup readme on tests/
* docs(README.md): cleanup doc
* feat(infinity/): support returning documents when return_documents=True
* test(test_rerank.py): add e2e testing for cohere rerank
* fix: fix linting errors
* fix(together_ai/): fix together ai transformation
* fix: fix linting error
* fix: fix linting errors
* fix: fix linting errors
* test: mark cohere as flaky
* build: fix model supports check
* test: fix test
* test: mark flaky test
* fix: fix test
* test: fix test

---------

Co-authored-by: Yury Koleda <fut.wrk@gmail.com>
Parent: b682dc4ec8
Commit: 251467a525
13 changed files with 206 additions and 31 deletions
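In practice, the new rerank flag looks like this (a sketch adapted from the `test_rerank_cohere_api` e2e test added below; assumes a valid `COHERE_API_KEY` in the environment):

```python
import litellm

# 'return_documents=True' asks the provider to echo each ranked document
# back in the response; each result then carries a {"text": ...} payload.
response = litellm.rerank(
    model="cohere/rerank-english-v3.0",
    query="hello",
    documents=["hello", "world"],
    top_n=3,
    return_documents=True,
)

print(response.results[0]["document"]["text"])  # "hello" ranks first for "hello"
```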
README.md (19 changed lines)
@@ -343,25 +343,32 @@ curl 'http://0.0.0.0:4000/key/generate' \
 To contribute: Clone the repo locally -> Make a change -> Submit a PR with the change.

 Here's how to modify the repo locally:

 Step 1: Clone the repo

 ```
 git clone https://github.com/BerriAI/litellm.git
 ```

-Step 2: Navigate into the project, and install dependencies:
+Step 2: Install dependencies:

 ```
-cd litellm
-poetry install -E extra_proxy -E proxy
+pip install -r requirements.txt
 ```

 Step 3: Test your change:

+a. Add a pytest test within `tests/litellm/`
+
+This folder follows the same directory structure as `litellm/`.
+
+If a corresponding test file does not exist, create one.
+
+b. Run the test
+
 ```
-cd tests # pwd: Documents/litellm/litellm/tests
-poetry run flake8
-poetry run pytest .
+cd tests/litellm # pwd: Documents/litellm/litellm/tests/litellm
+pytest /path/to/test_file.py
 ```

 Step 4: Submit a PR with your changes! 🚀
litellm/llms/infinity/rerank/transformation.py

@@ -13,8 +13,14 @@ import litellm
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.llms.cohere.rerank.transformation import CohereRerankConfig
 from litellm.secret_managers.main import get_secret_str
-from litellm.types.rerank import RerankBilledUnits, RerankResponseMeta, RerankTokens
-from litellm.types.utils import RerankResponse
+from litellm.types.rerank import (
+    RerankBilledUnits,
+    RerankResponse,
+    RerankResponseDocument,
+    RerankResponseMeta,
+    RerankResponseResult,
+    RerankTokens,
+)

 from .common_utils import InfinityError

@@ -88,13 +94,23 @@ class InfinityRerankConfig(CohereRerankConfig):
         )
         rerank_meta = RerankResponseMeta(billed_units=_billed_units, tokens=_tokens)

-        _results: Optional[List[dict]] = raw_response_json.get("results")
-
-        if _results is None:
+        cohere_results: List[RerankResponseResult] = []
+        if raw_response_json.get("results"):
+            for result in raw_response_json.get("results"):
+                _rerank_response = RerankResponseResult(
+                    index=result.get("index"),
+                    relevance_score=result.get("relevance_score"),
+                )
+                if result.get("document"):
+                    _rerank_response["document"] = RerankResponseDocument(
+                        text=result.get("document")
+                    )
+                cohere_results.append(_rerank_response)
+        if cohere_results is None:
             raise ValueError(f"No results found in the response={raw_response_json}")

         return RerankResponse(
             id=raw_response_json.get("id") or str(uuid.uuid4()),
-            results=_results,  # type: ignore
+            results=cohere_results,
             meta=rerank_meta,
         )  # Return response
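A minimal sketch of the mapping the Infinity hunk above performs, using the typed dicts added to `litellm/types/rerank.py` later in this commit. (One quirk preserved from the hunk: `cohere_results` starts as `[]`, which is never `None`, so the `ValueError` branch is effectively dead and an empty `results` payload produces an empty response rather than an error.)

```python
from litellm.types.rerank import RerankResponseDocument, RerankResponseResult

raw_result = {"index": 0, "relevance_score": 0.95, "document": "hello"}

# Required fields first; 'document' is attached only when present,
# wrapped as {"text": ...} to match the cohere-style response shape.
typed = RerankResponseResult(
    index=raw_result["index"],
    relevance_score=raw_result["relevance_score"],
)
if raw_result.get("document"):
    typed["document"] = RerankResponseDocument(text=raw_result["document"])

print(typed)  # {'index': 0, 'relevance_score': 0.95, 'document': {'text': 'hello'}}
```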
litellm/llms/together_ai/rerank/transformation.py

@@ -10,7 +10,9 @@ from typing import List, Optional
 from litellm.types.rerank import (
     RerankBilledUnits,
     RerankResponse,
+    RerankResponseDocument,
     RerankResponseMeta,
+    RerankResponseResult,
     RerankTokens,
 )

@@ -27,8 +29,35 @@ class TogetherAIRerankConfig:
         if _results is None:
             raise ValueError(f"No results found in the response={response}")

+        rerank_results: List[RerankResponseResult] = []
+
+        for result in _results:
+            # Validate required fields exist
+            if not all(key in result for key in ["index", "relevance_score"]):
+                raise ValueError(f"Missing required fields in the result={result}")
+
+            # Get document data if it exists
+            document_data = result.get("document", {})
+            document = (
+                RerankResponseDocument(text=str(document_data.get("text", "")))
+                if document_data
+                else None
+            )
+
+            # Create typed result
+            rerank_result = RerankResponseResult(
+                index=int(result["index"]),
+                relevance_score=float(result["relevance_score"]),
+            )
+
+            # Only add document if it exists
+            if document:
+                rerank_result["document"] = document
+
+            rerank_results.append(rerank_result)
+
         return RerankResponse(
             id=response.get("id") or str(uuid.uuid4()),
-            results=_results,  # type: ignore
+            results=rerank_results,
             meta=rerank_meta,
         )  # Return response
litellm/model_prices_and_context_window_backup.json

@@ -2643,6 +2643,17 @@
         "supports_function_calling": true,
         "supports_tool_choice": true
     },
+    "cerebras/llama3.3-70b": {
+        "max_tokens": 128000,
+        "max_input_tokens": 128000,
+        "max_output_tokens": 128000,
+        "input_cost_per_token": 0.00000085,
+        "output_cost_per_token": 0.0000012,
+        "litellm_provider": "cerebras",
+        "mode": "chat",
+        "supports_function_calling": true,
+        "supports_tool_choice": true
+    },
     "friendliai/meta-llama-3.1-8b-instruct": {
         "max_tokens": 8192,
         "max_input_tokens": 8192,

@@ -7450,7 +7461,8 @@
         "litellm_provider": "bedrock",
         "mode": "chat",
         "supports_function_calling": true,
-        "supports_tool_choice": false
+        "supports_tool_choice": false,
+        "supports_vision": true
     },
     "us.meta.llama3-2-11b-instruct-v1:0": {
         "max_tokens": 128000,

@@ -7461,7 +7473,8 @@
         "litellm_provider": "bedrock",
         "mode": "chat",
         "supports_function_calling": true,
-        "supports_tool_choice": false
+        "supports_tool_choice": false,
+        "supports_vision": true
     },
     "meta.llama3-2-90b-instruct-v1:0": {
         "max_tokens": 128000,

@@ -7472,7 +7485,8 @@
         "litellm_provider": "bedrock",
         "mode": "chat",
         "supports_function_calling": true,
-        "supports_tool_choice": false
+        "supports_tool_choice": false,
+        "supports_vision": true
     },
     "us.meta.llama3-2-90b-instruct-v1:0": {
         "max_tokens": 128000,

@@ -7483,7 +7497,8 @@
         "litellm_provider": "bedrock",
         "mode": "chat",
         "supports_function_calling": true,
-        "supports_tool_choice": false
+        "supports_tool_choice": false,
+        "supports_vision": true
     },
     "us.meta.llama3-3-70b-instruct-v1:0": {
         "max_tokens": 4096,
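These map entries feed litellm's capability checks (the "build: fix model supports check" item in the commit message). As a rough sketch of how a caller would consult the new flags, assuming litellm's `supports_vision` helper reads this map (call shape assumed here):

```python
import litellm

# With the updated model map loaded, the bedrock llama 3.2 vision models
# marked above should report vision support.
print(
    litellm.supports_vision(
        model="meta.llama3-2-90b-instruct-v1:0",
        custom_llm_provider="bedrock",
    )
)  # expected: True
```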
litellm/types/llms/rerank.py (new file, 19 lines)

@@ -0,0 +1,19 @@
+import json
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Tuple, TypedDict, Union
+
+from typing_extensions import (
+    Protocol,
+    Required,
+    Self,
+    TypeGuard,
+    get_origin,
+    override,
+    runtime_checkable,
+)
+
+
+class InfinityRerankResult(TypedDict):
+    index: int
+    relevance_score: float
+    document: Optional[str]
litellm/types/rerank.py

@@ -7,7 +7,7 @@ https://docs.cohere.com/reference/rerank
 from typing import List, Optional, Union

 from pydantic import BaseModel, PrivateAttr
-from typing_extensions import TypedDict
+from typing_extensions import Required, TypedDict


 class RerankRequest(BaseModel):

@@ -45,9 +45,14 @@ class RerankResponseMeta(TypedDict, total=False):
     tokens: Optional[RerankTokens]


-class RerankResponseResult(TypedDict):
-    index: int
-    relevance_score: float
+class RerankResponseDocument(TypedDict):
+    text: str
+
+
+class RerankResponseResult(TypedDict, total=False):
+    index: Required[int]
+    relevance_score: Required[float]
+    document: RerankResponseDocument


 class RerankResponse(BaseModel):
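The switch to `total=False` with `Required[...]` keys is what lets `document` be omitted from a result entirely (rather than carried as `document: None`, which the updated `test_rerank_response_assertions` below drops), while keeping `index` and `relevance_score` mandatory. A quick illustration of the semantics:

```python
from typing_extensions import Required, TypedDict


class RerankResponseDocument(TypedDict):
    text: str


class RerankResponseResult(TypedDict, total=False):
    index: Required[int]
    relevance_score: Required[float]
    document: RerankResponseDocument


# Valid: 'document' may be absent under total=False...
minimal: RerankResponseResult = {"index": 0, "relevance_score": 0.99}

# ...but Required keys may not be dropped; a type checker rejects this:
# bad: RerankResponseResult = {"relevance_score": 0.99}

with_doc: RerankResponseResult = {
    "index": 1,
    "relevance_score": 0.42,
    "document": {"text": "world"},
}
```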
model_prices_and_context_window.json

@@ -2643,6 +2643,17 @@
@@ -7450,7 +7461,8 @@
@@ -7461,7 +7473,8 @@
@@ -7472,7 +7485,8 @@
@@ -7483,7 +7497,8 @@

The hunks are identical to litellm/model_prices_and_context_window_backup.json above: the "cerebras/llama3.3-70b" pricing entry is added, and "supports_vision": true is set on meta.llama3-2-11b-instruct-v1:0, us.meta.llama3-2-11b-instruct-v1:0, meta.llama3-2-90b-instruct-v1:0, and us.meta.llama3-2-90b-instruct-v1:0.
tests/README.MD

@@ -1 +1,9 @@
-**In total litellm runs 500+ tests** Most tests are in [/litellm/tests](https://github.com/BerriAI/litellm/tree/main/litellm/tests). These are just the tests for the proxy docker image, used for circle ci.
+**In total litellm runs 1000+ tests**
+
+[02/20/2025] Update:
+
+To make it easier to contribute and map what behavior is tested,
+
+we've started mapping the litellm directory in `tests/litellm`
+
+This folder can only run mock tests.
@@ -1165,6 +1165,9 @@ def test_models_by_provider():
     """
     Make sure all providers from model map are in the valid providers list
     """
+    os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
+    litellm.model_cost = litellm.get_model_cost_map(url="")
+
     from litellm import models_by_provider

     providers = set()
@@ -87,6 +87,39 @@ async def test_infinity_rerank():
     assert_response_shape(response, custom_llm_provider="infinity")


+@pytest.mark.asyncio()
+async def test_infinity_rerank_with_return_documents():
+    mock_response = AsyncMock()
+
+    mock_response = AsyncMock()
+
+    def return_val():
+        return {
+            "id": "cmpl-mockid",
+            "results": [{"index": 0, "relevance_score": 0.95, "document": "hello"}],
+            "usage": {"prompt_tokens": 100, "total_tokens": 150},
+        }
+
+    mock_response.json = return_val
+    mock_response.headers = {"key": "value"}
+    mock_response.status_code = 200
+
+    with patch(
+        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
+        return_value=mock_response,
+    ) as mock_post:
+        response = await litellm.arerank(
+            model="infinity/rerank-model",
+            query="hello",
+            documents=["hello", "world"],
+            top_n=3,
+            return_documents=True,
+            api_base="https://api.infinity.ai",
+        )
+        assert response.results[0]["document"] == {"text": "hello"}
+        assert_response_shape(response, custom_llm_provider="infinity")
+
+
 @pytest.mark.asyncio()
 async def test_infinity_rerank_with_env(monkeypatch):
     # Set up mock response
test_rerank.py

@@ -9,6 +9,7 @@ from dotenv import load_dotenv
 load_dotenv()
 import io
 import os
+from typing import Optional, Dict

 sys.path.insert(
     0, os.path.abspath("../..")

@@ -29,7 +30,11 @@ from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
 def assert_response_shape(response, custom_llm_provider):
     expected_response_shape = {"id": str, "results": list, "meta": dict}

-    expected_results_shape = {"index": int, "relevance_score": float}
+    expected_results_shape = {
+        "index": int,
+        "relevance_score": float,
+        "document": Optional[Dict[str, str]],
+    }

     expected_meta_shape = {"api_version": dict, "billed_units": dict}

@@ -44,6 +49,9 @@ def assert_response_shape(response, custom_llm_provider):
         assert isinstance(
             result["relevance_score"], expected_results_shape["relevance_score"]
         )
+        if "document" in result:
+            assert isinstance(result["document"], Dict)
+            assert isinstance(result["document"]["text"], str)
     assert isinstance(response.meta, expected_response_shape["meta"])

     if custom_llm_provider == "cohere":

@@ -364,17 +372,15 @@ def test_rerank_response_assertions():
         **{
             "id": "ab0fcca0-b617-11ef-b292-0242ac110002",
             "results": [
-                {"index": 2, "relevance_score": 0.9958819150924683, "document": None},
-                {"index": 0, "relevance_score": 0.001293411129154265, "document": None},
+                {"index": 2, "relevance_score": 0.9958819150924683},
+                {"index": 0, "relevance_score": 0.001293411129154265},
                 {
                     "index": 1,
                     "relevance_score": 7.641685078851879e-05,
-                    "document": None,
                 },
                 {
                     "index": 3,
                     "relevance_score": 7.621097756782547e-05,
-                    "document": None,
                 },
             ],
             "meta": {

@@ -387,3 +393,19 @@ def test_rerank_response_assertions():
     )

     assert_response_shape(r, custom_llm_provider="custom")
+
+
+@pytest.mark.flaky(retries=3, delay=1)
+def test_rerank_cohere_api():
+    response = litellm.rerank(
+        model="cohere/rerank-english-v3.0",
+        query="hello",
+        documents=["hello", "world"],
+        return_documents=True,
+        top_n=3,
+    )
+    print("rerank response", response)
+    assert response.results[0]["document"] is not None
+    assert response.results[0]["document"]["text"] is not None
+    assert response.results[0]["document"]["text"] == "hello"
+    assert response.results[1]["document"]["text"] == "world"
@@ -2775,6 +2775,8 @@ def test_bedrock_cost_calc_with_region():
     os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
     litellm.model_cost = litellm.get_model_cost_map(url="")

+    litellm.add_known_models()
+
     hidden_params = {
         "custom_llm_provider": "bedrock",
         "region_name": "us-east-1",
@@ -961,6 +961,7 @@ async def test_gemini_embeddings(sync_mode, input):

 @pytest.mark.parametrize("sync_mode", [True, False])
 @pytest.mark.asyncio
+@pytest.mark.flaky(retries=3, delay=1)
 async def test_hf_embedddings_with_optional_params(sync_mode):
     litellm.set_verbose = True