(fix) litellm.text_completion raises a non-blocking error on simple usage (#6546)

* unit test test_huggingface_text_completion_logprobs

* fix return TextCompletionHandler convert_chat_to_text_completion

* fix hf rest api

* fix test_huggingface_text_completion_logprobs

* fix linting errors

* fix import LiteLLMResponseObjectHandler

* fix test for LiteLLMResponseObjectHandler

* fix test text completion
Ishaan Jaff 2024-11-05 05:17:48 +05:30 committed by GitHub
parent 67ddf55ebd
commit 58ce30acee
6 changed files with 374 additions and 111 deletions
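
For context, a minimal sketch of the "simple usage" referenced in the title (my reading of the diff; the model name is illustrative). Before this change, the HF-only logprob transform ran for every provider and could log "LiteLLM non blocking exception" even on a call like this:

import litellm

# A plain text-completion call over a chat model. Previously this path went
# through litellm.utils.transform_logprobs and logged a non-blocking exception;
# after this change the transform only runs for the huggingface provider.
response = litellm.text_completion(
    model="gpt-3.5-turbo",  # illustrative model name
    prompt="Hello, world",
)
print(response.choices[0].text)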


@@ -14,11 +14,17 @@ from litellm.types.utils import (
Delta,
EmbeddingResponse,
Function,
HiddenParams,
ImageResponse,
)
from litellm.types.utils import Logprobs as TextCompletionLogprobs
from litellm.types.utils import (
Message,
ModelResponse,
RerankResponse,
StreamingChoices,
TextChoices,
TextCompletionResponse,
TranscriptionResponse,
Usage,
)
@@ -235,6 +241,77 @@ class LiteLLMResponseObjectHandler:
model_response_object = ImageResponse(**model_response_dict)
return model_response_object
@staticmethod
def convert_chat_to_text_completion(
response: ModelResponse,
text_completion_response: TextCompletionResponse,
custom_llm_provider: Optional[str] = None,
) -> TextCompletionResponse:
"""
Converts a chat completion response to a text completion response format.
Note: This is used for Hugging Face. For OpenAI / Azure text completion, the provider files directly return a TextCompletionResponse, which is then sent to the user.
Args:
response (ModelResponse): The chat completion response to convert
Returns:
TextCompletionResponse: The converted text completion response
Example:
chat_response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi"}])
text_response = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(response=chat_response, text_completion_response=TextCompletionResponse())
"""
transformed_logprobs = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs(
response=response,
custom_llm_provider=custom_llm_provider,
)
text_completion_response["id"] = response.get("id", None)
text_completion_response["object"] = "text_completion"
text_completion_response["created"] = response.get("created", None)
text_completion_response["model"] = response.get("model", None)
choices_list: List[TextChoices] = []
# Convert each choice to TextChoices
for choice in response["choices"]:
text_choices = TextChoices()
text_choices["text"] = choice["message"]["content"]
text_choices["index"] = choice["index"]
text_choices["logprobs"] = transformed_logprobs
text_choices["finish_reason"] = choice["finish_reason"]
choices_list.append(text_choices)
text_completion_response["choices"] = choices_list
text_completion_response["usage"] = response.get("usage", None)
text_completion_response._hidden_params = HiddenParams(
**response._hidden_params
)
return text_completion_response
@staticmethod
def _convert_provider_response_logprobs_to_text_completion_logprobs(
response: ModelResponse,
custom_llm_provider: Optional[str] = None,
) -> Optional[TextCompletionLogprobs]:
"""
Convert logprobs from provider to OpenAI.Completion() format
Only supported for HF TGI models
"""
transformed_logprobs: Optional[TextCompletionLogprobs] = None
if custom_llm_provider == "huggingface":
# only supported for TGI models
try:
raw_response = response._hidden_params.get("original_response", None)
transformed_logprobs = litellm.huggingface._transform_logprobs(
hf_response=raw_response
)
except Exception as e:
verbose_logger.exception(f"LiteLLM non blocking exception: {e}")
return transformed_logprobs
def convert_to_model_response_object( # noqa: PLR0915
response_object: Optional[dict] = None,
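
A hedged usage sketch of the new handler (not part of the diff); it mirrors the new unit test and the call sites added in main.py further below:

from litellm.types.utils import ModelResponse, TextCompletionResponse
from litellm.utils import LiteLLMResponseObjectHandler

# Build a chat-style ModelResponse (mirrors the unit test below), then convert
# it into the text-completion shape.
chat_response = ModelResponse(
    id="chat123",
    created=1234567890,
    model="gpt-3.5-turbo",
    choices=[
        {"index": 0, "message": {"content": "Hello, world!"}, "finish_reason": "stop"}
    ],
    usage={"total_tokens": 10, "completion_tokens": 10},
    _hidden_params={"api_key": "test"},
)
result = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
    response=chat_response,
    text_completion_response=TextCompletionResponse(),
)
print(result.choices[0].text)  # "Hello, world!"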


@@ -15,6 +15,7 @@ import litellm
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
from litellm.secret_managers.main import get_secret_str
from litellm.types.completion import ChatCompletionMessageToolCallParam
from litellm.types.utils import Logprobs as TextCompletionLogprobs
from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage
from .base import BaseLLM
@@ -1183,3 +1184,73 @@ class Huggingface(BaseLLM):
input=input,
encoding=encoding,
)
def _transform_logprobs(
self, hf_response: Optional[List]
) -> Optional[TextCompletionLogprobs]:
"""
Transform Hugging Face logprobs to OpenAI.Completion() format
"""
if hf_response is None:
return None
# Initialize an empty list for the transformed logprobs
_logprob: TextCompletionLogprobs = TextCompletionLogprobs(
text_offset=[],
token_logprobs=[],
tokens=[],
top_logprobs=[],
)
# For each Hugging Face response, transform the logprobs
for response in hf_response:
# Extract the relevant information from the response
response_details = response["details"]
top_tokens = response_details.get("top_tokens", {})
for i, token in enumerate(response_details["prefill"]):
# Extract the text of the token
token_text = token["text"]
# Extract the logprob of the token
token_logprob = token["logprob"]
# Add the token information to the 'token_info' list
_logprob.tokens.append(token_text)
_logprob.token_logprobs.append(token_logprob)
# stub this to work with llm eval harness
top_alt_tokens = {"": -1.0, "": -2.0, "": -3.0} # noqa: F601
_logprob.top_logprobs.append(top_alt_tokens)
# For each element in the 'tokens' list, extract the relevant information
for i, token in enumerate(response_details["tokens"]):
# Extract the text of the token
token_text = token["text"]
# Extract the logprob of the token
token_logprob = token["logprob"]
top_alt_tokens = {}
temp_top_logprobs = []
if top_tokens != {}:
temp_top_logprobs = top_tokens[i]
# top_alt_tokens should look like this: { "alternative_1": -1, "alternative_2": -2, "alternative_3": -3 }
for elem in temp_top_logprobs:
text = elem["text"]
logprob = elem["logprob"]
top_alt_tokens[text] = logprob
# Add the token information to the 'token_info' list
_logprob.tokens.append(token_text)
_logprob.token_logprobs.append(token_logprob)
_logprob.top_logprobs.append(top_alt_tokens)
# Add the text offset of the token
# This is computed as the sum of the lengths of all previous tokens
_logprob.text_offset.append(
sum(len(t["text"]) for t in response_details["tokens"][:i])
)
return _logprob
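
A hedged sketch of the payload this transform consumes and the fields it produces, based on the mocked TGI response in the Hugging Face test further below:

import litellm

# Minimal TGI-style response: no prefill tokens and no top_tokens, so each
# entry of top_logprobs stays an empty dict.
hf_response = [
    {
        "details": {
            "prefill": [],
            "tokens": [
                {"text": ",", "logprob": -1.7626953},
                {"text": "\n", "logprob": -1.7314453},
            ],
        }
    }
]
logprobs = litellm.huggingface._transform_logprobs(hf_response=hf_response)
# OpenAI.Completion()-style output:
#   logprobs.tokens         == [",", "\n"]
#   logprobs.token_logprobs == [-1.7626953, -1.7314453]
#   logprobs.text_offset    == [0, 1]
#   logprobs.top_logprobs   == [{}, {}]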


@@ -3867,34 +3867,17 @@ async def atext_completion(
custom_llm_provider=custom_llm_provider,
)
else:
transformed_logprobs = None
# only supported for TGI models
try:
raw_response = response._hidden_params.get("original_response", None)
transformed_logprobs = litellm.utils.transform_logprobs(raw_response)
except Exception as e:
print_verbose(f"LiteLLM non blocking exception: {e}")
## TRANSLATE CHAT TO TEXT FORMAT ##
## OpenAI / Azure Text Completion Returns here
if isinstance(response, TextCompletionResponse):
return response
elif asyncio.iscoroutine(response):
response = await response
text_completion_response = TextCompletionResponse()
text_completion_response["id"] = response.get("id", None)
text_completion_response["object"] = "text_completion"
text_completion_response["created"] = response.get("created", None)
text_completion_response["model"] = response.get("model", None)
text_choices = TextChoices()
text_choices["text"] = response["choices"][0]["message"]["content"]
text_choices["index"] = response["choices"][0]["index"]
text_choices["logprobs"] = transformed_logprobs
text_choices["finish_reason"] = response["choices"][0]["finish_reason"]
text_completion_response["choices"] = [text_choices]
text_completion_response["usage"] = response.get("usage", None)
text_completion_response._hidden_params = HiddenParams(
**response._hidden_params
text_completion_response = litellm.utils.LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
text_completion_response=text_completion_response,
response=response,
custom_llm_provider=custom_llm_provider,
)
return text_completion_response
except Exception as e:
@@ -4156,29 +4139,17 @@ def text_completion( # noqa: PLR0915
return response
elif isinstance(response, TextCompletionStreamWrapper):
return response
transformed_logprobs = None
# only supported for TGI models
try:
raw_response = response._hidden_params.get("original_response", None)
transformed_logprobs = litellm.utils.transform_logprobs(raw_response)
except Exception as e:
verbose_logger.exception(f"LiteLLM non blocking exception: {e}")
# OpenAI Text / Azure Text will return here
if isinstance(response, TextCompletionResponse):
return response
text_completion_response["id"] = response.get("id", None)
text_completion_response["object"] = "text_completion"
text_completion_response["created"] = response.get("created", None)
text_completion_response["model"] = response.get("model", None)
text_choices = TextChoices()
text_choices["text"] = response["choices"][0]["message"]["content"]
text_choices["index"] = response["choices"][0]["index"]
text_choices["logprobs"] = transformed_logprobs
text_choices["finish_reason"] = response["choices"][0]["finish_reason"]
text_completion_response["choices"] = [text_choices]
text_completion_response["usage"] = response.get("usage", None)
text_completion_response._hidden_params = HiddenParams(**response._hidden_params)
text_completion_response = (
litellm.utils.LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
response=response,
text_completion_response=text_completion_response,
)
)
return text_completion_response


@@ -71,6 +71,7 @@ from litellm.litellm_core_utils.get_llm_provider_logic import (
)
from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
LiteLLMResponseObjectHandler,
_handle_invalid_parallel_tool_calls,
convert_to_model_response_object,
convert_to_streaming_response,
@@ -8388,76 +8389,6 @@ def get_valid_models() -> List[str]:
return [] # NON-Blocking
# used for litellm.text_completion() to transform HF logprobs to OpenAI.Completion() format
def transform_logprobs(hf_response):
# Initialize an empty list for the transformed logprobs
transformed_logprobs = []
# For each Hugging Face response, transform the logprobs
for response in hf_response:
# Extract the relevant information from the response
response_details = response["details"]
top_tokens = response_details.get("top_tokens", {})
# Initialize an empty list for the token information
token_info = {
"tokens": [],
"token_logprobs": [],
"text_offset": [],
"top_logprobs": [],
}
for i, token in enumerate(response_details["prefill"]):
# Extract the text of the token
token_text = token["text"]
# Extract the logprob of the token
token_logprob = token["logprob"]
# Add the token information to the 'token_info' list
token_info["tokens"].append(token_text)
token_info["token_logprobs"].append(token_logprob)
# stub this to work with llm eval harness
top_alt_tokens = {"": -1, "": -2, "": -3} # noqa: F601
token_info["top_logprobs"].append(top_alt_tokens)
# For each element in the 'tokens' list, extract the relevant information
for i, token in enumerate(response_details["tokens"]):
# Extract the text of the token
token_text = token["text"]
# Extract the logprob of the token
token_logprob = token["logprob"]
top_alt_tokens = {}
temp_top_logprobs = []
if top_tokens != {}:
temp_top_logprobs = top_tokens[i]
# top_alt_tokens should look like this: { "alternative_1": -1, "alternative_2": -2, "alternative_3": -3 }
for elem in temp_top_logprobs:
text = elem["text"]
logprob = elem["logprob"]
top_alt_tokens[text] = logprob
# Add the token information to the 'token_info' list
token_info["tokens"].append(token_text)
token_info["token_logprobs"].append(token_logprob)
token_info["top_logprobs"].append(top_alt_tokens)
# Add the text offset of the token
# This is computed as the sum of the lengths of all previous tokens
token_info["text_offset"].append(
sum(len(t["text"]) for t in response_details["tokens"][:i])
)
# Add the 'token_info' list to the 'transformed_logprobs' list
transformed_logprobs = token_info
return transformed_logprobs
def print_args_passed_to_litellm(original_function, args, kwargs):
try:
# we've already printed this for acompletion, don't print for completion


@@ -0,0 +1,141 @@
import json
import os
import sys
from datetime import datetime
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
import pytest
from litellm.utils import (
LiteLLMResponseObjectHandler,
)
from datetime import timedelta
from litellm.types.utils import (
ModelResponse,
TextCompletionResponse,
TextChoices,
Logprobs as TextCompletionLogprobs,
Usage,
)
def test_convert_chat_to_text_completion():
"""Test converting chat completion to text completion"""
chat_response = ModelResponse(
id="chat123",
created=1234567890,
model="gpt-3.5-turbo",
choices=[
{
"index": 0,
"message": {"content": "Hello, world!"},
"finish_reason": "stop",
}
],
usage={"total_tokens": 10, "completion_tokens": 10},
_hidden_params={"api_key": "test"},
)
text_completion = TextCompletionResponse()
result = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
response=chat_response, text_completion_response=text_completion
)
assert isinstance(result, TextCompletionResponse)
assert result.id == "chat123"
assert result.object == "text_completion"
assert result.created == 1234567890
assert result.model == "gpt-3.5-turbo"
assert result.choices[0].text == "Hello, world!"
assert result.choices[0].finish_reason == "stop"
assert result.usage == Usage(
completion_tokens=10,
prompt_tokens=0,
total_tokens=10,
completion_tokens_details=None,
prompt_tokens_details=None,
)
def test_convert_provider_response_logprobs():
"""Test converting provider logprobs to text completion logprobs"""
response = ModelResponse(
id="test123",
_hidden_params={
"original_response": {
"details": {"tokens": [{"text": "hello", "logprob": -1.0}]}
}
},
)
result = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs(
response=response, custom_llm_provider="huggingface"
)
# Note: The actual assertion here depends on the implementation of
# litellm.huggingface._transform_logprobs, but we can at least test the function call
assert (
result is not None or result is None
) # Will depend on the actual implementation
def test_convert_provider_response_logprobs_non_huggingface():
"""Test converting provider logprobs for non-huggingface provider"""
response = ModelResponse(id="test123", _hidden_params={})
result = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs(
response=response, custom_llm_provider="openai"
)
assert result is None
def test_convert_chat_to_text_completion_multiple_choices():
"""Test converting chat completion to text completion with multiple choices"""
chat_response = ModelResponse(
id="chat456",
created=1234567890,
model="gpt-3.5-turbo",
choices=[
{
"index": 0,
"message": {"content": "First response"},
"finish_reason": "stop",
},
{
"index": 1,
"message": {"content": "Second response"},
"finish_reason": "length",
},
],
usage={"total_tokens": 20},
_hidden_params={"api_key": "test"},
)
text_completion = TextCompletionResponse()
result = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
response=chat_response, text_completion_response=text_completion
)
assert isinstance(result, TextCompletionResponse)
assert result.id == "chat456"
assert result.object == "text_completion"
assert len(result.choices) == 2
assert result.choices[0].text == "First response"
assert result.choices[0].finish_reason == "stop"
assert result.choices[1].text == "Second response"
assert result.choices[1].finish_reason == "length"
assert result.usage == Usage(
completion_tokens=0,
prompt_tokens=0,
total_tokens=20,
completion_tokens_details=None,
prompt_tokens_details=None,
)


@@ -3,11 +3,15 @@ import os
import sys
from datetime import datetime
from unittest.mock import AsyncMock
import pytest
import httpx
from respx import MockRouter
sys.path.insert(
0, os.path.abspath("../..")
) # Adds the parent directory to the system path
import litellm
from litellm.types.utils import TextCompletionResponse
@@ -62,3 +66,71 @@ def test_convert_dict_to_text_completion_response():
assert response.choices[0].logprobs.token_logprobs == [None, -12.203847]
assert response.choices[0].logprobs.tokens == ["hello", " crisp"]
assert response.choices[0].logprobs.top_logprobs == [None, {",": -2.1568563}]
@pytest.mark.asyncio
@pytest.mark.respx
async def test_huggingface_text_completion_logprobs(respx_mock: MockRouter):
"""Test text completion with Hugging Face, focusing on logprobs structure"""
litellm.set_verbose = True
# Mock the raw response from Hugging Face
mock_response = [
{
"generated_text": ",\n\nI have a question...", # truncated for brevity
"details": {
"finish_reason": "length",
"generated_tokens": 100,
"seed": None,
"prefill": [],
"tokens": [
{"id": 28725, "text": ",", "logprob": -1.7626953, "special": False},
{"id": 13, "text": "\n", "logprob": -1.7314453, "special": False},
],
},
}
]
# Mock the API request
mock_request = respx_mock.post(
"https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1"
).mock(return_value=httpx.Response(200, json=mock_response))
response = await litellm.atext_completion(
model="huggingface/mistralai/Mistral-7B-v0.1",
prompt="good morning",
)
# Verify the request
assert mock_request.called
request_body = json.loads(mock_request.calls[0].request.content)
assert request_body == {
"inputs": "good morning",
"parameters": {"details": True, "return_full_text": False},
"stream": False,
}
print("response=", response)
# Verify response structure
assert isinstance(response, TextCompletionResponse)
assert response.object == "text_completion"
assert response.model == "mistralai/Mistral-7B-v0.1"
# Verify logprobs structure
choice = response.choices[0]
assert choice.finish_reason == "length"
assert choice.index == 0
assert isinstance(choice.logprobs.tokens, list)
assert isinstance(choice.logprobs.token_logprobs, list)
assert isinstance(choice.logprobs.text_offset, list)
assert isinstance(choice.logprobs.top_logprobs, list)
assert choice.logprobs.tokens == [",", "\n"]
assert choice.logprobs.token_logprobs == [-1.7626953, -1.7314453]
assert choice.logprobs.text_offset == [0, 1]
assert choice.logprobs.top_logprobs == [{}, {}]
# Verify usage
assert response.usage["completion_tokens"] > 0
assert response.usage["prompt_tokens"] > 0
assert response.usage["total_tokens"] > 0