diff --git a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
index 76077ad46..93926a81f 100644
--- a/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
+++ b/litellm/litellm_core_utils/llm_response_utils/convert_dict_to_response.py
@@ -14,11 +14,17 @@ from litellm.types.utils import (
     Delta,
     EmbeddingResponse,
     Function,
+    HiddenParams,
     ImageResponse,
+)
+from litellm.types.utils import Logprobs as TextCompletionLogprobs
+from litellm.types.utils import (
     Message,
     ModelResponse,
     RerankResponse,
     StreamingChoices,
+    TextChoices,
+    TextCompletionResponse,
     TranscriptionResponse,
     Usage,
 )
@@ -235,6 +241,77 @@ class LiteLLMResponseObjectHandler:

         model_response_object = ImageResponse(**model_response_dict)
         return model_response_object
+
+    @staticmethod
+    def convert_chat_to_text_completion(
+        response: ModelResponse,
+        text_completion_response: TextCompletionResponse,
+        custom_llm_provider: Optional[str] = None,
+    ) -> TextCompletionResponse:
+        """
+        Converts a chat completion response to a text completion response format.
+
+        Note: This is used for Hugging Face models. For OpenAI / Azure text completion,
+        the provider files directly return a TextCompletionResponse, which we then send to the user.
+
+        Args:
+            response (ModelResponse): The chat completion response to convert
+
+        Returns:
+            TextCompletionResponse: The converted text completion response
+
+        Example:
+            chat_response = completion(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hi"}])
+            text_response = convert_chat_to_text_completion(chat_response)
+        """
+        transformed_logprobs = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs(
+            response=response,
+            custom_llm_provider=custom_llm_provider,
+        )
+
+        text_completion_response["id"] = response.get("id", None)
+        text_completion_response["object"] = "text_completion"
+        text_completion_response["created"] = response.get("created", None)
+        text_completion_response["model"] = response.get("model", None)
+        choices_list: List[TextChoices] = []
+
+        # Convert each choice to TextChoices
+        for choice in response["choices"]:
+            text_choices = TextChoices()
+            text_choices["text"] = choice["message"]["content"]
+            text_choices["index"] = choice["index"]
+            text_choices["logprobs"] = transformed_logprobs
+            text_choices["finish_reason"] = choice["finish_reason"]
+            choices_list.append(text_choices)
+
+        text_completion_response["choices"] = choices_list
+        text_completion_response["usage"] = response.get("usage", None)
+        text_completion_response._hidden_params = HiddenParams(
+            **response._hidden_params
+        )
+        return text_completion_response
+
+    @staticmethod
+    def _convert_provider_response_logprobs_to_text_completion_logprobs(
+        response: ModelResponse,
+        custom_llm_provider: Optional[str] = None,
+    ) -> Optional[TextCompletionLogprobs]:
+        """
+        Convert logprobs from the provider to the OpenAI.Completion() format.
+
+        Only supported for HF TGI models.
+        """
+        transformed_logprobs: Optional[TextCompletionLogprobs] = None
+        if custom_llm_provider == "huggingface":
+            # only supported for TGI models
+            try:
+                raw_response = response._hidden_params.get("original_response", None)
+                transformed_logprobs = litellm.huggingface._transform_logprobs(
+                    hf_response=raw_response
+                )
+            except Exception as e:
+                verbose_logger.exception(f"LiteLLM non blocking exception: {e}")
+
+        return transformed_logprobs
+

 def convert_to_model_response_object(  # noqa: PLR0915
     response_object: Optional[dict] = None,
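A minimal usage sketch of the new convert_chat_to_text_completion helper, adapted from the unit tests added later in this patch (the id, model name, and message content are illustrative values only):

from litellm.types.utils import ModelResponse, TextCompletionResponse
from litellm.utils import LiteLLMResponseObjectHandler

# Build a chat-style response by hand, using the same shape as the new tests.
chat_response = ModelResponse(
    id="chat123",
    created=1234567890,
    model="gpt-3.5-turbo",
    choices=[
        {
            "index": 0,
            "message": {"content": "Hello, world!"},
            "finish_reason": "stop",
        }
    ],
    usage={"total_tokens": 10, "completion_tokens": 10},
)

# Convert it into an OpenAI-style text completion object.
text_response = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
    response=chat_response,
    text_completion_response=TextCompletionResponse(),
)

assert text_response.object == "text_completion"
assert text_response.choices[0].text == "Hello, world!"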
diff --git a/litellm/llms/huggingface_restapi.py b/litellm/llms/huggingface_restapi.py
index 67db83ba2..907d72a60 100644
--- a/litellm/llms/huggingface_restapi.py
+++ b/litellm/llms/huggingface_restapi.py
@@ -15,6 +15,7 @@ import litellm
 from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
 from litellm.secret_managers.main import get_secret_str
 from litellm.types.completion import ChatCompletionMessageToolCallParam
+from litellm.types.utils import Logprobs as TextCompletionLogprobs
 from litellm.utils import Choices, CustomStreamWrapper, Message, ModelResponse, Usage

 from .base import BaseLLM
@@ -1183,3 +1184,73 @@ class Huggingface(BaseLLM):
             input=input,
             encoding=encoding,
         )
+
+    def _transform_logprobs(
+        self, hf_response: Optional[List]
+    ) -> Optional[TextCompletionLogprobs]:
+        """
+        Transform Hugging Face logprobs to OpenAI.Completion() format
+        """
+        if hf_response is None:
+            return None
+
+        # Initialize an empty TextCompletionLogprobs object for the transformed logprobs
+        _logprob: TextCompletionLogprobs = TextCompletionLogprobs(
+            text_offset=[],
+            token_logprobs=[],
+            tokens=[],
+            top_logprobs=[],
+        )
+
+        # For each Hugging Face response, transform the logprobs
+        for response in hf_response:
+            # Extract the relevant information from the response
+            response_details = response["details"]
+            top_tokens = response_details.get("top_tokens", {})
+
+            for i, token in enumerate(response_details["prefill"]):
+                # Extract the text of the token
+                token_text = token["text"]
+
+                # Extract the logprob of the token
+                token_logprob = token["logprob"]
+
+                # Add the token information to the logprobs object
+                _logprob.tokens.append(token_text)
+                _logprob.token_logprobs.append(token_logprob)
+
+                # stub this to work with llm eval harness
+                top_alt_tokens = {"": -1.0, "": -2.0, "": -3.0}  # noqa: F601
+                _logprob.top_logprobs.append(top_alt_tokens)
+
+            # For each element in the 'tokens' list, extract the relevant information
+            for i, token in enumerate(response_details["tokens"]):
+                # Extract the text of the token
+                token_text = token["text"]
+
+                # Extract the logprob of the token
+                token_logprob = token["logprob"]
+
+                top_alt_tokens = {}
+                temp_top_logprobs = []
+                if top_tokens != {}:
+                    temp_top_logprobs = top_tokens[i]
+
+                # top_alt_tokens should look like this: { "alternative_1": -1, "alternative_2": -2, "alternative_3": -3 }
+                for elem in temp_top_logprobs:
+                    text = elem["text"]
+                    logprob = elem["logprob"]
+                    top_alt_tokens[text] = logprob
+
+                # Add the token information to the logprobs object
+                _logprob.tokens.append(token_text)
+                _logprob.token_logprobs.append(token_logprob)
+                _logprob.top_logprobs.append(top_alt_tokens)
+
+                # Add the text offset of the token
+                # This is computed as the sum of the lengths of all previous tokens
+                _logprob.text_offset.append(
+                    sum(len(t["text"]) for t in response_details["tokens"][:i])
+                )
+
+        return _logprob
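The input and output shapes of _transform_logprobs can be sketched from the TGI-style "details" payload mocked in the unit tests below; the payload here is illustrative, not an exhaustive TGI schema:

import litellm

# Illustrative TGI "details" payload (mirrors the mocked response in
# tests/llm_translation/test_text_completion_unit_tests.py).
hf_response = [
    {
        "details": {
            "prefill": [],
            "tokens": [
                {"text": ",", "logprob": -1.7626953},
                {"text": "\n", "logprob": -1.7314453},
            ],
        }
    }
]

logprobs = litellm.huggingface._transform_logprobs(hf_response=hf_response)

# Expected, per the assertions in the new tests:
#   logprobs.tokens         == [",", "\n"]
#   logprobs.token_logprobs == [-1.7626953, -1.7314453]
#   logprobs.text_offset    == [0, 1]
#   logprobs.top_logprobs   == [{}, {}]  (no "top_tokens" key in the payload)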
diff --git a/litellm/main.py b/litellm/main.py
index a964ba7e6..2f3a2ea2b 100644
--- a/litellm/main.py
+++ b/litellm/main.py
@@ -3867,34 +3867,17 @@ async def atext_completion(
                 custom_llm_provider=custom_llm_provider,
             )
         else:
-            transformed_logprobs = None
-            # only supported for TGI models
-            try:
-                raw_response = response._hidden_params.get("original_response", None)
-                transformed_logprobs = litellm.utils.transform_logprobs(raw_response)
-            except Exception as e:
-                print_verbose(f"LiteLLM non blocking exception: {e}")
-
-            ## TRANSLATE CHAT TO TEXT FORMAT ##
+            ## OpenAI / Azure Text Completion Returns here
             if isinstance(response, TextCompletionResponse):
                 return response
             elif asyncio.iscoroutine(response):
                 response = await response

             text_completion_response = TextCompletionResponse()
-            text_completion_response["id"] = response.get("id", None)
-            text_completion_response["object"] = "text_completion"
-            text_completion_response["created"] = response.get("created", None)
-            text_completion_response["model"] = response.get("model", None)
-            text_choices = TextChoices()
-            text_choices["text"] = response["choices"][0]["message"]["content"]
-            text_choices["index"] = response["choices"][0]["index"]
-            text_choices["logprobs"] = transformed_logprobs
-            text_choices["finish_reason"] = response["choices"][0]["finish_reason"]
-            text_completion_response["choices"] = [text_choices]
-            text_completion_response["usage"] = response.get("usage", None)
-            text_completion_response._hidden_params = HiddenParams(
-                **response._hidden_params
+            text_completion_response = litellm.utils.LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
+                text_completion_response=text_completion_response,
+                response=response,
+                custom_llm_provider=custom_llm_provider,
             )
             return text_completion_response
     except Exception as e:
@@ -4156,29 +4139,17 @@ def text_completion(  # noqa: PLR0915
         return response
     elif isinstance(response, TextCompletionStreamWrapper):
         return response
-    transformed_logprobs = None
-    # only supported for TGI models
-    try:
-        raw_response = response._hidden_params.get("original_response", None)
-        transformed_logprobs = litellm.utils.transform_logprobs(raw_response)
-    except Exception as e:
-        verbose_logger.exception(f"LiteLLM non blocking exception: {e}")

+    # OpenAI Text / Azure Text will return here
     if isinstance(response, TextCompletionResponse):
         return response

-    text_completion_response["id"] = response.get("id", None)
-    text_completion_response["object"] = "text_completion"
-    text_completion_response["created"] = response.get("created", None)
-    text_completion_response["model"] = response.get("model", None)
-    text_choices = TextChoices()
-    text_choices["text"] = response["choices"][0]["message"]["content"]
-    text_choices["index"] = response["choices"][0]["index"]
-    text_choices["logprobs"] = transformed_logprobs
-    text_choices["finish_reason"] = response["choices"][0]["finish_reason"]
-    text_completion_response["choices"] = [text_choices]
-    text_completion_response["usage"] = response.get("usage", None)
-    text_completion_response._hidden_params = HiddenParams(**response._hidden_params)
+    text_completion_response = (
+        litellm.utils.LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
+            response=response,
+            text_completion_response=text_completion_response,
+        )
+    )

     return text_completion_response
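Both the async and sync paths above now delegate to the same handler. A rough end-to-end check, assuming litellm's mock_response parameter to avoid a live provider call, might look like this:

import litellm

# mock_response is litellm's built-in mocking; no API key or network call needed.
resp = litellm.text_completion(
    model="gpt-3.5-turbo",
    prompt="Hi",
    mock_response="Hello!",
)

# The chat-style provider response is converted through
# LiteLLMResponseObjectHandler.convert_chat_to_text_completion.
assert resp.object == "text_completion"
assert resp.choices[0].text == "Hello!"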
diff --git a/litellm/utils.py b/litellm/utils.py
index 8bd001def..0f7ff50a0 100644
--- a/litellm/utils.py
+++ b/litellm/utils.py
@@ -71,6 +71,7 @@ from litellm.litellm_core_utils.get_llm_provider_logic import (
 )
 from litellm.litellm_core_utils.llm_request_utils import _ensure_extra_body_is_safe
 from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
+    LiteLLMResponseObjectHandler,
     _handle_invalid_parallel_tool_calls,
     convert_to_model_response_object,
     convert_to_streaming_response,
@@ -8388,76 +8389,6 @@ def get_valid_models() -> List[str]:
         return []  # NON-Blocking

-# used for litellm.text_completion() to transform HF logprobs to OpenAI.Completion() format
-def transform_logprobs(hf_response):
-    # Initialize an empty list for the transformed logprobs
-    transformed_logprobs = []
-
-    # For each Hugging Face response, transform the logprobs
-    for response in hf_response:
-        # Extract the relevant information from the response
-        response_details = response["details"]
-        top_tokens = response_details.get("top_tokens", {})
-
-        # Initialize an empty list for the token information
-        token_info = {
-            "tokens": [],
-            "token_logprobs": [],
-            "text_offset": [],
-            "top_logprobs": [],
-        }
-
-        for i, token in enumerate(response_details["prefill"]):
-            # Extract the text of the token
-            token_text = token["text"]
-
-            # Extract the logprob of the token
-            token_logprob = token["logprob"]
-
-            # Add the token information to the 'token_info' list
-            token_info["tokens"].append(token_text)
-            token_info["token_logprobs"].append(token_logprob)
-
-            # stub this to work with llm eval harness
-            top_alt_tokens = {"": -1, "": -2, "": -3}  # noqa: F601
-            token_info["top_logprobs"].append(top_alt_tokens)
-
-        # For each element in the 'tokens' list, extract the relevant information
-        for i, token in enumerate(response_details["tokens"]):
-            # Extract the text of the token
-            token_text = token["text"]
-
-            # Extract the logprob of the token
-            token_logprob = token["logprob"]
-
-            top_alt_tokens = {}
-            temp_top_logprobs = []
-            if top_tokens != {}:
-                temp_top_logprobs = top_tokens[i]
-
-            # top_alt_tokens should look like this: { "alternative_1": -1, "alternative_2": -2, "alternative_3": -3 }
-            for elem in temp_top_logprobs:
-                text = elem["text"]
-                logprob = elem["logprob"]
-                top_alt_tokens[text] = logprob
-
-            # Add the token information to the 'token_info' list
-            token_info["tokens"].append(token_text)
-            token_info["token_logprobs"].append(token_logprob)
-            token_info["top_logprobs"].append(top_alt_tokens)
-
-            # Add the text offset of the token
-            # This is computed as the sum of the lengths of all previous tokens
-            token_info["text_offset"].append(
-                sum(len(t["text"]) for t in response_details["tokens"][:i])
-            )
-
-        # Add the 'token_info' list to the 'transformed_logprobs' list
-        transformed_logprobs = token_info
-
-    return transformed_logprobs
-
-
 def print_args_passed_to_litellm(original_function, args, kwargs):
     try:
         # we've already printed this for acompletion, don't print for completion
diff --git a/tests/llm_translation/test_text_completion.py b/tests/llm_translation/test_text_completion.py
new file mode 100644
index 000000000..50c96e6eb
--- /dev/null
+++ b/tests/llm_translation/test_text_completion.py
@@ -0,0 +1,141 @@
+import json
+import os
+import sys
+from datetime import datetime
+
+sys.path.insert(
+    0, os.path.abspath("../..")
+)  # Adds the parent directory to the system path
+
+import litellm
+import pytest
+
+from litellm.utils import (
+    LiteLLMResponseObjectHandler,
+)
+
+
+from datetime import timedelta
+
+from litellm.types.utils import (
+    ModelResponse,
+    TextCompletionResponse,
+    TextChoices,
+    Logprobs as TextCompletionLogprobs,
+    Usage,
+)
+
+
+def test_convert_chat_to_text_completion():
+    """Test converting chat completion to text completion"""
+    chat_response = ModelResponse(
+        id="chat123",
+        created=1234567890,
+        model="gpt-3.5-turbo",
+        choices=[
+            {
+                "index": 0,
+                "message": {"content": "Hello, world!"},
+                "finish_reason": "stop",
+            }
+        ],
+        usage={"total_tokens": 10, "completion_tokens": 10},
+        _hidden_params={"api_key": "test"},
+    )
+
+    text_completion = TextCompletionResponse()
+    result = LiteLLMResponseObjectHandler.convert_chat_to_text_completion(
+        response=chat_response, text_completion_response=text_completion
+    )
+
+    assert isinstance(result, TextCompletionResponse)
+    assert result.id == "chat123"
"chat123" + assert result.object == "text_completion" + assert result.created == 1234567890 + assert result.model == "gpt-3.5-turbo" + assert result.choices[0].text == "Hello, world!" + assert result.choices[0].finish_reason == "stop" + assert result.usage == Usage( + completion_tokens=10, + prompt_tokens=0, + total_tokens=10, + completion_tokens_details=None, + prompt_tokens_details=None, + ) + + +def test_convert_provider_response_logprobs(): + """Test converting provider logprobs to text completion logprobs""" + response = ModelResponse( + id="test123", + _hidden_params={ + "original_response": { + "details": {"tokens": [{"text": "hello", "logprob": -1.0}]} + } + }, + ) + + result = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs( + response=response, custom_llm_provider="huggingface" + ) + + # Note: The actual assertion here depends on the implementation of + # litellm.huggingface._transform_logprobs, but we can at least test the function call + assert ( + result is not None or result is None + ) # Will depend on the actual implementation + + +def test_convert_provider_response_logprobs_non_huggingface(): + """Test converting provider logprobs for non-huggingface provider""" + response = ModelResponse(id="test123", _hidden_params={}) + + result = LiteLLMResponseObjectHandler._convert_provider_response_logprobs_to_text_completion_logprobs( + response=response, custom_llm_provider="openai" + ) + + assert result is None + + +def test_convert_chat_to_text_completion_multiple_choices(): + """Test converting chat completion to text completion with multiple choices""" + chat_response = ModelResponse( + id="chat456", + created=1234567890, + model="gpt-3.5-turbo", + choices=[ + { + "index": 0, + "message": {"content": "First response"}, + "finish_reason": "stop", + }, + { + "index": 1, + "message": {"content": "Second response"}, + "finish_reason": "length", + }, + ], + usage={"total_tokens": 20}, + _hidden_params={"api_key": "test"}, + ) + + text_completion = TextCompletionResponse() + result = LiteLLMResponseObjectHandler.convert_chat_to_text_completion( + response=chat_response, text_completion_response=text_completion + ) + + assert isinstance(result, TextCompletionResponse) + assert result.id == "chat456" + assert result.object == "text_completion" + assert len(result.choices) == 2 + assert result.choices[0].text == "First response" + assert result.choices[0].finish_reason == "stop" + assert result.choices[1].text == "Second response" + assert result.choices[1].finish_reason == "length" + assert result.usage == Usage( + completion_tokens=0, + prompt_tokens=0, + total_tokens=20, + completion_tokens_details=None, + prompt_tokens_details=None, + ) diff --git a/tests/llm_translation/test_text_completion_unit_tests.py b/tests/llm_translation/test_text_completion_unit_tests.py index 2012ae11b..9d5359a4a 100644 --- a/tests/llm_translation/test_text_completion_unit_tests.py +++ b/tests/llm_translation/test_text_completion_unit_tests.py @@ -3,11 +3,15 @@ import os import sys from datetime import datetime from unittest.mock import AsyncMock +import pytest +import httpx +from respx import MockRouter sys.path.insert( 0, os.path.abspath("../..") ) # Adds the parent directory to the system path +import litellm from litellm.types.utils import TextCompletionResponse @@ -62,3 +66,71 @@ def test_convert_dict_to_text_completion_response(): assert response.choices[0].logprobs.token_logprobs == [None, -12.203847] assert response.choices[0].logprobs.tokens == 
["hello", " crisp"] assert response.choices[0].logprobs.top_logprobs == [None, {",": -2.1568563}] + + +@pytest.mark.asyncio +@pytest.mark.respx +async def test_huggingface_text_completion_logprobs(respx_mock: MockRouter): + """Test text completion with Hugging Face, focusing on logprobs structure""" + litellm.set_verbose = True + + # Mock the raw response from Hugging Face + mock_response = [ + { + "generated_text": ",\n\nI have a question...", # truncated for brevity + "details": { + "finish_reason": "length", + "generated_tokens": 100, + "seed": None, + "prefill": [], + "tokens": [ + {"id": 28725, "text": ",", "logprob": -1.7626953, "special": False}, + {"id": 13, "text": "\n", "logprob": -1.7314453, "special": False}, + ], + }, + } + ] + + # Mock the API request + mock_request = respx_mock.post( + "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-v0.1" + ).mock(return_value=httpx.Response(200, json=mock_response)) + + response = await litellm.atext_completion( + model="huggingface/mistralai/Mistral-7B-v0.1", + prompt="good morning", + ) + + # Verify the request + assert mock_request.called + request_body = json.loads(mock_request.calls[0].request.content) + assert request_body == { + "inputs": "good morning", + "parameters": {"details": True, "return_full_text": False}, + "stream": False, + } + + print("response=", response) + + # Verify response structure + assert isinstance(response, TextCompletionResponse) + assert response.object == "text_completion" + assert response.model == "mistralai/Mistral-7B-v0.1" + + # Verify logprobs structure + choice = response.choices[0] + assert choice.finish_reason == "length" + assert choice.index == 0 + assert isinstance(choice.logprobs.tokens, list) + assert isinstance(choice.logprobs.token_logprobs, list) + assert isinstance(choice.logprobs.text_offset, list) + assert isinstance(choice.logprobs.top_logprobs, list) + assert choice.logprobs.tokens == [",", "\n"] + assert choice.logprobs.token_logprobs == [-1.7626953, -1.7314453] + assert choice.logprobs.text_offset == [0, 1] + assert choice.logprobs.top_logprobs == [{}, {}] + + # Verify usage + assert response.usage["completion_tokens"] > 0 + assert response.usage["prompt_tokens"] > 0 + assert response.usage["total_tokens"] > 0