# litellm-mirror/tests/llm_translation/test_huggingface_chat_completion.py

"""
Test HuggingFace LLM
"""

from re import M
import httpx
from base_llm_unit_tests import BaseLLMChatTest
import json
import os
import sys
from unittest.mock import patch, MagicMock, AsyncMock

sys.path.insert(
    0, os.path.abspath("../..")
)  # Adds the parent directory to the system path
import litellm
import pytest
from litellm.types.utils import ModelResponseStream, ModelResponse
from respx import MockRouter
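

# Canned non-streaming chat.completion payload returned by the mocked HTTP handlers below.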
MOCK_COMPLETION_RESPONSE = {
    "id": "9115d3daeab10608",
    "object": "chat.completion",
    "created": 11111,
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "prompt": [],
    "choices": [
        {
            "finish_reason": "stop",
            "seed": 3629048360264764400,
            "logprobs": None,
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "This is a test response from the mocked HuggingFace API.",
                "tool_calls": [],
            },
        }
    ],
    "usage": {"prompt_tokens": 10, "completion_tokens": 20, "total_tokens": 30},
}
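
# SSE-style chunks replayed by the mocked handlers for streaming requests.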
MOCK_STREAMING_CHUNKS = [
    {
        "id": "id1", "object": "chat.completion.chunk", "created": 1111,
        "choices": [
            {"index": 0, "text": "Deep", "logprobs": None, "finish_reason": None, "seed": None,
             "delta": {"token_id": 34564, "role": "assistant", "content": "Deep", "tool_calls": None}}
        ],
        "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None,
    },
    {
        "id": "id2", "object": "chat.completion.chunk", "created": 1111,
        "choices": [
            {"index": 0, "text": " learning", "logprobs": None, "finish_reason": None, "seed": None,
             "delta": {"token_id": 6975, "role": "assistant", "content": " learning", "tool_calls": None}}
        ],
        "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None,
    },
    {
        "id": "id3", "object": "chat.completion.chunk", "created": 1111,
        "choices": [
            {"index": 0, "text": " is", "logprobs": None, "finish_reason": None, "seed": None,
             "delta": {"token_id": 374, "role": "assistant", "content": " is", "tool_calls": None}}
        ],
        "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo", "usage": None,
    },
    {
        "id": "sid4", "object": "chat.completion.chunk", "created": 1111,
        "choices": [
            {"index": 0, "text": " response", "logprobs": None, "finish_reason": "length", "seed": 2853637492034609700,
             "delta": {"token_id": 323, "role": "assistant", "content": " response", "tool_calls": None}}
        ],
        "model": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
        "usage": {"prompt_tokens": 26, "completion_tokens": 20, "total_tokens": 46},
    },
]
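
# Inference-provider mapping returned by the patched _fetch_inference_provider_mapping helper.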
PROVIDER_MAPPING_RESPONSE = {
    "fireworks-ai": {
        "status": "live",
        "providerId": "accounts/fireworks/models/llama-v3-8b-instruct",
        "task": "conversational",
    },
    "together": {
        "status": "live",
        "providerId": "meta-llama/Meta-Llama-3-8B-Instruct-Turbo",
        "task": "conversational",
    },
    "hf-inference": {
        "status": "live",
        "providerId": "meta-llama/Meta-Llama-3-8B-Instruct",
        "task": "conversational",
    },
}
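

# Patch the provider-mapping lookup used by the HuggingFace chat transformation so no hub call is made.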
@pytest.fixture
def mock_provider_mapping():
    with patch(
        "litellm.llms.huggingface.chat.transformation._fetch_inference_provider_mapping"
    ) as mock:
        mock.return_value = PROVIDER_MAPPING_RESPONSE
        yield mock
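

# Autouse fixture: clear the cached provider-mapping lookup before and after each test.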
@pytest.fixture(autouse=True)
def clear_lru_cache():
    from litellm.llms.huggingface.common_utils import _fetch_inference_provider_mapping

    _fetch_inference_provider_mapping.cache_clear()
    yield
    _fetch_inference_provider_mapping.cache_clear()


@pytest.fixture
def mock_http_handler():
    """Fixture to mock the HTTP handler"""
    with patch(
        "litellm.llms.custom_httpx.http_handler.HTTPHandler.post"
    ) as mock:
        print(f"Creating mock HTTP handler: {mock}")
        mock_response = MagicMock()
        mock_response.raise_for_status.return_value = None
        mock_response.status_code = 200

        def mock_side_effect(*args, **kwargs):
            if kwargs.get("stream", True):
                mock_response.iter_lines.return_value = iter([
                    f"data: {json.dumps(chunk)}".encode('utf-8')
                    for chunk in MOCK_STREAMING_CHUNKS
                ] + [b'data: [DONE]'])
            else:
                mock_response.json.return_value = MOCK_COMPLETION_RESPONSE
            return mock_response

        mock.side_effect = mock_side_effect
        yield mock


@pytest.fixture
def mock_http_async_handler():
    """Fixture to mock the async HTTP handler"""
    with patch(
        "litellm.llms.custom_httpx.http_handler.AsyncHTTPHandler.post",
        new_callable=AsyncMock
    ) as mock:
        print(f"Creating mock async HTTP handler: {mock}")
        mock_response = MagicMock()
        mock_response.raise_for_status.return_value = None
        mock_response.status_code = 200
        mock_response.headers = {"content-type": "application/json"}
        mock_response.json.return_value = MOCK_COMPLETION_RESPONSE
        mock_response.text = json.dumps(MOCK_COMPLETION_RESPONSE)

        async def mock_side_effect(*args, **kwargs):
            if kwargs.get("stream", True):
                async def mock_aiter():
                    for chunk in MOCK_STREAMING_CHUNKS:
                        yield f"data: {json.dumps(chunk)}".encode('utf-8')
                    yield b"data: [DONE]"

                mock_response.aiter_lines = mock_aiter
            return mock_response

        mock.side_effect = mock_side_effect
        yield mock
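

# Shared BaseLLMChatTest suite plus HuggingFace-router-specific tests, all run against the mocked handlers above.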
class TestHuggingFace(BaseLLMChatTest):
    @pytest.fixture(autouse=True)
    def setup(self, mock_provider_mapping, mock_http_handler, mock_http_async_handler):
        self.mock_provider_mapping = mock_provider_mapping
        self.mock_http = mock_http_handler
        self.mock_http_async = mock_http_async_handler
        self.model = "huggingface/together/meta-llama/Meta-Llama-3-8B-Instruct"
        litellm.set_verbose = False

    def get_base_completion_call_args(self) -> dict:
        """Implementation of abstract method from BaseLLMChatTest"""
        return {"model": self.model}

    def test_completion_non_streaming(self):
        """Test that a mocked non-streaming completion is parsed into a ModelResponse."""
        messages = [{"role": "user", "content": "This is a dummy message"}]
        response = litellm.completion(
            model=self.model,
            messages=messages,
            stream=False,
        )

        assert isinstance(response, ModelResponse)
        assert response.choices[0].message.content == "This is a test response from the mocked HuggingFace API."
        assert response.usage is not None
        assert response.model == self.model.split("/", 2)[2]

    def test_completion_streaming(self):
        """Test that a mocked streaming completion sends stream=True and yields ModelResponseStream chunks."""
        messages = [{"role": "user", "content": "This is a dummy message"}]
        response = litellm.completion(
            model=self.model,
            messages=messages,
            stream=True,
        )
        chunks = list(response)

        assert len(chunks) > 0
        assert self.mock_http.called

        call_args = self.mock_http.call_args
        assert call_args is not None

        kwargs = call_args[1]
        data = json.loads(kwargs["data"])
        assert data["stream"] is True
        assert data["messages"] == messages

        assert isinstance(chunks, list)
        assert isinstance(chunks[0], ModelResponseStream)
        assert isinstance(chunks[0].id, str)
        assert chunks[0].model == self.model.split("/", 1)[1]

    @pytest.mark.asyncio
    async def test_async_completion_streaming(self):
        """Test async streaming completion"""
        messages = [{"role": "user", "content": "This is a dummy message"}]
        response = await litellm.acompletion(
            model=self.model,
            messages=messages,
            stream=True,
        )
        chunks = []
        async for chunk in response:
            chunks.append(chunk)

        assert self.mock_http_async.called
        assert len(chunks) > 0
        assert isinstance(chunks[0], ModelResponseStream)
        assert isinstance(chunks[0].id, str)
        assert chunks[0].model == self.model.split("/", 1)[1]

    @pytest.mark.asyncio
    async def test_async_completion_non_streaming(self):
        """Test async non-streaming completion"""
        messages = [{"role": "user", "content": "This is a dummy message"}]
        response = await litellm.acompletion(
            model=self.model,
            messages=messages,
            stream=False,
        )

        assert self.mock_http_async.called
        assert isinstance(response, ModelResponse)
        assert response.choices[0].message.content == "This is a test response from the mocked HuggingFace API."
        assert response.usage is not None
        assert response.model == self.model.split("/", 2)[2]

    def test_tool_call_no_arguments(self, tool_call_no_arguments):
        """Test that a tool call with empty arguments is passed through unchanged."""
        mock_tool_response = {
            **MOCK_COMPLETION_RESPONSE,
            "choices": [
                {
                    "finish_reason": "tool_calls",
                    "index": 0,
                    "message": tool_call_no_arguments,
                }
            ],
        }

        with patch.object(
            self.mock_http,
            "side_effect",
            lambda *args, **kwargs: MagicMock(
                status_code=200,
                json=lambda: mock_tool_response,
                raise_for_status=lambda: None,
            ),
        ):
            messages = [{"role": "user", "content": "Get the FAQ"}]
            tools = [
                {
                    "type": "function",
                    "function": {
                        "name": "Get-FAQ",
                        "description": "Get FAQ information",
                        "parameters": {
                            "type": "object",
                            "properties": {},
                            "required": [],
                        },
                    },
                }
            ]

            response = litellm.completion(
                model=self.model,
                messages=messages,
                tools=tools,
                tool_choice="auto",
            )

            assert response.choices[0].message.tool_calls is not None
            assert len(response.choices[0].message.tool_calls) == 1
            assert (
                response.choices[0].message.tool_calls[0].function.name
                == tool_call_no_arguments["tool_calls"][0]["function"]["name"]
            )
            assert (
                response.choices[0].message.tool_calls[0].function.arguments
                == tool_call_no_arguments["tool_calls"][0]["function"]["arguments"]
            )

    @pytest.mark.parametrize(
        "model, provider, expected_url",
        [
            ("meta-llama/Llama-3-8B-Instruct", None, "https://router.huggingface.co/hf-inference/models/meta-llama/Llama-3-8B-Instruct/v1/chat/completions"),
            ("together/meta-llama/Llama-3-8B-Instruct", None, "https://router.huggingface.co/together/v1/chat/completions"),
            ("novita/meta-llama/Llama-3-8B-Instruct", None, "https://router.huggingface.co/novita/chat/completions"),
            ("http://custom-endpoint.com/v1/chat/completions", None, "http://custom-endpoint.com/v1/chat/completions"),
        ],
    )
    def test_get_complete_url(self, model, provider, expected_url):
        """Test that the complete URL is constructed correctly for different providers"""
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        url = config.get_complete_url(
            api_base=None,
            model=model,
            optional_params={},
            stream=False,
            api_key="test_api_key",
            litellm_params={},
        )

        assert url == expected_url

    def test_validate_environment(self):
        """Test that the environment is validated correctly"""
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        headers = config.validate_environment(
            headers={},
            model="huggingface/fireworks-ai/meta-llama/Meta-Llama-3-8B-Instruct",
            messages=[{"role": "user", "content": "Hello"}],
            optional_params={},
            api_key="test_api_key",
            litellm_params={},
        )

        assert headers["Authorization"] == "Bearer test_api_key"
        assert headers["content-type"] == "application/json"

    @pytest.mark.parametrize(
        "model, expected_model",
        [
            ("together/meta-llama/Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct-Turbo"),
            ("meta-llama/Meta-Llama-3-8B-Instruct", "meta-llama/Meta-Llama-3-8B-Instruct"),
        ],
    )
    def test_transform_request(self, model, expected_model):
        """Test that the request body uses the provider-specific model id from the mapping."""
        from litellm.llms.huggingface.chat.transformation import HuggingFaceChatConfig

        config = HuggingFaceChatConfig()
        messages = [{"role": "user", "content": "Hello"}]
        transformed_request = config.transform_request(
            model=model,
            messages=messages,
            optional_params={},
            litellm_params={},
            headers={},
        )

        assert transformed_request["model"] == expected_model
        assert transformed_request["messages"] == messages

    @pytest.mark.asyncio
    async def test_completion_cost(self):
        """Skipped for this suite (overridden from BaseLLMChatTest as a no-op)."""
        pass