Gemini-2.5-flash - support reasoning cost calc + return reasoning content (#10141)

* build(model_prices_and_context_window.json): add vertex ai gemini-2.5-flash pricing

* build(model_prices_and_context_window.json): add gemini reasoning token pricing (an illustrative cost-map entry is sketched after this commit list)

* fix(vertex_and_google_ai_studio_gemini.py): support counting thinking tokens for gemini

allows accurate cost calc

* fix(utils.py): add reasoning token cost calc to generic cost calc

ensures gemini-2.5-flash cost calculation is accurate (see the cost-calculation sketch after this commit list)

* build(model_prices_and_context_window.json): mark gemini-2.5-flash as 'supports_reasoning'

* feat(gemini/): support 'thinking' + 'reasoning_effort' params + new unit tests

allows controlling thinking effort for gemini-2.5-flash models (a caller-side usage sketch follows the commit list)

* test: update unit tests

* feat(vertex_and_google_ai_studio_gemini.py): return reasoning content if given in gemini response

* test: update model name

* fix: fix ruff check

* test(test_spend_management_endpoints.py): update tests to be less sensitive to new keys / updates to usage object

* fix(vertex_and_google_ai_studio_gemini.py): fix translation
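
For the two `build(model_prices_and_context_window.json)` commits above, the cost-map entry for a reasoning-capable Gemini model takes roughly the following shape. This is an illustrative sketch written as a Python dict (the real file is JSON); the model id, the `output_cost_per_reasoning_token` key, and every price below are assumptions, not the published rates.

```python
# Illustrative cost-map entry for a reasoning-capable Gemini model.
# All numbers are placeholders; the real entry lives in
# model_prices_and_context_window.json and uses the published Google rates.
GEMINI_25_FLASH_ENTRY = {
    "gemini/gemini-2.5-flash-preview-04-17": {  # assumed model id
        "max_tokens": 65536,
        "input_cost_per_token": 1.5e-07,             # placeholder
        "output_cost_per_token": 6.0e-07,            # placeholder
        "output_cost_per_reasoning_token": 3.5e-06,  # placeholder; key name assumed
        "litellm_provider": "gemini",
        "mode": "chat",
        "supports_reasoning": True,
    }
}
```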
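The `fix(utils.py)` commit folds reasoning tokens into the generic cost calculation. A minimal sketch of that idea, assuming an OpenAI-style usage object and the cost-map keys from the sketch above (this is not the actual litellm implementation):

```python
def estimate_cost(usage: dict, model_info: dict) -> float:
    """Bill reasoning tokens at their own rate when one is available.

    Assumes `usage["completion_tokens"]` already includes the reasoning
    tokens reported under `completion_tokens_details.reasoning_tokens`,
    mirroring OpenAI-style usage objects.
    """
    prompt_tokens = usage.get("prompt_tokens", 0)
    completion_tokens = usage.get("completion_tokens", 0)
    reasoning_tokens = usage.get("completion_tokens_details", {}).get(
        "reasoning_tokens", 0
    )

    input_rate = model_info.get("input_cost_per_token", 0.0)
    output_rate = model_info.get("output_cost_per_token", 0.0)
    # Fall back to the plain output rate if no reasoning rate is published.
    reasoning_rate = model_info.get("output_cost_per_reasoning_token", output_rate)

    text_tokens = completion_tokens - reasoning_tokens
    return (
        prompt_tokens * input_rate
        + text_tokens * output_rate
        + reasoning_tokens * reasoning_rate
    )
```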
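The `feat(gemini/)` and reasoning-content commits combine into the following caller-side flow. A usage sketch, assuming a Gemini API key in the environment: the model id is an assumption, the commented-out `thinking` parameter mirrors the Anthropic-style alternative the commit message says is now accepted, and the response fields follow litellm's public response shape.

```python
import litellm

# Assumes GEMINI_API_KEY is set in the environment.
response = litellm.completion(
    model="gemini/gemini-2.5-flash-preview-04-17",  # assumed model id
    messages=[{"role": "user", "content": "What is 24 * 17?"}],
    reasoning_effort="low",
    # thinking={"type": "enabled", "budget_tokens": 1024},  # explicit budget alternative
)

# Reasoning content is surfaced on the message when Gemini returns it.
print(response.choices[0].message.reasoning_content)

# Reasoning tokens show up in the usage details and feed the cost calculation.
print(response.usage.completion_tokens_details)
print(response._hidden_params.get("response_cost"))
```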
Author: Krish Dholakia · 2025-04-19 09:20:52 -07:00 · committed by GitHub
Commit: 36308a31be
Parent: db4ebe10c8
16 changed files with 453 additions and 88 deletions

@@ -76,6 +76,11 @@ class BaseLLMChatTest(ABC):
        """Must return the base completion call args"""
        pass

    def get_base_completion_call_args_with_reasoning_model(self) -> dict:
        """Must return the base completion call args with reasoning_effort"""
        return {}

    def test_developer_role_translation(self):
        """
        Test that the developer role is translated correctly for non-OpenAI providers.
@@ -1126,6 +1131,46 @@ class BaseLLMChatTest(ABC):
        print(response)

    def test_reasoning_effort(self):
        """Test that reasoning_effort is passed correctly to the model"""
        from litellm.utils import supports_reasoning
        from litellm import completion

        os.environ["LITELLM_LOCAL_MODEL_COST_MAP"] = "True"
        litellm.model_cost = litellm.get_model_cost_map(url="")

        base_completion_call_args = self.get_base_completion_call_args_with_reasoning_model()
        if len(base_completion_call_args) == 0:
            print("base_completion_call_args is empty")
            pytest.skip("Model does not support reasoning")
        if not supports_reasoning(base_completion_call_args["model"], None):
            print("Model does not support reasoning")
            pytest.skip("Model does not support reasoning")

        _, provider, _, _ = litellm.get_llm_provider(
            model=base_completion_call_args["model"]
        )

        ## CHECK PARAM MAPPING
        optional_params = get_optional_params(
            model=base_completion_call_args["model"],
            custom_llm_provider=provider,
            reasoning_effort="high",
        )
        # either accepts reasoning effort or thinking budget
        assert "reasoning_effort" in optional_params or "4096" in json.dumps(optional_params)

        try:
            litellm._turn_on_debug()
            response = completion(
                **base_completion_call_args,
                reasoning_effort="low",
                messages=[{"role": "user", "content": "Hello!"}],
            )
            print(f"response: {response}")
        except Exception as e:
            pytest.fail(f"Error: {e}")


class BaseOSeriesModelsTest(ABC):  # test across azure/openai
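
A provider test suite opts into the new `test_reasoning_effort` test by overriding the hook added in the first hunk; returning an empty dict (the base-class default) makes the test skip. A minimal sketch of such a subclass (the class name and model ids are illustrative, not copied from the repo):

```python
class TestGoogleAIStudioGemini(BaseLLMChatTest):
    def get_base_completion_call_args(self) -> dict:
        return {"model": "gemini/gemini-2.0-flash"}

    def get_base_completion_call_args_with_reasoning_model(self) -> dict:
        # Point the reasoning test at a model flagged with supports_reasoning.
        return {"model": "gemini/gemini-2.5-flash-preview-04-17"}
```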