fix(amazon_deepseek_transformation.py): remove </think> from stream output (#8717)

* fix(amazon_deepseek_transformation.py): remove </think> from stream output - cleanup user facing stream

* fix(key_management_endpoints.py): return `/key/list` sorted by created_at

makes it easier to spot newly created keys

* style: cleanup team table

* feat(key_edit_view.tsx): support setting model-specific tpm/rpm limits on keys
Krish Dholakia 2025-02-22 21:46:55 -08:00 committed by GitHub
parent c4d5b65e7b
commit d7e4cb3606
6 changed files with 50 additions and 8 deletions

View file

@@ -2,6 +2,7 @@ from typing import Any, List, Optional, cast
 from httpx import Response
+from litellm import verbose_logger
 from litellm.litellm_core_utils.llm_response_utils.convert_dict_to_response import (
     _parse_content_for_reasoning,
 )
@@ -93,7 +94,12 @@ class AmazonDeepseekR1ResponseIterator(BaseModelResponseIterator):
         """
         try:
             typed_chunk = AmazonDeepSeekR1StreamingResponse(**chunk)  # type: ignore
-            if "</think>" in typed_chunk["generation"]:
+            generated_content = typed_chunk["generation"]
+            if generated_content == "</think>" and not self.has_finished_thinking:
+                verbose_logger.debug(
+                    "Deepseek r1: </think> received, setting has_finished_thinking to True"
+                )
+                generated_content = ""
                 self.has_finished_thinking = True
             prompt_token_count = typed_chunk.get("prompt_token_count") or 0
@@ -110,12 +116,12 @@ class AmazonDeepseekR1ResponseIterator(BaseModelResponseIterator):
                         finish_reason=typed_chunk["stop_reason"],
                         delta=Delta(
                             content=(
-                                typed_chunk["generation"]
+                                generated_content
                                 if self.has_finished_thinking
                                 else None
                             ),
                             reasoning_content=(
-                                typed_chunk["generation"]
+                                generated_content
                                 if not self.has_finished_thinking
                                 else None
                             ),
@@ -124,5 +130,6 @@ class AmazonDeepseekR1ResponseIterator(BaseModelResponseIterator):
                 ],
+                usage=usage,
             )
         except Exception as e:
             raise e
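The semantic shift here is worth spelling out: the old check ("</think>" in generation) matched any chunk that merely contained the tag, so the tag itself could leak into user-facing content, while the new check only swallows a chunk that is exactly the sentinel, flipping has_finished_thinking so later chunks route to content instead of reasoning_content. A minimal Python sketch of that routing, outside the iterator class (chunk strings are made up, and Bedrock's DeepSeek-R1 stream is assumed to emit "</think>" as its own chunk):

def route_chunks(chunks):
    has_finished_thinking = False
    deltas = []
    for generation in chunks:
        if generation == "</think>" and not has_finished_thinking:
            has_finished_thinking = True
            continue  # drop the sentinel so it never reaches the user
        if has_finished_thinking:
            deltas.append({"content": generation, "reasoning_content": None})
        else:
            deltas.append({"content": None, "reasoning_content": generation})
    return deltas

print(route_chunks(["Let me think.", "</think>", "The answer is 4."]))
# [{'content': None, 'reasoning_content': 'Let me think.'},
#  {'content': 'The answer is 4.', 'reasoning_content': None}]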

View file

@@ -1962,6 +1962,10 @@ async def _list_key_helper(
             where=where,  # type: ignore
             skip=skip,  # type: ignore
             take=size,  # type: ignore
+            order=[
+                {"created_at": "desc"},
+                {"token": "desc"},  # fallback sort
+            ],
         )
         verbose_proxy_logger.debug(f"Fetched {len(keys)} keys")
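The order argument is the Prisma equivalent of ORDER BY created_at DESC, token DESC: newest keys first, with the token as a deterministic tiebreak so pagination stays stable when several keys share a creation timestamp. The same ordering, sketched in plain Python with made-up rows:

from datetime import datetime

keys = [
    {"token": "sk-b", "created_at": datetime(2025, 2, 22, 9, 0)},
    {"token": "sk-a", "created_at": datetime(2025, 2, 22, 9, 0)},  # same timestamp
    {"token": "sk-c", "created_at": datetime(2025, 2, 21, 8, 0)},
]
# Both sort keys descend together, so a single reverse=True suffices.
keys.sort(key=lambda k: (k["created_at"], k["token"]), reverse=True)
print([k["token"] for k in keys])  # ['sk-b', 'sk-a', 'sk-c']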

View file

@@ -133,11 +133,23 @@ export function KeyEditView({
       </Form.Item>
       <Form.Item label="TPM Limit" name="tpm_limit">
-        <InputNumber style={{ width: "100%" }} />
+        <InputNumber style={{ width: "100%" }} min={0}/>
       </Form.Item>
       <Form.Item label="RPM Limit" name="rpm_limit">
-        <InputNumber style={{ width: "100%" }} />
+        <InputNumber style={{ width: "100%" }} min={0}/>
       </Form.Item>
+      <Form.Item label="Max Parallel Requests" name="max_parallel_requests">
+        <InputNumber style={{ width: "100%" }} min={0}/>
+      </Form.Item>
+      <Form.Item label="Model TPM Limit" name="model_tpm_limit">
+        <Input.TextArea rows={4} placeholder='{"gpt-4": 100, "claude-v1": 200}'/>
+      </Form.Item>
+      <Form.Item label="Model RPM Limit" name="model_rpm_limit">
+        <Input.TextArea rows={4} placeholder='{"gpt-4": 100, "claude-v1": 200}'/>
+      </Form.Item>
       <Form.Item label="Guardrails" name="guardrails">

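The two Model Limit textareas take a JSON object mapping model names to integer limits, as the placeholder suggests; the form stores the raw string, so it has to be parsed before the API call (see keyUpdateCall below). A Python equivalent of that parse step, with a hypothetical helper name:

import json

def parse_model_limit(raw: str) -> dict:
    # Mirrors the UI's JSON.parse: '{"gpt-4": 100}' -> {"gpt-4": 100}.
    try:
        return json.loads(raw)
    except json.JSONDecodeError as e:
        raise ValueError(f"Failed to parse model limit: {e}")

print(parse_model_limit('{"gpt-4": 100, "claude-v1": 200}'))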
View file

@@ -332,6 +332,9 @@ export default function KeyInfoView({ keyId, onClose, keyData, accessToken, user
           <Text className="font-medium">Rate Limits</Text>
           <Text>TPM: {keyData.tpm_limit !== null ? keyData.tpm_limit : "Unlimited"}</Text>
           <Text>RPM: {keyData.rpm_limit !== null ? keyData.rpm_limit : "Unlimited"}</Text>
+          <Text>Max Parallel Requests: {keyData.max_parallel_requests !== null ? keyData.max_parallel_requests : "Unlimited"}</Text>
+          <Text>Model TPM Limits: {keyData.metadata?.model_tpm_limit ? JSON.stringify(keyData.metadata.model_tpm_limit) : "Unlimited"}</Text>
+          <Text>Model RPM Limits: {keyData.metadata?.model_rpm_limit ? JSON.stringify(keyData.metadata.model_rpm_limit) : "Unlimited"}</Text>
         </div>
         <div>

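Note where each value lives: tpm_limit, rpm_limit, and max_parallel_requests are top-level key fields, while the per-model limits sit under metadata, which is why the panel reads keyData.metadata?.model_tpm_limit. An illustrative shape of the object the panel consumes (values made up):

key_data = {
    "tpm_limit": None,              # rendered as "Unlimited"
    "rpm_limit": 1000,
    "max_parallel_requests": 10,
    "metadata": {
        "model_tpm_limit": {"gpt-4": 100, "claude-v1": 200},
        "model_rpm_limit": {"gpt-4": 10},
    },
}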
View file

@@ -2476,6 +2476,25 @@ export const keyUpdateCall = async (
   try {
     console.log("Form Values in keyUpdateCall:", formValues); // Log the form values before making the API call
+    if (formValues.model_tpm_limit) {
+      console.log("formValues.model_tpm_limit:", formValues.model_tpm_limit);
+      // if JSON.parse throws, surface the error in the thrown message
+      try {
+        formValues.model_tpm_limit = JSON.parse(formValues.model_tpm_limit);
+      } catch (error) {
+        throw new Error("Failed to parse model_tpm_limit: " + error);
+      }
+    }
+    if (formValues.model_rpm_limit) {
+      console.log("formValues.model_rpm_limit:", formValues.model_rpm_limit);
+      // if JSON.parse throws, surface the error in the thrown message
+      try {
+        formValues.model_rpm_limit = JSON.parse(formValues.model_rpm_limit);
+      } catch (error) {
+        throw new Error("Failed to parse model_rpm_limit: " + error);
+      }
+    }
     const url = proxyBaseUrl ? `${proxyBaseUrl}/key/update` : `/key/update`;
     const response = await fetch(url, {
       method: "POST",
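With the strings parsed client-side, the proxy receives real JSON objects rather than stringified ones. A sketch of the resulting call as a Python client would issue it (the URL, keys, and limit values are placeholders; model_tpm_limit and model_rpm_limit are the fields the form sets):

import requests

resp = requests.post(
    "http://localhost:4000/key/update",  # placeholder proxy URL
    headers={"Authorization": "Bearer sk-master-key"},  # placeholder admin key
    json={
        "key": "sk-existing-key",
        "model_tpm_limit": {"gpt-4": 100, "claude-v1": 200},
        "model_rpm_limit": {"gpt-4": 10, "claude-v1": 20},
    },
)
resp.raise_for_status()
print(resp.json())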

View file

@@ -389,7 +389,6 @@ const Teams: React.FC<TeamProps> = ({
                   >
                     {team["team_alias"]}
                   </TableCell>
-                  <TableRow>
                   <TableCell>
                     <div className="overflow-hidden">
                       <Tooltip title={team.team_id}>
@@ -408,8 +407,6 @@ const Teams: React.FC<TeamProps> = ({
                     </Tooltip>
                   </div>
                 </TableCell>
-                </TableRow>
                 <TableCell
                   style={{
                     maxWidth: "4px",