Merge pull request #1344 from BerriAI/litellm_speed_improvements

Litellm speed improvements
This commit is contained in:
Krish Dholakia 2024-01-06 22:38:10 +05:30 committed by GitHub
commit 439ee3bafc
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 46 additions and 82 deletions

View file

@ -248,7 +248,7 @@ class AzureChatCompletion(BaseLLM):
else:
azure_client = client
response = azure_client.chat.completions.create(**data, timeout=timeout) # type: ignore
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=messages,
@ -261,7 +261,7 @@ class AzureChatCompletion(BaseLLM):
},
)
return convert_to_model_response_object(
-response_object=json.loads(stringified_response),
+response_object=stringified_response,
model_response_object=model_response,
)
except AzureOpenAIError as e:
@ -323,7 +323,7 @@ class AzureChatCompletion(BaseLLM):
**data, timeout=timeout
)
return convert_to_model_response_object(
-response_object=json.loads(response.model_dump_json()),
+response_object=response.model_dump(),
model_response_object=model_response,
)
except AzureOpenAIError as e:
@ -465,7 +465,7 @@ class AzureChatCompletion(BaseLLM):
else:
openai_aclient = client
response = await openai_aclient.embeddings.create(**data, timeout=timeout)
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=input,
@ -474,7 +474,7 @@ class AzureChatCompletion(BaseLLM):
original_response=stringified_response,
)
return convert_to_model_response_object(
-response_object=json.loads(stringified_response),
+response_object=stringified_response,
model_response_object=model_response,
response_type="embedding",
)
@ -564,7 +564,7 @@ class AzureChatCompletion(BaseLLM):
original_response=response,
)
-return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="embedding") # type: ignore
+return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding") # type: ignore
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e
@ -599,7 +599,7 @@ class AzureChatCompletion(BaseLLM):
else:
openai_aclient = client
response = await openai_aclient.images.generate(**data, timeout=timeout)
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=input,
@ -608,7 +608,7 @@ class AzureChatCompletion(BaseLLM):
original_response=stringified_response,
)
return convert_to_model_response_object(
-response_object=json.loads(stringified_response),
+response_object=stringified_response,
model_response_object=model_response,
response_type="image_generation",
)
@ -697,7 +697,7 @@ class AzureChatCompletion(BaseLLM):
original_response=response,
)
# return response
-return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="image_generation") # type: ignore
+return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
except AzureOpenAIError as e:
exception_mapping_worked = True
raise e

View file

@ -280,18 +280,6 @@ class OpenAIChatCompletion(BaseLLM):
max_retries=max_retries,
)
else:
-## LOGGING
-logging_obj.pre_call(
-input=messages,
-api_key=api_key,
-additional_args={
-"headers": headers,
-"api_base": api_base,
-"acompletion": acompletion,
-"complete_input_dict": data,
-},
-)
if not isinstance(max_retries, int):
raise OpenAIError(
status_code=422, message="max retries must be an int"
@ -306,8 +294,21 @@ class OpenAIChatCompletion(BaseLLM):
)
else:
openai_client = client
+## LOGGING
+logging_obj.pre_call(
+input=messages,
+api_key=openai_client.api_key,
+additional_args={
+"headers": headers,
+"api_base": openai_client._base_url._uri_reference,
+"acompletion": acompletion,
+"complete_input_dict": data,
+},
+)
response = openai_client.chat.completions.create(**data, timeout=timeout) # type: ignore
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
logging_obj.post_call(
input=messages,
api_key=api_key,
@ -315,7 +316,7 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
)
return convert_to_model_response_object(
-response_object=json.loads(stringified_response),
+response_object=stringified_response,
model_response_object=model_response,
)
except Exception as e:
@ -386,7 +387,7 @@ class OpenAIChatCompletion(BaseLLM):
response = await openai_aclient.chat.completions.create(
**data, timeout=timeout
)
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
logging_obj.post_call(
input=data["messages"],
api_key=api_key,
@ -394,7 +395,7 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
)
return convert_to_model_response_object(
-response_object=json.loads(stringified_response),
+response_object=stringified_response,
model_response_object=model_response,
)
except Exception as e:
@ -527,7 +528,7 @@ class OpenAIChatCompletion(BaseLLM):
else:
openai_aclient = client
response = await openai_aclient.embeddings.create(**data, timeout=timeout) # type: ignore
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=input,
@ -535,7 +536,7 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
-return convert_to_model_response_object(response_object=json.loads(stringified_response), model_response_object=model_response, response_type="embedding") # type: ignore
+return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="embedding") # type: ignore
except Exception as e:
## LOGGING
logging_obj.post_call(
@ -597,7 +598,7 @@ class OpenAIChatCompletion(BaseLLM):
original_response=response,
)
-return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="embedding") # type: ignore
+return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="embedding") # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
raise e
@ -634,7 +635,7 @@ class OpenAIChatCompletion(BaseLLM):
else:
openai_aclient = client
response = await openai_aclient.images.generate(**data, timeout=timeout) # type: ignore
-stringified_response = response.model_dump_json()
+stringified_response = response.model_dump()
## LOGGING
logging_obj.post_call(
input=prompt,
@ -642,7 +643,7 @@ class OpenAIChatCompletion(BaseLLM):
additional_args={"complete_input_dict": data},
original_response=stringified_response,
)
-return convert_to_model_response_object(response_object=json.loads(stringified_response), model_response_object=model_response, response_type="image_generation") # type: ignore
+return convert_to_model_response_object(response_object=stringified_response, model_response_object=model_response, response_type="image_generation") # type: ignore
except Exception as e:
## LOGGING
logging_obj.post_call(
@ -710,7 +711,7 @@ class OpenAIChatCompletion(BaseLLM):
original_response=response,
)
# return response
-return convert_to_model_response_object(response_object=json.loads(response.model_dump_json()), model_response_object=model_response, response_type="image_generation") # type: ignore
+return convert_to_model_response_object(response_object=response.model_dump(), model_response_object=model_response, response_type="image_generation") # type: ignore
except OpenAIError as e:
exception_mapping_worked = True
raise e

View file

@ -360,14 +360,6 @@ def embedding(
except Exception as e:
raise SagemakerError(status_code=500, message=f"{str(e)}")
-## LOGGING
-logging_obj.post_call(
-input=input,
-api_key="",
-additional_args={"complete_input_dict": data},
-original_response=response,
-)
response = json.loads(response["Body"].read().decode("utf8"))
## LOGGING
logging_obj.post_call(
@ -376,6 +368,7 @@ def embedding(
original_response=response,
additional_args={"complete_input_dict": data},
)
+print_verbose(f"raw model_response: {response}")
if "embedding" not in response:
raise SagemakerError(status_code=500, message="embedding not found in response")

View file

@ -352,18 +352,16 @@ class Router:
else:
model_client = potential_model_client
self.total_calls[model_name] += 1
-response = await asyncio.wait_for(
-litellm.acompletion(
+response = await litellm.acompletion(
**{
**data,
"messages": messages,
"caching": self.cache_responses,
"client": model_client,
+"timeout": self.timeout,
**kwargs,
}
-),
-timeout=self.timeout,
-)
+)
self.success_calls[model_name] += 1
return response
except Exception as e:
@ -614,18 +612,16 @@ class Router:
else:
model_client = potential_model_client
self.total_calls[model_name] += 1
-response = await asyncio.wait_for(
-litellm.atext_completion(
+response = await litellm.atext_completion(
**{
**data,
"prompt": prompt,
"caching": self.cache_responses,
"client": model_client,
+"timeout": self.timeout,
**kwargs,
}
-),
-timeout=self.timeout,
-)
+)
self.success_calls[model_name] += 1
return response
except Exception as e:

View file

@ -956,6 +956,8 @@ class Logging:
):
# Log the exact result from the LLM API, for streaming - log the type of response received
litellm.error_logs["POST_CALL"] = locals()
+if isinstance(original_response, dict):
+original_response = json.dumps(original_response)
try:
self.model_call_details["input"] = input
self.model_call_details["api_key"] = api_key