forked from phoenix/litellm-mirror
fix(main.py): fixing print_verbose
This commit is contained in:
parent
763ecf681a
commit
5b3978eff4
5 changed files with 240 additions and 222 deletions
|
@ -141,227 +141,233 @@ def completion(
|
|||
litellm_params=None,
|
||||
logger_fn=None,
|
||||
):
|
||||
headers = validate_environment(api_key, headers)
|
||||
task = get_hf_task_for_model(model)
|
||||
print_verbose(f"{model}, {task}")
|
||||
completion_url = ""
|
||||
input_text = None
|
||||
if "https" in model:
|
||||
completion_url = model
|
||||
elif api_base:
|
||||
completion_url = api_base
|
||||
elif "HF_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HF_API_BASE", "")
|
||||
elif "HUGGINGFACE_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HUGGINGFACE_API_BASE", "")
|
||||
else:
|
||||
completion_url = f"https://api-inference.huggingface.co/models/{model}"
|
||||
|
||||
## Load Config
|
||||
config=litellm.HuggingfaceConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
### MAP INPUT PARAMS
|
||||
if task == "conversational":
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
past_user_inputs = []
|
||||
generated_responses = []
|
||||
text = ""
|
||||
for message in messages:
|
||||
if message["role"] == "user":
|
||||
if text != "":
|
||||
past_user_inputs.append(text)
|
||||
text = message["content"]
|
||||
elif message["role"] == "assistant" or message["role"] == "system":
|
||||
generated_responses.append(message["content"])
|
||||
data = {
|
||||
"inputs": {
|
||||
"text": text,
|
||||
"past_user_inputs": past_user_inputs,
|
||||
"generated_responses": generated_responses
|
||||
},
|
||||
"parameters": inference_params
|
||||
}
|
||||
input_text = "".join(message["content"] for message in messages)
|
||||
elif task == "text-generation-inference":
|
||||
# always send "details" and "return_full_text" as params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", None),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
messages=messages
|
||||
)
|
||||
try:
|
||||
headers = validate_environment(api_key, headers)
|
||||
task = get_hf_task_for_model(model)
|
||||
print_verbose(f"{model}, {task}")
|
||||
completion_url = ""
|
||||
input_text = None
|
||||
if "https" in model:
|
||||
completion_url = model
|
||||
elif api_base:
|
||||
completion_url = api_base
|
||||
elif "HF_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HF_API_BASE", "")
|
||||
elif "HUGGINGFACE_API_BASE" in os.environ:
|
||||
completion_url = os.getenv("HUGGINGFACE_API_BASE", "")
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": optional_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
else:
|
||||
# Non TGI and Conversational llms
|
||||
# We need this branch, it removes 'details' and 'return_full_text' from params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", {}),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
bos_token=model_prompt_details.get("bos_token", ""),
|
||||
eos_token=model_prompt_details.get("eos_token", ""),
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": inference_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "task": task, "headers": headers},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=optional_params["stream"]
|
||||
)
|
||||
return response.iter_lines()
|
||||
else:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data)
|
||||
)
|
||||
completion_url = f"https://api-inference.huggingface.co/models/{model}"
|
||||
|
||||
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
|
||||
is_streamed = False
|
||||
if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream":
|
||||
is_streamed = True
|
||||
|
||||
# iterate over the complete streamed response, and return the final answer
|
||||
if is_streamed:
|
||||
streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
|
||||
content = ""
|
||||
for chunk in streamed_response:
|
||||
content += chunk["choices"][0]["delta"]["content"]
|
||||
completion_response: List[Dict[str, Any]] = [{"generated_text": content}]
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=completion_response,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise HuggingfaceError(
|
||||
message=response.text, status_code=response.status_code
|
||||
## Load Config
|
||||
config=litellm.HuggingfaceConfig.get_config()
|
||||
for k, v in config.items():
|
||||
if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
|
||||
optional_params[k] = v
|
||||
|
||||
### MAP INPUT PARAMS
|
||||
if task == "conversational":
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
past_user_inputs = []
|
||||
generated_responses = []
|
||||
text = ""
|
||||
for message in messages:
|
||||
if message["role"] == "user":
|
||||
if text != "":
|
||||
past_user_inputs.append(text)
|
||||
text = message["content"]
|
||||
elif message["role"] == "assistant" or message["role"] == "system":
|
||||
generated_responses.append(message["content"])
|
||||
data = {
|
||||
"inputs": {
|
||||
"text": text,
|
||||
"past_user_inputs": past_user_inputs,
|
||||
"generated_responses": generated_responses
|
||||
},
|
||||
"parameters": inference_params
|
||||
}
|
||||
input_text = "".join(message["content"] for message in messages)
|
||||
elif task == "text-generation-inference":
|
||||
# always send "details" and "return_full_text" as params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", None),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
messages=messages
|
||||
)
|
||||
print_verbose(f"response: {completion_response}")
|
||||
if isinstance(completion_response, dict) and "error" in completion_response:
|
||||
print_verbose(f"completion error: {completion_response['error']}")
|
||||
print_verbose(f"response.status_code: {response.status_code}")
|
||||
raise HuggingfaceError(
|
||||
message=completion_response["error"],
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if task == "conversational":
|
||||
if len(completion_response["generated_text"]) > 0: # type: ignore
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response["generated_text"] # type: ignore
|
||||
elif task == "text-generation-inference":
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## GETTING LOGPROBS + FINISH REASON
|
||||
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
|
||||
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
|
||||
sum_logprob = 0
|
||||
for token in completion_response[0]["details"]["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
model_response["choices"][0]["message"]._logprob = sum_logprob
|
||||
if "best_of" in optional_params and optional_params["best_of"] > 1:
|
||||
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
|
||||
choices_list = []
|
||||
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
|
||||
sum_logprob = 0
|
||||
for token in item["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
if len(item["generated_text"]) > 0:
|
||||
message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
|
||||
else:
|
||||
message_obj = Message(content=None)
|
||||
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
|
||||
choices_list.append(choice_obj)
|
||||
model_response["choices"].extend(choices_list)
|
||||
else:
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = 0
|
||||
try:
|
||||
prompt_tokens = len(
|
||||
encoding.encode(input_text)
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
except:
|
||||
# this should remain non blocking we should not block a response returning if calculating usage fails
|
||||
pass
|
||||
print_verbose(f'output: {model_response["choices"][0]["message"]}')
|
||||
output_text = model_response["choices"][0]["message"].get("content", "")
|
||||
if output_text is not None and len(output_text) > 0:
|
||||
completion_tokens = 0
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": optional_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
else:
|
||||
# Non TGI and Conversational llms
|
||||
# We need this branch, it removes 'details' and 'return_full_text' from params
|
||||
if model in custom_prompt_dict:
|
||||
# check if the model has a registered custom prompt
|
||||
model_prompt_details = custom_prompt_dict[model]
|
||||
prompt = custom_prompt(
|
||||
role_dict=model_prompt_details.get("roles", {}),
|
||||
initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
|
||||
final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
|
||||
bos_token=model_prompt_details.get("bos_token", ""),
|
||||
eos_token=model_prompt_details.get("eos_token", ""),
|
||||
messages=messages,
|
||||
)
|
||||
else:
|
||||
prompt = prompt_factory(model=model, messages=messages)
|
||||
inference_params = copy.deepcopy(optional_params)
|
||||
inference_params.pop("details")
|
||||
inference_params.pop("return_full_text")
|
||||
data = {
|
||||
"inputs": prompt,
|
||||
"parameters": inference_params,
|
||||
"stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
|
||||
}
|
||||
input_text = prompt
|
||||
## LOGGING
|
||||
logging_obj.pre_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
additional_args={"complete_input_dict": data, "task": task, "headers": headers},
|
||||
)
|
||||
## COMPLETION CALL
|
||||
if "stream" in optional_params and optional_params["stream"] == True:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data),
|
||||
stream=optional_params["stream"]
|
||||
)
|
||||
return response.iter_lines()
|
||||
else:
|
||||
response = requests.post(
|
||||
completion_url,
|
||||
headers=headers,
|
||||
data=json.dumps(data)
|
||||
)
|
||||
|
||||
## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
|
||||
is_streamed = False
|
||||
if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream":
|
||||
is_streamed = True
|
||||
|
||||
# iterate over the complete streamed response, and return the final answer
|
||||
if is_streamed:
|
||||
streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
|
||||
content = ""
|
||||
for chunk in streamed_response:
|
||||
content += chunk["choices"][0]["delta"]["content"]
|
||||
completion_response: List[Dict[str, Any]] = [{"generated_text": content}]
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=completion_response,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
else:
|
||||
## LOGGING
|
||||
logging_obj.post_call(
|
||||
input=input_text,
|
||||
api_key=api_key,
|
||||
original_response=response.text,
|
||||
additional_args={"complete_input_dict": data, "task": task},
|
||||
)
|
||||
## RESPONSE OBJECT
|
||||
try:
|
||||
completion_response = response.json()
|
||||
except:
|
||||
raise HuggingfaceError(
|
||||
message=f"Original Response received: {response.text}; Stacktrace: {traceback.format_exc()}", status_code=response.status_code
|
||||
)
|
||||
print_verbose(f"response: {completion_response}")
|
||||
if isinstance(completion_response, dict) and "error" in completion_response:
|
||||
print_verbose(f"completion error: {completion_response['error']}")
|
||||
print_verbose(f"response.status_code: {response.status_code}")
|
||||
raise HuggingfaceError(
|
||||
message=completion_response["error"],
|
||||
status_code=response.status_code,
|
||||
)
|
||||
else:
|
||||
if task == "conversational":
|
||||
if len(completion_response["generated_text"]) > 0: # type: ignore
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response["generated_text"] # type: ignore
|
||||
elif task == "text-generation-inference":
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## GETTING LOGPROBS + FINISH REASON
|
||||
if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
|
||||
model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
|
||||
sum_logprob = 0
|
||||
for token in completion_response[0]["details"]["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
model_response["choices"][0]["message"]._logprob = sum_logprob
|
||||
if "best_of" in optional_params and optional_params["best_of"] > 1:
|
||||
if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
|
||||
choices_list = []
|
||||
for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
|
||||
sum_logprob = 0
|
||||
for token in item["tokens"]:
|
||||
sum_logprob += token["logprob"]
|
||||
if len(item["generated_text"]) > 0:
|
||||
message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
|
||||
else:
|
||||
message_obj = Message(content=None)
|
||||
choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
|
||||
choices_list.append(choice_obj)
|
||||
model_response["choices"].extend(choices_list)
|
||||
else:
|
||||
if len(completion_response[0]["generated_text"]) > 0:
|
||||
model_response["choices"][0]["message"][
|
||||
"content"
|
||||
] = completion_response[0]["generated_text"]
|
||||
## CALCULATING USAGE
|
||||
prompt_tokens = 0
|
||||
try:
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
prompt_tokens = len(
|
||||
encoding.encode(input_text)
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
except:
|
||||
# this should remain non blocking we should not block a response returning if calculating usage fails
|
||||
pass
|
||||
else:
|
||||
completion_tokens = 0
|
||||
print_verbose(f'output: {model_response["choices"][0]["message"]}')
|
||||
output_text = model_response["choices"][0]["message"].get("content", "")
|
||||
if output_text is not None and len(output_text) > 0:
|
||||
completion_tokens = 0
|
||||
try:
|
||||
completion_tokens = len(
|
||||
encoding.encode(model_response["choices"][0]["message"].get("content", ""))
|
||||
) ##[TODO] use the llama2 tokenizer here
|
||||
except:
|
||||
# this should remain non blocking we should not block a response returning if calculating usage fails
|
||||
pass
|
||||
else:
|
||||
completion_tokens = 0
|
||||
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
model_response._hidden_params["original_response"] = completion_response
|
||||
return model_response
|
||||
model_response["created"] = time.time()
|
||||
model_response["model"] = model
|
||||
model_response.usage.completion_tokens = completion_tokens
|
||||
model_response.usage.prompt_tokens = prompt_tokens
|
||||
model_response.usage.total_tokens = prompt_tokens + completion_tokens
|
||||
model_response._hidden_params["original_response"] = completion_response
|
||||
return model_response
|
||||
except HuggingfaceError as e:
|
||||
raise e
|
||||
except Exception as e:
|
||||
import traceback
|
||||
raise HuggingfaceError(status_code=500, message=traceback.format_exc())
|
||||
|
||||
|
||||
def embedding(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue