forked from phoenix/litellm-mirror

fix(main.py): fixing print_verbose

parent 763ecf681a
commit 5b3978eff4

5 changed files with 240 additions and 222 deletions
@@ -141,227 +141,233 @@ def completion(

This hunk wraps the entire body of completion() in a new try/except, so every pre-existing statement is re-indented one level. The only other changes are a richer error message when response.json() fails (the removed line raised message=response.text, status_code=response.status_code) and the two exception handlers appended at the end. The new side of the hunk, starting from the unchanged tail of the signature:

    litellm_params=None,
    logger_fn=None,
):
    try:
        headers = validate_environment(api_key, headers)
        task = get_hf_task_for_model(model)
        print_verbose(f"{model}, {task}")
        completion_url = ""
        input_text = None
        if "https" in model:
            completion_url = model
        elif api_base:
            completion_url = api_base
        elif "HF_API_BASE" in os.environ:
            completion_url = os.getenv("HF_API_BASE", "")
        elif "HUGGINGFACE_API_BASE" in os.environ:
            completion_url = os.getenv("HUGGINGFACE_API_BASE", "")
        else:
            completion_url = f"https://api-inference.huggingface.co/models/{model}"

        ## Load Config
        config=litellm.HuggingfaceConfig.get_config()
        for k, v in config.items():
            if k not in optional_params: # completion(top_k=3) > huggingfaceConfig(top_k=3) <- allows for dynamic variables to be passed in
                optional_params[k] = v

        ### MAP INPUT PARAMS
        if task == "conversational":
            inference_params = copy.deepcopy(optional_params)
            inference_params.pop("details")
            inference_params.pop("return_full_text")
            past_user_inputs = []
            generated_responses = []
            text = ""
            for message in messages:
                if message["role"] == "user":
                    if text != "":
                        past_user_inputs.append(text)
                    text = message["content"]
                elif message["role"] == "assistant" or message["role"] == "system":
                    generated_responses.append(message["content"])
            data = {
                "inputs": {
                    "text": text,
                    "past_user_inputs": past_user_inputs,
                    "generated_responses": generated_responses
                },
                "parameters": inference_params
            }
            input_text = "".join(message["content"] for message in messages)
        elif task == "text-generation-inference":
            # always send "details" and "return_full_text" as params
            if model in custom_prompt_dict:
                # check if the model has a registered custom prompt
                model_prompt_details = custom_prompt_dict[model]
                prompt = custom_prompt(
                    role_dict=model_prompt_details.get("roles", None),
                    initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
                    final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
                    messages=messages
                )
            else:
                prompt = prompt_factory(model=model, messages=messages)
            data = {
                "inputs": prompt,
                "parameters": optional_params,
                "stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
            }
            input_text = prompt
        else:
            # Non TGI and Conversational llms
            # We need this branch, it removes 'details' and 'return_full_text' from params
            if model in custom_prompt_dict:
                # check if the model has a registered custom prompt
                model_prompt_details = custom_prompt_dict[model]
                prompt = custom_prompt(
                    role_dict=model_prompt_details.get("roles", {}),
                    initial_prompt_value=model_prompt_details.get("initial_prompt_value", ""),
                    final_prompt_value=model_prompt_details.get("final_prompt_value", ""),
                    bos_token=model_prompt_details.get("bos_token", ""),
                    eos_token=model_prompt_details.get("eos_token", ""),
                    messages=messages,
                )
            else:
                prompt = prompt_factory(model=model, messages=messages)
            inference_params = copy.deepcopy(optional_params)
            inference_params.pop("details")
            inference_params.pop("return_full_text")
            data = {
                "inputs": prompt,
                "parameters": inference_params,
                "stream": True if "stream" in optional_params and optional_params["stream"] == True else False,
            }
            input_text = prompt
        ## LOGGING
        logging_obj.pre_call(
            input=input_text,
            api_key=api_key,
            additional_args={"complete_input_dict": data, "task": task, "headers": headers},
        )
        ## COMPLETION CALL
        if "stream" in optional_params and optional_params["stream"] == True:
            response = requests.post(
                completion_url,
                headers=headers,
                data=json.dumps(data),
                stream=optional_params["stream"]
            )
            return response.iter_lines()
        else:
            response = requests.post(
                completion_url,
                headers=headers,
                data=json.dumps(data)
            )

            ## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
            is_streamed = False
            if response.__dict__['headers'].get("Content-Type", "") == "text/event-stream":
                is_streamed = True

            # iterate over the complete streamed response, and return the final answer
            if is_streamed:
                streamed_response = CustomStreamWrapper(completion_stream=response.iter_lines(), model=model, custom_llm_provider="huggingface", logging_obj=logging_obj)
                content = ""
                for chunk in streamed_response:
                    content += chunk["choices"][0]["delta"]["content"]
                completion_response: List[Dict[str, Any]] = [{"generated_text": content}]
                ## LOGGING
                logging_obj.post_call(
                    input=input_text,
                    api_key=api_key,
                    original_response=completion_response,
                    additional_args={"complete_input_dict": data, "task": task},
                )
            else:
                ## LOGGING
                logging_obj.post_call(
                    input=input_text,
                    api_key=api_key,
                    original_response=response.text,
                    additional_args={"complete_input_dict": data, "task": task},
                )
                ## RESPONSE OBJECT
                try:
                    completion_response = response.json()
                except:
                    raise HuggingfaceError(
                        message=f"Original Response received: {response.text}; Stacktrace: {traceback.format_exc()}", status_code=response.status_code
                    )
            print_verbose(f"response: {completion_response}")
            if isinstance(completion_response, dict) and "error" in completion_response:
                print_verbose(f"completion error: {completion_response['error']}")
                print_verbose(f"response.status_code: {response.status_code}")
                raise HuggingfaceError(
                    message=completion_response["error"],
                    status_code=response.status_code,
                )
            else:
                if task == "conversational":
                    if len(completion_response["generated_text"]) > 0: # type: ignore
                        model_response["choices"][0]["message"][
                            "content"
                        ] = completion_response["generated_text"] # type: ignore
                elif task == "text-generation-inference":
                    if len(completion_response[0]["generated_text"]) > 0:
                        model_response["choices"][0]["message"][
                            "content"
                        ] = completion_response[0]["generated_text"]
                    ## GETTING LOGPROBS + FINISH REASON
                    if "details" in completion_response[0] and "tokens" in completion_response[0]["details"]:
                        model_response.choices[0].finish_reason = completion_response[0]["details"]["finish_reason"]
                        sum_logprob = 0
                        for token in completion_response[0]["details"]["tokens"]:
                            sum_logprob += token["logprob"]
                        model_response["choices"][0]["message"]._logprob = sum_logprob
                    if "best_of" in optional_params and optional_params["best_of"] > 1:
                        if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
                            choices_list = []
                            for idx, item in enumerate(completion_response[0]["details"]["best_of_sequences"]):
                                sum_logprob = 0
                                for token in item["tokens"]:
                                    sum_logprob += token["logprob"]
                                if len(item["generated_text"]) > 0:
                                    message_obj = Message(content=item["generated_text"], logprobs=sum_logprob)
                                else:
                                    message_obj = Message(content=None)
                                choice_obj = Choices(finish_reason=item["finish_reason"], index=idx+1, message=message_obj)
                                choices_list.append(choice_obj)
                            model_response["choices"].extend(choices_list)
                else:
                    if len(completion_response[0]["generated_text"]) > 0:
                        model_response["choices"][0]["message"][
                            "content"
                        ] = completion_response[0]["generated_text"]
            ## CALCULATING USAGE
            prompt_tokens = 0
            try:
                prompt_tokens = len(
                    encoding.encode(input_text)
                ) ##[TODO] use the llama2 tokenizer here
            except:
                # this should remain non blocking we should not block a response returning if calculating usage fails
                pass
            print_verbose(f'output: {model_response["choices"][0]["message"]}')
            output_text = model_response["choices"][0]["message"].get("content", "")
            if output_text is not None and len(output_text) > 0:
                completion_tokens = 0
                try:
                    completion_tokens = len(
                        encoding.encode(model_response["choices"][0]["message"].get("content", ""))
                    ) ##[TODO] use the llama2 tokenizer here
                except:
                    # this should remain non blocking we should not block a response returning if calculating usage fails
                    pass
            else:
                completion_tokens = 0

            model_response["created"] = time.time()
            model_response["model"] = model
            model_response.usage.completion_tokens = completion_tokens
            model_response.usage.prompt_tokens = prompt_tokens
            model_response.usage.total_tokens = prompt_tokens + completion_tokens
            model_response._hidden_params["original_response"] = completion_response
            return model_response
    except HuggingfaceError as e:
        raise e
    except Exception as e:
        import traceback
        raise HuggingfaceError(status_code=500, message=traceback.format_exc())


def embedding(

The removed side of the hunk contained the same statements one indentation level shallower (no try wrapper), and raised the JSON-parse failure as raise HuggingfaceError(message=response.text, status_code=response.status_code).
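The practical effect of the wrapper is that anything unexpected thrown inside the Hugging Face call path now surfaces as a HuggingfaceError carrying the full stacktrace, while already-mapped errors pass through untouched. A stripped-down sketch of that pattern (the exception class body and the failing call are illustrative stand-ins, not litellm's actual code):

    import traceback

    class HuggingfaceError(Exception):
        # illustrative stand-in for the provider-specific exception type
        def __init__(self, status_code, message):
            self.status_code = status_code
            self.message = message
            super().__init__(message)

    def guarded_completion():
        try:
            raise ValueError("boom")  # stand-in for the real request/response handling
        except HuggingfaceError as e:
            raise e  # provider errors raised inside the body are re-raised unchanged
        except Exception:
            # everything else becomes a 500 with the stacktrace in the message
            raise HuggingfaceError(status_code=500, message=traceback.format_exc())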
@@ -1961,8 +1961,7 @@ def moderation(input: str, api_key: Optional[str]=None):
 ## Set verbose to true -> ```litellm.set_verbose = True```
 def print_verbose(print_statement):
     if litellm.set_verbose:
-        import logging
-        logging.info(f"LiteLLM: {print_statement}")
+        print(print_statement) # noqa

 def config_completion(**kwargs):
     if litellm.config_path != None:
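Why the swap matters: with no logging configuration, Python's root logger only emits records at WARNING and above, so the removed logging.info(...) calls produced no output even when litellm.set_verbose was on. A quick illustration (plain Python, not litellm code):

    import logging

    logging.info("LiteLLM: dropped at the default WARNING threshold")  # old behaviour: silent
    print("LiteLLM: always reaches stdout")                            # new behaviour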
@@ -52,6 +52,7 @@ def is_port_in_use(port):
 @click.command()
 @click.option('--host', default='0.0.0.0', help='Host for the server to listen on.')
 @click.option('--port', default=8000, help='Port to bind the server to.')
+@click.option('--num_workers', default=1, help='Number of uvicorn workers to spin up')
 @click.option('--api_base', default=None, help='API base URL.')
 @click.option('--api_version', default="2023-07-01-preview", help='For azure - pass in the api version.')
 @click.option('--model', '-m', default=None, help='The model name to pass to litellm expects')

@@ -74,17 +75,17 @@ def is_port_in_use(port):
 @click.option('--test', flag_value=True, help='proxy chat completions url to make a test request to')
 @click.option('--local', is_flag=True, default=False, help='for local debugging')
 @click.option('--cost', is_flag=True, default=False, help='for viewing cost logs')
-def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost):
+def run_server(host, port, api_base, api_version, model, alias, add_key, headers, save, debug, temperature, max_tokens, request_timeout, drop_params, create_proxy, add_function_to_prompt, config, file, max_budget, telemetry, logs, test, local, cost, num_workers):
     global feature_telemetry
     args = locals()
     if local:
-        from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
+        from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
         debug = True
     else:
         try:
-            from .proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
+            from .proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
         except ImportError as e:
-            from proxy_server import app, initialize, print_cost_logs, usage_telemetry, add_keys_to_config
+            from proxy_server import app, save_worker_config, print_cost_logs, usage_telemetry, add_keys_to_config
     feature_telemetry = usage_telemetry
     if create_proxy == True:
         repo_url = 'https://github.com/BerriAI/litellm'

@@ -163,7 +164,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers
     else:
         if headers:
             headers = json.loads(headers)
-        initialize(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config)
+        save_worker_config(model=model, alias=alias, api_base=api_base, api_version=api_version, debug=debug, temperature=temperature, max_tokens=max_tokens, request_timeout=request_timeout, max_budget=max_budget, telemetry=telemetry, drop_params=drop_params, add_function_to_prompt=add_function_to_prompt, headers=headers, save=save, config=config)
         try:
             import uvicorn
         except:

@@ -174,7 +175,7 @@ def run_server(host, port, api_base, api_version, model, alias, add_key, headers

     if port == 8000 and is_port_in_use(port):
         port = random.randint(1024, 49152)
-    uvicorn.run(app, host=host, port=port)
+    uvicorn.run("proxy_server:app", host=host, port=port, workers=num_workers)


 if __name__ == "__main__":
@@ -129,11 +129,12 @@ llm_router: Optional[litellm.Router] = None
 llm_model_list: Optional[list] = None
 server_settings: dict = {}
 log_file = "api_log.json"
+worker_config = None

 #### HELPER FUNCTIONS ####
 def print_verbose(print_statement):
     global user_debug
+    print(f"user debug value: {user_debug}")
     if user_debug:
         print(print_statement)

@@ -337,6 +338,9 @@ def load_config():
     except:
         pass

+def save_worker_config(**data):
+    import json
+    os.environ["WORKER_CONFIG"] = json.dumps(data)

 def initialize(
     model,

@@ -532,6 +536,7 @@ def litellm_completion(*args, **kwargs):
                 for key, value in m["litellm_params"].items():
                     kwargs[key] = value
                 break
+    print(f"litellm set verbose pre-call: {litellm.set_verbose}")
     if call_type == "chat_completion":
         response = litellm.completion(*args, **kwargs)
     elif call_type == "text_completion":

@@ -540,6 +545,14 @@ def litellm_completion(*args, **kwargs):
         return StreamingResponse(data_generator(response), media_type='text/event-stream')
     return response

+
+@app.on_event("startup")
+def startup_event():
+    import json
+    worker_config = json.loads(os.getenv("WORKER_CONFIG"))
+    initialize(**worker_config)
+    print(f"\033[32mWorker Initialized\033[0m\n")
+
 #### API ENDPOINTS ####
 @router.get("/v1/models")
 @router.get("/models") # if project requires model list
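Together with the new --num_workers option and the uvicorn.run("proxy_server:app", ..., workers=num_workers) call in the run_server() hunks above, these additions hand the CLI options to every worker process through an environment variable: save_worker_config() serializes them into WORKER_CONFIG before uvicorn starts the workers, and each worker's startup hook reads them back and calls initialize(). A minimal self-contained sketch of the same pattern (the module name worker_demo and the option values are illustrative, not litellm's):

    import json
    import os

    import uvicorn
    from fastapi import FastAPI

    app = FastAPI()

    def save_worker_config(**data):
        # stash the CLI options where child worker processes can see them
        os.environ["WORKER_CONFIG"] = json.dumps(data)

    @app.on_event("startup")
    def startup_event():
        # every uvicorn worker re-imports this module and configures itself from the env var
        worker_config = json.loads(os.environ["WORKER_CONFIG"])
        print(f"worker initialized with: {worker_config}")

    if __name__ == "__main__":
        save_worker_config(model="gpt-3.5-turbo", debug=True)
        # passing an import string ("module:attr") instead of the app object is what
        # makes workers > 1 possible; this assumes the file is saved as worker_demo.py
        uvicorn.run("worker_demo:app", host="0.0.0.0", port=8000, workers=2)

Routing the options through the environment (rather than module globals) is what survives the process boundary: uvicorn spawns fresh worker processes, which inherit the parent's environment variables but not its in-memory state.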
@@ -285,8 +285,7 @@ class TextCompletionResponse(OpenAIObject):
 ############################################################
 def print_verbose(print_statement):
     if litellm.set_verbose:
-        import logging
-        logging.info(f"LiteLLM: {print_statement}")
+        print(print_statement) # noqa

 ####### LOGGING ###################
 from enum import Enum