docs(proxy_server.md): update proxy server docs to include multi-agent autogen tutorial
This commit is contained in:
parent ed7cf37e68
commit dcb866b353
9 changed files with 122 additions and 19 deletions
@@ -208,6 +208,85 @@ user_proxy.initiate_chat(assistant, message="Plot a chart of META and TESLA stoc
 Credits [@victordibia](https://github.com/microsoft/autogen/issues/45#issuecomment-1749921972) for this tutorial.
 
 </TabItem>
+<TabItem value="multi-agent AutoGen" label="AutoGen Multi-Agent">
+
+```python
+from autogen import AssistantAgent, GroupChatManager, UserProxyAgent
+from autogen.agentchat import GroupChat
+
+config_list = [
+    {
+        "model": "ollama/mistralorca",
+        "api_base": "http://localhost:8000",  # litellm compatible endpoint
+        "api_type": "open_ai",
+        "api_key": "NULL",  # just a placeholder
+    }
+]
+llm_config = {"config_list": config_list, "seed": 42}
+
+code_config_list = [
+    {
+        "model": "ollama/phind-code",
+        "api_base": "http://localhost:8000",  # litellm compatible endpoint
+        "api_type": "open_ai",
+        "api_key": "NULL",  # just a placeholder
+    }
+]
+
+code_config = {"config_list": code_config_list, "seed": 42}
+
+admin = UserProxyAgent(
+    name="Admin",
+    system_message="A human admin. Interact with the planner to discuss the plan. Plan execution needs to be approved by this admin.",
+    llm_config=llm_config,
+    code_execution_config=False,
+)
+
+engineer = AssistantAgent(
+    name="Engineer",
+    llm_config=code_config,
+    system_message="""Engineer. You follow an approved plan. You write python/shell code to solve tasks. Wrap the code in a code block that specifies the script type. The user can't modify your code. So do not suggest incomplete code which requires others to modify. Don't use a code block if it's not intended to be executed by the executor.
+Don't include multiple code blocks in one response. Do not ask others to copy and paste the result. Check the execution result returned by the executor.
+If the result indicates there is an error, fix the error and output the code again. Suggest the full code instead of partial code or code changes. If the error can't be fixed or if the task is not solved even after the code is executed successfully, analyze the problem, revisit your assumption, collect additional info you need, and think of a different approach to try.
+""",
+)
+planner = AssistantAgent(
+    name="Planner",
+    system_message="""Planner. Suggest a plan. Revise the plan based on feedback from admin and critic, until admin approval.
+The plan may involve an engineer who can write code and a scientist who doesn't write code.
+Explain the plan first. Be clear which step is performed by an engineer, and which step is performed by a scientist.
+""",
+    llm_config=llm_config,
+)
+executor = UserProxyAgent(
+    name="Executor",
+    system_message="Executor. Execute the code written by the engineer and report the result.",
+    human_input_mode="NEVER",
+    llm_config=llm_config,
+    code_execution_config={"last_n_messages": 3, "work_dir": "paper"},
+)
+critic = AssistantAgent(
+    name="Critic",
+    system_message="Critic. Double check plan, claims, code from other agents and provide feedback. Check whether the plan includes adding verifiable info such as source URL.",
+    llm_config=llm_config,
+)
+groupchat = GroupChat(
+    agents=[admin, engineer, planner, executor, critic],
+    messages=[],
+    max_round=50,
+)
+manager = GroupChatManager(groupchat=groupchat, llm_config=llm_config)
+
+admin.initiate_chat(
+    manager,
+    message="""
+    """,
+)
+```
+
+Credits [@Nathan](https://gist.github.com/CUexter) for this tutorial.
+
+</TabItem>
 <TabItem value="langroid" label="Langroid">
 
 ```python
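The tutorial added above assumes a LiteLLM proxy is already serving an OpenAI-compatible API at `http://localhost:8000`. As a quick sanity check before starting the agents, a minimal sketch (assuming the pre-1.0 `openai` Python client and reusing the `ollama/mistralorca` model name from the tutorial's `config_list`; neither detail comes from this commit):

```python
# Minimal sketch: verify the LiteLLM proxy endpoint used in the tutorial responds.
# Assumes a proxy such as `litellm --model ollama/mistralorca` is already running
# on localhost:8000 and that the openai<1.0 client is installed.
import openai

openai.api_base = "http://localhost:8000"  # same endpoint as the tutorial's config_list
openai.api_key = "NULL"                    # placeholder; the proxy does not check it

response = openai.ChatCompletion.create(
    model="ollama/mistralorca",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(response["choices"][0]["message"]["content"])
```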
Binary file not shown.
Binary file not shown.
@@ -121,7 +121,7 @@ def completion(
             sum_logprob = 0
             for token in completion_response[0]["details"]["tokens"]:
                 sum_logprob += token["logprob"]
-            model_response["choices"][0]["message"]["logprobs"] = sum_logprob
+            model_response["choices"][0]["message"]._logprobs = sum_logprob
         else:
             raise BasetenError(
                 message=f"Unable to parse response. Original response: {response.text}",
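Both log-probability hunks in this commit aggregate the same way: they walk a `details.tokens` list from the provider response and sum each token's `logprob` onto the returned message. A standalone sketch of that aggregation, with the field names taken from the diff above rather than from any provider documentation:

```python
# Sketch only: sum per-token logprobs from a completion payload shaped like the
# one handled in the hunk above ("details" -> "tokens" -> "logprob").
def sum_token_logprobs(completion_response: list) -> float:
    total = 0.0
    for token in completion_response[0]["details"]["tokens"]:
        total += token["logprob"]
    return total

# Hypothetical payload, only to show the expected shape.
example = [{"details": {"tokens": [{"logprob": -0.25}, {"logprob": -1.5}]}}]
print(sum_token_logprobs(example))  # -1.75
```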
@@ -141,7 +141,6 @@ def completion(
     litellm_params=None,
     logger_fn=None,
 ):
-    print(f'headers inside hf rest api: {headers}')
     headers = validate_environment(api_key, headers)
     task = get_hf_task_for_model(model)
     print_verbose(f"{model}, {task}")
@@ -254,8 +253,6 @@ def completion(
 
     ## Some servers might return streaming responses even though stream was not set to true. (e.g. Baseten)
     is_streamed = False
-    print(f"response keys: {response.__dict__.keys()}")
-    print(f"response keys: {response.__dict__['headers']}")
     if response.__dict__['headers']["Content-Type"] == "text/event-stream":
         is_streamed = True
 
@@ -313,7 +310,7 @@ def completion(
         sum_logprob = 0
         for token in completion_response[0]["details"]["tokens"]:
             sum_logprob += token["logprob"]
-        model_response["choices"][0]["message"]["logprobs"] = sum_logprob
+        model_response["choices"][0]["message"]._logprob = sum_logprob
     if "best_of" in optional_params and optional_params["best_of"] > 1:
         if "details" in completion_response[0] and "best_of_sequences" in completion_response[0]["details"]:
             choices_list = []
@@ -337,9 +334,14 @@ def completion(
     prompt_tokens = len(
         encoding.encode(input_text)
     ) ##[TODO] use the llama2 tokenizer here
-    completion_tokens = len(
-        encoding.encode(model_response["choices"][0]["message"].get("content", ""))
-    ) ##[TODO] use the llama2 tokenizer here
+    print_verbose(f'output: {model_response["choices"][0]["message"]}')
+    output_text = model_response["choices"][0]["message"].get("content", "")
+    if output_text is not None and len(output_text) > 0:
+        completion_tokens = len(
+            encoding.encode(model_response["choices"][0]["message"].get("content", ""))
+        ) ##[TODO] use the llama2 tokenizer here
+    else:
+        completion_tokens = 0
 
     model_response["created"] = time.time()
     model_response["model"] = model
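The hunk above guards the completion-token count so an empty or missing `content` no longer breaks token counting. A hedged standalone version of that guard, using `tiktoken` as a stand-in for the `encoding` object passed around in the real code (which still carries a TODO to switch to the llama2 tokenizer):

```python
# Sketch: count completion tokens only when the model actually returned text.
# `tiktoken` and the cl100k_base encoding are assumptions for this example; the
# diff above uses an `encoding` object supplied by the caller.
from typing import Optional

import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def count_completion_tokens(output_text: Optional[str]) -> int:
    # None and "" both count as no completion text
    if output_text is not None and len(output_text) > 0:
        return len(encoding.encode(output_text))
    return 0

print(count_completion_tokens("hello world"))  # 2 with this encoding
print(count_completion_tokens(None))           # 0
```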
@@ -729,7 +729,6 @@ def completion(
         headers
         or litellm.headers
     )
-    print(f'headers before hf rest api: {hf_headers}')
     model_response = huggingface_restapi.completion(
         model=model,
         messages=messages,
@@ -9,6 +9,7 @@ import backoff
 import openai.error
 
 import litellm
+from litellm.utils import trim_messages
 import litellm.exceptions
 
 cost_dict: Dict[str, Dict[str, float]] = defaultdict(dict)
@@ -1,6 +1,7 @@
 import sys, os, platform, time, copy
 import threading
 import shutil, random, traceback
+messages = []
 sys.path.insert(
     0, os.path.abspath("../..")
 )  # Adds the parent directory to the system path - for litellm local dev
@@ -70,6 +71,7 @@ print()
 import litellm
 from fastapi import FastAPI, Request
 from fastapi.routing import APIRouter
+from fastapi.encoders import jsonable_encoder
 from fastapi.responses import StreamingResponse, FileResponse
 from fastapi.middleware.cors import CORSMiddleware
 import json
@@ -106,6 +108,12 @@ def print_verbose(print_statement):
     if user_debug:
         print(print_statement)
 
+def find_avatar_url(role):
+    role = role.replace(" ", "%20")
+    avatar_filename = f"avatars/{role}.png"
+    avatar_url = f"/static/{avatar_filename}"
+    return avatar_url
+
 def usage_telemetry(feature: str): # helps us know if people are using this feature. Set `litellm --telemetry False` to your cli call to turn this off
     if user_telemetry:
         data = {
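For reference, the new `find_avatar_url` helper only swaps spaces for `%20` and builds a static path; a hypothetical call such as `find_avatar_url("Product Manager")` would return `/static/avatars/Product%20Manager.png`. It does not otherwise URL-encode the role name.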
|
@ -162,6 +170,13 @@ def save_params_to_config(data: dict):
|
||||||
with open(user_config_path, 'wb') as f:
|
with open(user_config_path, 'wb') as f:
|
||||||
tomli_w.dump(config, f)
|
tomli_w.dump(config, f)
|
||||||
|
|
||||||
|
def print_cost_logs():
|
||||||
|
with open('costs.json', 'r') as f:
|
||||||
|
# print this in green
|
||||||
|
print("\033[1;32m")
|
||||||
|
print(f.read())
|
||||||
|
print("\033[0m")
|
||||||
|
return
|
||||||
|
|
||||||
def load_config():
|
def load_config():
|
||||||
try:
|
try:
|
||||||
|
@ -469,13 +484,20 @@ async def v1_chat_completion(request: Request):
|
||||||
response = litellm_completion(data, type="chat_completion")
|
response = litellm_completion(data, type="chat_completion")
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def print_cost_logs():
|
@router.post("/send_message")
|
||||||
with open('costs.json', 'r') as f:
|
async def send_message(request: Request):
|
||||||
# print this in green
|
try:
|
||||||
print("\033[1;32m")
|
data = await request.json()
|
||||||
print(f.read())
|
role = data.get("role")
|
||||||
print("\033[0m")
|
text = data.get("text")
|
||||||
return
|
|
||||||
|
avatarUrl = find_avatar_url(role)
|
||||||
|
|
||||||
|
message = {"role": role, "text": text, "avatarUrl": avatarUrl}
|
||||||
|
messages.append(message)
|
||||||
|
return jsonable_encoder(messages)
|
||||||
|
except:
|
||||||
|
return "An error occurred", 500
|
||||||
|
|
||||||
@router.get("/ollama_logs")
|
@router.get("/ollama_logs")
|
||||||
async def retrieve_server_log(request: Request):
|
async def retrieve_server_log(request: Request):
|
||||||
|
|
|
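For completeness, a hedged example of exercising the new `/send_message` route once the proxy is running (the payload field names come from the handler above; the host and port are assumptions, matching the docs tutorial's endpoint):

```python
# Sketch: post a chat message to the new /send_message endpoint and print the
# accumulated message list the handler returns. Assumes the proxy is reachable
# at localhost:8000 and that the `requests` package is installed.
import requests

resp = requests.post(
    "http://localhost:8000/send_message",
    json={"role": "user", "text": "hello from the test client"},
)
print(resp.status_code)
print(resp.json())  # list of {"role", "text", "avatarUrl"} dicts
```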
@@ -119,7 +119,7 @@ class Message(OpenAIObject):
         super(Message, self).__init__(**params)
         self.content = content
         self.role = role
-        self.logprobs = logprobs
+        self._logprobs = logprobs
 
 class Delta(OpenAIObject):
     def __init__(self, content=None, logprobs=None, role=None, **params):