fix(ollama.py): use tiktoken as backup for prompt token counting

Krrish Dholakia 2024-01-09 09:47:18 +05:30 committed by ishaan-jaff
parent 22e0a6c7df
commit 22a900463e
4 changed files with 12 additions and 12 deletions
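For context: get_ollama_response() and ollama_acompletion() previously indexed response_json["prompt_eval_count"] directly, which raises a KeyError whenever Ollama's response omits that field; the change below falls back to counting the prompt tokens with the encoding object instead (tiktoken in LiteLLM's default setup). A minimal sketch of the fallback pattern, using tiktoken's cl100k_base encoding as an assumed stand-in for the encoding LiteLLM passes into these handlers:

import tiktoken

# Assumed stand-in for the `encoding` argument the Ollama handlers receive.
encoding = tiktoken.get_encoding("cl100k_base")

def count_prompt_tokens(response_json: dict, prompt: str) -> int:
    # Prefer Ollama's own count when present; otherwise tokenize the prompt locally.
    return response_json.get("prompt_eval_count", len(encoding.encode(prompt)))

# A response without "prompt_eval_count" still yields a usable token count.
print(count_prompt_tokens({"eval_count": 42}, "What's the weather in SF?"))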

View file

@@ -217,7 +217,7 @@ def get_ollama_response(
model_response["choices"][0]["message"]["content"] = response_json["response"]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + model
-prompt_tokens = response_json["prompt_eval_count"] # type: ignore
+prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt))) # type: ignore
completion_tokens = response_json["eval_count"]
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,
@@ -318,7 +318,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
]
model_response["created"] = int(time.time())
model_response["model"] = "ollama/" + data["model"]
-prompt_tokens = response_json["prompt_eval_count"] # type: ignore
+prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"]))) # type: ignore
completion_tokens = response_json["eval_count"]
model_response["usage"] = litellm.Usage(
prompt_tokens=prompt_tokens,

View file

@@ -1,5 +1,5 @@
model_list:
-  - model_name: text-davinci-003
+  - model_name: gpt-3.5-turbo-instruct
    litellm_params:
      model: ollama/zephyr
  - model_name: gpt-4
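The proxy config change only renames the client-facing alias from the deprecated text-davinci-003 to gpt-3.5-turbo-instruct; the deployment behind it is still ollama/zephyr. A rough in-code illustration of how such an alias resolves, sketched here with litellm.Router under the assumption that a local Ollama instance serves the zephyr model:

import litellm

# Alias -> deployment mapping mirroring the config above (local Ollama assumed).
router = litellm.Router(
    model_list=[
        {
            "model_name": "gpt-3.5-turbo-instruct",  # alias that clients request
            "litellm_params": {"model": "ollama/zephyr"},
        }
    ]
)

# A request for the alias is forwarded to the ollama/zephyr deployment.
response = router.completion(
    model="gpt-3.5-turbo-instruct",
    messages=[{"role": "user", "content": "Hello, how are you?"}],
)
print(response["choices"][0]["message"]["content"])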

View file

@@ -602,7 +602,7 @@ def openai_text_completion_test():
try:
# OVERRIDE WITH DYNAMIC MAX TOKENS
response_1 = litellm.completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
messages=[
{
"content": "Hello, how are you? Be as verbose as possible",
@@ -616,7 +616,7 @@ def openai_text_completion_test():
# USE CONFIG TOKENS
response_2 = litellm.completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
messages=[
{
"content": "Hello, how are you? Be as verbose as possible",
@@ -630,7 +630,7 @@ def openai_text_completion_test():
assert len(response_2_text) < len(response_1_text)
response_3 = litellm.completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
messages=[{"content": "Hello, how are you?", "role": "user"}],
n=2,
)

View file

@@ -2682,7 +2682,7 @@ def test_completion_openai_prompt():
try:
print("\n text 003 test\n")
response = text_completion(
model="text-davinci-003", prompt="What's the weather in SF?"
model="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?"
)
print(response)
response_str = response["choices"][0]["text"]
@@ -2700,7 +2700,7 @@ def test_completion_openai_engine_and_model():
print("\n text 003 test\n")
litellm.set_verbose = True
response = text_completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
engine="anything",
prompt="What's the weather in SF?",
max_tokens=5,
@@ -2721,7 +2721,7 @@ def test_completion_openai_engine():
print("\n text 003 test\n")
litellm.set_verbose = True
response = text_completion(
engine="text-davinci-003", prompt="What's the weather in SF?", max_tokens=5
engine="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?", max_tokens=5
)
print(response)
response_str = response["choices"][0]["text"]
@@ -2757,7 +2757,7 @@ def test_text_completion_basic():
print("\n test 003 with echo and logprobs \n")
litellm.set_verbose = False
response = text_completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
prompt="good morning",
max_tokens=10,
logprobs=10,
@@ -2779,7 +2779,7 @@ def test_completion_text_003_prompt_array():
try:
litellm.set_verbose = False
response = text_completion(
model="text-davinci-003",
model="gpt-3.5-turbo-instruct",
prompt=token_prompt, # token prompt is a 2d list
)
print("\n\n response")
@@ -2857,7 +2857,7 @@ def test_text_completion_stream():
# async def test_text_completion_async_stream():
# try:
# response = await atext_completion(
# model="text-completion-openai/text-davinci-003",
# model="text-completion-openai/gpt-3.5-turbo-instruct",
# prompt="good morning",
# stream=True,
# max_tokens=10,
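The final hunk only updates a commented-out async streaming test; for completeness, a rough sketch of how that test might look if re-enabled, assuming an OpenAI API key in the environment and that the awaited call yields an async iterator of chunks when stream=True:

import asyncio
from litellm import atext_completion

async def stream_good_morning():
    # Hypothetical standalone version of the commented-out test above.
    response = await atext_completion(
        model="text-completion-openai/gpt-3.5-turbo-instruct",
        prompt="good morning",
        stream=True,
        max_tokens=10,
    )
    # With stream=True, chunks arrive incrementally.
    async for chunk in response:
        print(chunk)

asyncio.run(stream_good_morning())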