fix(ollama.py): use tiktoken as backup for prompt token counting

Krrish Dholakia authored on 2024-01-09 09:47:18 +05:30; committed by ishaan-jaff
parent 22e0a6c7df
commit 22a900463e
4 changed files with 12 additions and 12 deletions


@@ -217,7 +217,7 @@ def get_ollama_response(
     model_response["choices"][0]["message"]["content"] = response_json["response"]
     model_response["created"] = int(time.time())
     model_response["model"] = "ollama/" + model
-    prompt_tokens = response_json["prompt_eval_count"]  # type: ignore
+    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(prompt)))  # type: ignore
     completion_tokens = response_json["eval_count"]
     model_response["usage"] = litellm.Usage(
         prompt_tokens=prompt_tokens,
@@ -318,7 +318,7 @@ async def ollama_acompletion(url, data, model_response, encoding, logging_obj):
     ]
     model_response["created"] = int(time.time())
     model_response["model"] = "ollama/" + data["model"]
-    prompt_tokens = response_json["prompt_eval_count"]  # type: ignore
+    prompt_tokens = response_json.get("prompt_eval_count", len(encoding.encode(data["prompt"])))  # type: ignore
     completion_tokens = response_json["eval_count"]
     model_response["usage"] = litellm.Usage(
         prompt_tokens=prompt_tokens,

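For context, the change above replaces a hard KeyError path with a fallback: when Ollama's response does not include `prompt_eval_count`, the prompt is tokenized locally with the `encoding` object (a tiktoken encoding) that the handler already receives. A minimal sketch of that pattern follows; `count_prompt_tokens` is a hypothetical helper written for illustration, not part of the patch, and the `cl100k_base` default is an assumption.

```python
import tiktoken


def count_prompt_tokens(response_json: dict, prompt: str, encoding=None) -> int:
    """Prefer Ollama's reported prompt_eval_count; otherwise count tokens locally.

    Hypothetical helper illustrating the fallback in this commit. `encoding` is
    assumed to be a tiktoken encoding, mirroring the `encoding` argument the
    Ollama handlers receive; cl100k_base is only a stand-in default.
    """
    if encoding is None:
        encoding = tiktoken.get_encoding("cl100k_base")
    # .get() avoids a KeyError when Ollama omits prompt_eval_count in its response.
    return response_json.get("prompt_eval_count", len(encoding.encode(prompt)))


# Example: a response that reports the field vs. one that omits it.
print(count_prompt_tokens({"prompt_eval_count": 12}, "hello"))  # -> 12
print(count_prompt_tokens({}, "hello world"))                   # -> token count from tiktoken
```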

@@ -1,5 +1,5 @@
 model_list:
-  - model_name: text-davinci-003
+  - model_name: gpt-3.5-turbo-instruct
     litellm_params:
       model: ollama/zephyr
   - model_name: gpt-4


@@ -602,7 +602,7 @@ def openai_text_completion_test():
     try:
         # OVERRIDE WITH DYNAMIC MAX TOKENS
        response_1 = litellm.completion(
-            model="text-davinci-003",
+            model="gpt-3.5-turbo-instruct",
             messages=[
                 {
                     "content": "Hello, how are you? Be as verbose as possible",
@@ -616,7 +616,7 @@ def openai_text_completion_test():
         # USE CONFIG TOKENS
         response_2 = litellm.completion(
-            model="text-davinci-003",
+            model="gpt-3.5-turbo-instruct",
             messages=[
                 {
                     "content": "Hello, how are you? Be as verbose as possible",
@@ -630,7 +630,7 @@ def openai_text_completion_test():
         assert len(response_2_text) < len(response_1_text)
         response_3 = litellm.completion(
-            model="text-davinci-003",
+            model="gpt-3.5-turbo-instruct",
             messages=[{"content": "Hello, how are you?", "role": "user"}],
             n=2,
         )


@@ -2682,7 +2682,7 @@ def test_completion_openai_prompt():
     try:
         print("\n text 003 test\n")
         response = text_completion(
-            model="text-davinci-003", prompt="What's the weather in SF?"
+            model="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?"
         )
         print(response)
         response_str = response["choices"][0]["text"]
@@ -2700,7 +2700,7 @@ def test_completion_openai_engine_and_model():
         print("\n text 003 test\n")
         litellm.set_verbose = True
         response = text_completion(
-            model="text-davinci-003",
+            model="gpt-3.5-turbo-instruct",
             engine="anything",
             prompt="What's the weather in SF?",
             max_tokens=5,
@@ -2721,7 +2721,7 @@ def test_completion_openai_engine():
         print("\n text 003 test\n")
         litellm.set_verbose = True
         response = text_completion(
-            engine="text-davinci-003", prompt="What's the weather in SF?", max_tokens=5
+            engine="gpt-3.5-turbo-instruct", prompt="What's the weather in SF?", max_tokens=5
         )
         print(response)
         response_str = response["choices"][0]["text"]
@@ -2757,7 +2757,7 @@ def test_text_completion_basic():
     print("\n test 003 with echo and logprobs \n")
     litellm.set_verbose = False
     response = text_completion(
-        model="text-davinci-003",
+        model="gpt-3.5-turbo-instruct",
         prompt="good morning",
         max_tokens=10,
         logprobs=10,
@@ -2779,7 +2779,7 @@ def test_completion_text_003_prompt_array():
     try:
         litellm.set_verbose = False
         response = text_completion(
-            model="text-davinci-003",
+            model="gpt-3.5-turbo-instruct",
             prompt=token_prompt,  # token prompt is a 2d list
         )
         print("\n\n response")
@@ -2857,7 +2857,7 @@ def test_text_completion_stream():
 # async def test_text_completion_async_stream():
 #     try:
 #         response = await atext_completion(
-#             model="text-completion-openai/text-davinci-003",
+#             model="text-completion-openai/gpt-3.5-turbo-instruct",
 #             prompt="good morning",
 #             stream=True,
 #             max_tokens=10,
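
The remaining hunks only swap the deprecated text-davinci-003 model name for gpt-3.5-turbo-instruct in the proxy config and tests. For reference, a minimal call mirroring what the updated tests exercise; this snippet is illustrative, not part of the diff, and assumes an OpenAI API key is configured.

```python
import litellm

# Mirrors the updated tests: text-davinci-003 is deprecated, so the
# completions-style calls now target gpt-3.5-turbo-instruct instead.
response = litellm.text_completion(
    model="gpt-3.5-turbo-instruct",
    prompt="What's the weather in SF?",
    max_tokens=5,
)
print(response["choices"][0]["text"])
```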