mirror of https://github.com/BerriAI/litellm.git
synced 2025-04-26 03:04:13 +00:00

adding custom prompt templates to ollama

commit ed31860206 (parent 966ad27662)

7 changed files with 164 additions and 85 deletions

@@ -151,6 +151,38 @@ $ litellm --model command-nightly
 
 ### Deploy Proxy
 
+<Tabs>
+<TabItem value="self-hosted" label="Self-Hosted">
+
+**Step 1: Clone the repo**
+```shell
+git clone https://github.com/BerriAI/liteLLM-proxy.git
+```
+
+**Step 2: Put your API keys in .env**
+Copy the .env.template and put in the relevant keys (e.g. OPENAI_API_KEY="sk-..")
+
+**Step 3: Test your proxy**
+Start your proxy server
+```shell
+cd litellm-proxy && python3 main.py
+```
+
+Make your first call
+```python
+import openai
+
+openai.api_key = "sk-litellm-master-key"
+openai.api_base = "http://0.0.0.0:8080"
+
+response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=[{"role": "user", "content": "Hey"}])
+
+print(response)
+```
+
+</TabItem>
+<TabItem value="litellm-hosted" label="LiteLLM-Hosted">
+
 Deploy the proxy to https://api.litellm.ai
 
 ```shell

@@ -161,7 +193,6 @@ $ litellm --model claude-instant-1 --deploy
 ```
 
 This will host a ChatCompletions API at: https://api.litellm.ai/44508ad4
 
 #### Other supported models:
 <Tabs>
 <TabItem value="anthropic" label="Anthropic">

@@ -280,6 +311,8 @@ curl --location 'https://api.litellm.ai/44508ad4/chat/completions' \
 ```
 </TabItem>
 </Tabs>
+</TabItem>
+</Tabs>
 
 ## Setting api base, temperature, max tokens
 
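For reference, the openai SDK call in the new self-hosted tab is just an HTTP POST underneath. Below is a minimal sketch of the same request made with `requests`, assuming the proxy exposes the OpenAI-style `/chat/completions` route with Bearer auth; the header name and route are illustrative assumptions, not taken from the proxy code.

```python
import requests

# Assumed endpoint and key: the same values used in the SDK example above.
PROXY_BASE = "http://0.0.0.0:8080"
MASTER_KEY = "sk-litellm-master-key"

# POST an OpenAI-style chat completion request to the local proxy.
response = requests.post(
    f"{PROXY_BASE}/chat/completions",
    headers={"Authorization": f"Bearer {MASTER_KEY}"},
    json={
        "model": "gpt-3.5-turbo",
        "messages": [{"role": "user", "content": "Hey"}],
    },
)
print(response.json())
```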
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -29,22 +29,24 @@ from litellm.utils import (
     get_api_key,
     mock_completion_streaming_obj,
 )
-from .llms import anthropic
-from .llms import together_ai
-from .llms import ai21
-from .llms import sagemaker
-from .llms import bedrock
-from .llms import huggingface_restapi
-from .llms import replicate
-from .llms import aleph_alpha
-from .llms import nlp_cloud
-from .llms import baseten
-from .llms import vllm
-from .llms import ollama
-from .llms import cohere
-from .llms import petals
-from .llms import oobabooga
-from .llms import palm
+from .llms import (
+    anthropic,
+    together_ai,
+    ai21,
+    sagemaker,
+    bedrock,
+    huggingface_restapi,
+    replicate,
+    aleph_alpha,
+    nlp_cloud,
+    baseten,
+    vllm,
+    ollama,
+    cohere,
+    petals,
+    oobabooga,
+    palm)
+from .llms.prompt_templates.factory import prompt_factory, custom_prompt
 import tiktoken
 from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, List, Optional, Dict

@@ -1040,13 +1042,25 @@ def completion(
         response = model_response
     elif custom_llm_provider == "ollama":
         endpoint = (
-            litellm.api_base if litellm.api_base is not None else api_base
+            litellm.api_base
+            or api_base
+            or "http://localhost:11434"
         )
-        prompt = " ".join([message["content"] for message in messages])
+        if model in litellm.custom_prompt_dict:
+            # check if the model has a registered custom prompt
+            model_prompt_details = litellm.custom_prompt_dict[model]
+            prompt = custom_prompt(
+                role_dict=model_prompt_details["roles"],
+                initial_prompt_value=model_prompt_details["initial_prompt_value"],
+                final_prompt_value=model_prompt_details["final_prompt_value"],
+                messages=messages
+            )
+        else:
+            prompt = prompt_factory(model=model, messages=messages)
 
         ## LOGGING
         logging.pre_call(
-            input=prompt, api_key=None, additional_args={"endpoint": endpoint}
+            input=prompt, api_key=None, additional_args={"endpoint": endpoint, "custom_prompt_dict": litellm.custom_prompt_dict}
         )
         if kwargs.get('acompletion', False) == True:
             async_generator = ollama.async_get_ollama_response_stream(endpoint, model, prompt)
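To make the new branch concrete, the sketch below shows roughly what a registered template does to a message list. `render_custom_prompt` is a hypothetical stand-in for `custom_prompt()` from `litellm.llms.prompt_templates.factory`, written only to illustrate the pre_message/post_message wrapping; the real implementation may differ in detail.

```python
# Hypothetical stand-in for custom_prompt(), shown only to illustrate how a
# registered template could be applied to a message list.
def render_custom_prompt(role_dict, messages,
                         initial_prompt_value="", final_prompt_value=""):
    prompt = initial_prompt_value
    for message in messages:
        role = role_dict.get(message["role"], {})
        # Wrap each message's content in its role's pre/post markers.
        prompt += (
            role.get("pre_message", "")
            + message["content"]
            + role.get("post_message", "")
        )
    return prompt + final_prompt_value


# Roles taken from the commented-out Ollama test added in this commit.
roles = {
    "system": {"pre_message": "System: "},
    "user": {"pre_message": "User: "},
    "assistant": {"pre_message": "Assistant: "},
}
messages = [{"role": "user", "content": "what is litellm?"}]
print(render_custom_prompt(roles, messages))  # -> "User: what is litellm?"
```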
@@ -1,5 +1,5 @@
-# ##### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
-# # https://ollama.ai/
+# # ##### THESE TESTS CAN ONLY RUN LOCALLY WITH THE OLLAMA SERVER RUNNING ######
+# # # https://ollama.ai/
 
 # import sys, os
 # import traceback

@@ -16,27 +16,55 @@
 # user_message = "respond in 20 words. who are you?"
 # messages = [{ "content": user_message,"role": "user"}]
 
-# def test_completion_ollama():
-#     try:
-#         response = completion(
-#             model="ollama/llama2",
-#             messages=messages,
-#             api_base="http://localhost:11434"
-#         )
-#         print(response)
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
+# # def test_completion_ollama():
+# #     try:
+# #         response = completion(
+# #             model="ollama/llama2",
+# #             messages=messages,
+# #             api_base="http://localhost:11434"
+# #         )
+# #         print(response)
+# #     except Exception as e:
+# #         pytest.fail(f"Error occurred: {e}")
 
 # # test_completion_ollama()
 
-# def test_completion_ollama_stream():
+# # def test_completion_ollama_stream():
+# #     user_message = "what is litellm?"
+# #     messages = [{ "content": user_message,"role": "user"}]
+# #     try:
+# #         response = completion(
+# #             model="ollama/llama2",
+# #             messages=messages,
+# #             stream=True
+# #         )
+# #         print(response)
+# #         for chunk in response:
+# #             print(chunk)
+# #             # print(chunk['choices'][0]['delta'])
+
+# #     except Exception as e:
+# #         pytest.fail(f"Error occurred: {e}")
+
+# # test_completion_ollama_stream()
+
+
+# def test_completion_ollama_custom_prompt_template():
 #     user_message = "what is litellm?"
+#     litellm.register_prompt_template(
+#         model="llama2",
+#         roles={
+#             "system": {"pre_message": "System: "},
+#             "user": {"pre_message": "User: "},
+#             "assistant": {"pre_message": "Assistant: "}
+#         }
+#     )
 #     messages = [{ "content": user_message,"role": "user"}]
+#     litellm.set_verbose = True
 #     try:
 #         response = completion(
 #             model="ollama/llama2",
 #             messages=messages,
-#             api_base="http://localhost:11434",
 #             stream=True
 #         )
 #         print(response)

@@ -45,54 +73,54 @@
 #             # print(chunk['choices'][0]['delta'])
 
 #     except Exception as e:
+#         traceback.print_exc()
 #         pytest.fail(f"Error occurred: {e}")
 
-# test_completion_ollama_stream()
+# test_completion_ollama_custom_prompt_template()
 
-# async def test_completion_ollama_async_stream():
-#     user_message = "what is the weather"
-#     messages = [{ "content": user_message,"role": "user"}]
-#     try:
-#         response = await litellm.acompletion(
-#             model="ollama/llama2",
-#             messages=messages,
-#             api_base="http://localhost:11434",
-#             stream=True
-#         )
-#         async for chunk in response:
-#             print(chunk)
-
-#             # print(chunk['choices'][0]['delta'])
-
-#     except Exception as e:
-#         pytest.fail(f"Error occurred: {e}")
-
-# # import asyncio
-# # asyncio.run(test_completion_ollama_async_stream())
-
-# def prepare_messages_for_chat(text: str) -> list:
-#     messages = [
-#         {"role": "user", "content": text},
-#     ]
-#     return messages
-
-# async def ask_question():
-#     params = {
-#         "messages": prepare_messages_for_chat("What is litellm? tell me 10 things about it who is sihaan.write an essay"),
-#         "api_base": "http://localhost:11434",
-#         "model": "ollama/llama2",
-#         "stream": True,
-#     }
-#     response = await litellm.acompletion(**params)
-#     return response
-
-# async def main():
-#     response = await ask_question()
-#     async for chunk in response:
-#         print(chunk)
-
-# if __name__ == "__main__":
-#     import asyncio
-#     asyncio.run(main())
+# # async def test_completion_ollama_async_stream():
+# #     user_message = "what is the weather"
+# #     messages = [{ "content": user_message,"role": "user"}]
+# #     try:
+# #         response = await litellm.acompletion(
+# #             model="ollama/llama2",
+# #             messages=messages,
+# #             api_base="http://localhost:11434",
+# #             stream=True
+# #         )
+# #         async for chunk in response:
+# #             print(chunk)
+
+# # #             print(chunk['choices'][0]['delta'])
+
+# #     except Exception as e:
+# #         pytest.fail(f"Error occurred: {e}")
+
+# # # import asyncio
+# # # asyncio.run(test_completion_ollama_async_stream())
+
+# # def prepare_messages_for_chat(text: str) -> list:
+# #     messages = [
+# #         {"role": "user", "content": text},
+# #     ]
+# #     return messages
+
+# # async def ask_question():
+# #     params = {
+# #         "messages": prepare_messages_for_chat("What is litellm? tell me 10 things about it who is sihaan.write an essay"),
+# #         "api_base": "http://localhost:11434",
+# #         "model": "ollama/llama2",
+# #         "stream": True,
+# #     }
+# #     response = await litellm.acompletion(**params)
+# #     return response
+
+# # async def main():
+# #     response = await ask_question()
+# #     async for chunk in response:
+# #         print(chunk)
+
+# # if __name__ == "__main__":
+# #     import asyncio
+# #     asyncio.run(main())
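The double-commented async test above sketches the intended streaming flow. A runnable version of that flow, assuming a local Ollama server at http://localhost:11434 with the llama2 model pulled, looks like this:

```python
import asyncio
import litellm

async def stream_ollama():
    # Assumes `ollama serve` is running locally with the llama2 model pulled.
    response = await litellm.acompletion(
        model="ollama/llama2",
        messages=[{"role": "user", "content": "what is litellm?"}],
        api_base="http://localhost:11434",
        stream=True,
    )
    # Streaming returns an async generator of chunks.
    async for chunk in response:
        print(chunk)

if __name__ == "__main__":
    asyncio.run(stream_ollama())
```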
@@ -2081,24 +2081,28 @@ def modify_integration(integration_name, integration_params):
 # custom prompt helper function
 def register_prompt_template(model: str, roles: dict, initial_prompt_value: str = "", final_prompt_value: str = ""):
     """
+    Format the openai prompt, to follow your custom format.
     Example usage:
     ```
     import litellm
     litellm.register_prompt_template(
         model="llama-2",
+        initial_prompt_value="You are a good assistant" # [OPTIONAL]
         roles={
             "system": {
-                "pre_message": "[INST] <<SYS>>\n",
-                "post_message": "\n<</SYS>>\n [/INST]\n"
+                "pre_message": "[INST] <<SYS>>\n", # [OPTIONAL]
+                "post_message": "\n<</SYS>>\n [/INST]\n" # [OPTIONAL]
             },
-            "user": { # follow this format https://github.com/facebookresearch/llama/blob/77062717054710e352a99add63d160274ce670c6/llama/generation.py#L348
-                "pre_message": "[INST] ",
-                "post_message": " [/INST]\n"
+            "user": {
+                "pre_message": "[INST] ", # [OPTIONAL]
+                "post_message": " [/INST]" # [OPTIONAL]
             },
             "assistant": {
-                "post_message": "\n" # follows this - https://replicate.com/blog/how-to-prompt-llama
+                "pre_message": "\n" # [OPTIONAL]
+                "post_message": "\n" # [OPTIONAL]
             }
         }
+        final_prompt_value="Now answer as best you can:" # [OPTIONAL]
     )
     ```
     """
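Tying the docstring back to the Ollama change in this commit, here is a minimal end-to-end sketch (assuming a local Ollama server with llama2 pulled): register the llama-2 style template from the docstring, then let `completion()` pick it up via `litellm.custom_prompt_dict`.

```python
import litellm
from litellm import completion

# Register the llama-2 chat format from the docstring above; completion()
# will find it in litellm.custom_prompt_dict and format the prompt through
# custom_prompt() instead of prompt_factory().
litellm.register_prompt_template(
    model="llama2",
    initial_prompt_value="You are a good assistant",
    roles={
        "system": {"pre_message": "[INST] <<SYS>>\n", "post_message": "\n<</SYS>>\n [/INST]\n"},
        "user": {"pre_message": "[INST] ", "post_message": " [/INST]"},
        "assistant": {"pre_message": "\n", "post_message": "\n"},
    },
    final_prompt_value="Now answer as best you can:",
)

# Assumes a local Ollama server (http://localhost:11434) with llama2 pulled.
response = completion(
    model="ollama/llama2",
    messages=[{"role": "user", "content": "what is litellm?"}],
)
print(response)
```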