forked from phoenix-oss/llama-stack-mirror
		
	chore: remove llama_models.llama3.api imports from providers (#1107)
There should be a choke-point for llama3.api imports -- this is the prompt adapter. Creating a ChatFormat() object on demand is inexpensive. The underlying Tokenizer is a singleton anyway.
This commit is contained in:
		
							parent
							
								
									e9b8259cf9
								
							
						
					
					
						commit
						cdcbeb005b
					
				
					 13 changed files with 77 additions and 113 deletions
				
			
		|  | @ -7,7 +7,6 @@ import json | |||
| import logging | ||||
| from typing import AsyncGenerator, Dict, List, Optional, Union | ||||
| 
 | ||||
| from llama_models.llama3.api.chat_format import ChatFormat | ||||
| from openai.types.chat import ChatCompletionMessageToolCall | ||||
| from pydantic import BaseModel | ||||
| 
 | ||||
|  | @ -40,6 +39,7 @@ from llama_stack.models.llama.datatypes import ( | |||
| ) | ||||
| from llama_stack.providers.utils.inference.prompt_adapter import ( | ||||
|     convert_image_content_to_url, | ||||
|     decode_assistant_message, | ||||
| ) | ||||
| 
 | ||||
| logger = logging.getLogger(__name__) | ||||
|  | @ -149,7 +149,7 @@ def convert_openai_completion_logprobs_stream(text: str, logprobs: Optional[Unio | |||
|     return None | ||||
| 
 | ||||
| 
 | ||||
| def process_completion_response(response: OpenAICompatCompletionResponse, formatter: ChatFormat) -> CompletionResponse: | ||||
| def process_completion_response(response: OpenAICompatCompletionResponse) -> CompletionResponse: | ||||
|     choice = response.choices[0] | ||||
|     # drop suffix <eot_id> if present and return stop reason as end of turn | ||||
|     if choice.text.endswith("<|eot_id|>"): | ||||
|  | @ -174,16 +174,13 @@ def process_completion_response(response: OpenAICompatCompletionResponse, format | |||
| 
 | ||||
| def process_chat_completion_response( | ||||
|     response: OpenAICompatCompletionResponse, | ||||
|     formatter: ChatFormat, | ||||
|     request: ChatCompletionRequest, | ||||
| ) -> ChatCompletionResponse: | ||||
|     choice = response.choices[0] | ||||
| 
 | ||||
|     # TODO: This does not work well with tool calls for vLLM remote provider | ||||
|     #   Ref: https://github.com/meta-llama/llama-stack/issues/1058 | ||||
|     raw_message = formatter.decode_assistant_message_from_content( | ||||
|         text_from_choice(choice), get_stop_reason(choice.finish_reason) | ||||
|     ) | ||||
|     raw_message = decode_assistant_message(text_from_choice(choice), get_stop_reason(choice.finish_reason)) | ||||
| 
 | ||||
|     # NOTE: If we do not set tools in chat-completion request, we should not | ||||
|     # expect the ToolCall in the response. Instead, we should return the raw | ||||
|  | @ -217,7 +214,7 @@ def process_chat_completion_response( | |||
| 
 | ||||
| 
 | ||||
| async def process_completion_stream_response( | ||||
|     stream: AsyncGenerator[OpenAICompatCompletionResponse, None], formatter: ChatFormat | ||||
|     stream: AsyncGenerator[OpenAICompatCompletionResponse, None], | ||||
| ) -> AsyncGenerator: | ||||
|     stop_reason = None | ||||
| 
 | ||||
|  | @ -254,7 +251,6 @@ async def process_completion_stream_response( | |||
| 
 | ||||
| async def process_chat_completion_stream_response( | ||||
|     stream: AsyncGenerator[OpenAICompatCompletionResponse, None], | ||||
|     formatter: ChatFormat, | ||||
|     request: ChatCompletionRequest, | ||||
| ) -> AsyncGenerator: | ||||
|     yield ChatCompletionResponseStreamChunk( | ||||
|  | @ -333,7 +329,7 @@ async def process_chat_completion_stream_response( | |||
|             ) | ||||
| 
 | ||||
|     # parse tool calls and report errors | ||||
|     message = formatter.decode_assistant_message_from_content(buffer, stop_reason) | ||||
|     message = decode_assistant_message(buffer, stop_reason) | ||||
| 
 | ||||
|     parsed_tool_calls = len(message.tool_calls) > 0 | ||||
|     if ipython and not parsed_tool_calls: | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue