Mirror of https://github.com/meta-llama/llama-stack.git (synced 2025-08-12 04:50:39 +00:00)

Clarify variable name

Commit bb024daf21 (parent b3fbf1e18c)
1 changed file with 18 additions and 13 deletions
@@ -257,40 +257,45 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
         resolved_llama_model = resolve_model(model.provider_model_id)
         if resolved_llama_model is not None:
             # Load from Hugging Face repo into default local cache dir
-            resolved_model_id = resolved_llama_model.huggingface_repo
+            model_id_for_vllm = resolved_llama_model.huggingface_repo
 
             # Detect a genuine Meta Llama model to trigger Meta-specific preprocessing.
             # Don't set self.is_meta_llama_model until we actually load the model.
             is_meta_llama_model = True
         else:  # if resolved_llama_model is None
             # Not a Llama model name. Pass the model id through to vLLM's loader
-            resolved_model_id = model.provider_model_id
+            model_id_for_vllm = model.provider_model_id
             is_meta_llama_model = False
 
-        logger.info(f"Model id {model} resolved to {resolved_model_id}")
-
         if self.resolved_model_id is not None:
-            if resolved_model_id != self.resolved_model_id:
+            if model_id_for_vllm != self.resolved_model_id:
                 raise ValueError(
                     f"Attempted to serve two LLMs (ids '{self.resolved_model_id}') and "
-                    f"'{resolved_model_id}') from one copy of provider '{self}'. Use multiple "
+                    f"'{model_id_for_vllm}') from one copy of provider '{self}'. Use multiple "
                     f"copies of the provider instead."
                 )
             else:
                 # Model already loaded
+                logger.info(
+                    f"Requested id {model} resolves to {model_id_for_vllm}, "
+                    f"which is already loaded. Continuing."
+                )
                 self.model_ids.add(model.model_id)
                 return model
 
+        logger.info(
+            f"Requested id {model} resolves to {model_id_for_vllm}. Loading "
+            f"{model_id_for_vllm}."
+        )
         if is_meta_llama_model:
-            logger.info(f"Model {resolved_model_id} is a Meta Llama model.")
+            logger.info(f"Model {model_id_for_vllm} is a Meta Llama model.")
         self.is_meta_llama_model = is_meta_llama_model
-        logger.info(f"Preloading model: {resolved_model_id}")
 
         # If we get here, this is the first time registering a model.
         # Preload so that the first inference request won't time out.
         engine_args = AsyncEngineArgs(
-            model=resolved_model_id,
-            tokenizer=resolved_model_id,
+            model=model_id_for_vllm,
+            tokenizer=model_id_for_vllm,
             tensor_parallel_size=self.config.tensor_parallel_size,
             enforce_eager=self.config.enforce_eager,
             gpu_memory_utilization=self.config.gpu_memory_utilization,
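For readers skimming the hunk above, here is a minimal standalone sketch of the resolution step after the rename. The helper function and the stubbed resolve_model below are illustrative only (the real resolve_model comes from Llama Stack's model SKU registry and returns an object with a huggingface_repo attribute); they are not part of this commit.

from typing import NamedTuple, Optional


class _ResolvedLlamaModel(NamedTuple):
    """Stand-in for the object returned by Llama Stack's resolve_model()."""
    huggingface_repo: str


def _resolve_model_stub(provider_model_id: str) -> Optional[_ResolvedLlamaModel]:
    """Illustrative stub: pretend only one Llama SKU is known."""
    known = {"Llama3.2-3B-Instruct": "meta-llama/Llama-3.2-3B-Instruct"}
    repo = known.get(provider_model_id)
    return _ResolvedLlamaModel(repo) if repo else None


def pick_model_id_for_vllm(provider_model_id: str) -> tuple[str, bool]:
    """Mirror of the renamed logic: map a requested id to the id vLLM will
    actually load, and flag whether it is a genuine Meta Llama model."""
    resolved_llama_model = _resolve_model_stub(provider_model_id)
    if resolved_llama_model is not None:
        # Known Llama SKU: load from its Hugging Face repo.
        return resolved_llama_model.huggingface_repo, True
    # Unknown name: pass the id straight through to vLLM's loader.
    return provider_model_id, False


# pick_model_id_for_vllm("Llama3.2-3B-Instruct") -> ("meta-llama/Llama-3.2-3B-Instruct", True)
# pick_model_id_for_vllm("my-org/custom-model")  -> ("my-org/custom-model", False)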
@@ -324,7 +329,7 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
                 model_config=model_config,
                 base_model_paths=[
                     # The layer below us will only see resolved model IDs
-                    BaseModelPath(resolved_model_id, resolved_model_id)
+                    BaseModelPath(model_id_for_vllm, model_id_for_vllm)
                 ],
             ),
             response_role="assistant",
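A note on the BaseModelPath change above: both positional arguments receive the same resolved id, so the OpenAI-compatible serving layer only ever sees resolved model IDs, both as the advertised model name and as the path handed to the loader. The dataclass below is a simplified stand-in for vLLM's BaseModelPath, shown only to make the two fields explicit; it is not the real class, and field names may differ across vLLM versions.

from dataclasses import dataclass


@dataclass
class BaseModelPathSketch:
    """Simplified stand-in for vLLM's BaseModelPath: the name shown to
    clients and the path handed to the model loader."""
    name: str
    model_path: str


model_id_for_vllm = "meta-llama/Llama-3.2-3B-Instruct"  # illustrative value

# Same id in both slots, matching the change in the hunk above.
entry = BaseModelPathSketch(model_id_for_vllm, model_id_for_vllm)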
@@ -334,10 +339,10 @@ class VLLMInferenceImpl(Inference, ModelsProtocolPrivate):
             tool_parser=tool_parser,
             chat_template_content_format="auto",
         )
-        self.resolved_model_id = resolved_model_id
+        self.resolved_model_id = model_id_for_vllm
         self.model_ids.add(model.model_id)
 
-        logger.info(f"Finished preloading model: {resolved_model_id}")
+        logger.info(f"Finished preloading model: {model_id_for_vllm}")
 
         return model
 
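Taken together, the renamed variable is what ultimately feeds vLLM's engine configuration. Below is a minimal sketch of that preload step in isolation, assuming vLLM is installed and a GPU is available; the config values are illustrative, and constructing the engine via AsyncLLMEngine.from_engine_args is an assumption about the surrounding provider code rather than something shown in this diff.

# Sketch of preloading a model into vLLM with the renamed variable.
# Assumes vLLM is installed; values below are illustrative, not from the commit.
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine

model_id_for_vllm = "meta-llama/Llama-3.2-3B-Instruct"  # e.g. a resolved HF repo

engine_args = AsyncEngineArgs(
    model=model_id_for_vllm,
    tokenizer=model_id_for_vllm,
    tensor_parallel_size=1,
    enforce_eager=False,
    gpu_memory_utilization=0.9,
)

# Building the engine up front means the first inference request
# does not pay the model-load latency (and so will not time out).
engine = AsyncLLMEngine.from_engine_args(engine_args)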