forked from phoenix-oss/llama-stack-mirror
Use new definitions of Model / SKU
parent 156bfa0e15
commit 09cf3fe78b
8 changed files with 63 additions and 65 deletions
@@ -16,12 +16,13 @@ import httpx
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

-from llama_models.datatypes import CheckpointQuantizationFormat, ModelDefinition
+from llama_models.datatypes import Model
 from llama_models.sku_list import (
-    llama3_1_model_list,
-    llama_meta_folder_path,
-    llama_meta_pth_size,
+    all_registered_models,
+    llama_meta_net_info,
+    resolve_model,
 )
+from termcolor import cprint

 from llama_toolchain.cli.subcommand import Subcommand
 from llama_toolchain.utils import DEFAULT_DUMP_DIR
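For context on the import changes above: the commit replaces the Llama 3.1-specific helpers with the generic registry in `llama_models.sku_list`, where each registered `Model` exposes `descriptor()` and `huggingface_repo` instead of the old `sku.value` / `huggingface_id`. A minimal sketch of that surface as it is used throughout this diff (illustrative only; the return types are not shown in the commit itself):

# Illustrative sketch, based only on the calls visible in this commit:
# all_registered_models(), resolve_model(), Model.descriptor(), Model.huggingface_repo.
from llama_models.sku_list import all_registered_models, resolve_model

for model in all_registered_models():
    # descriptor() replaces the old model.sku.value as the user-facing identifier
    print(model.descriptor(), model.huggingface_repo)

# Resolve a single descriptor back to its Model entry; None means "unknown model".
model = resolve_model("Meta-Llama-3.1-8B-Instruct")
if model is None:
    raise SystemExit("unknown model descriptor")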
@@ -45,7 +46,7 @@ class Download(Subcommand):
         self.parser.set_defaults(func=self._run_download_cmd)

     def _add_arguments(self):
-        models = llama3_1_model_list()
+        models = all_registered_models()
         self.parser.add_argument(
             "--source",
             choices=["meta", "huggingface"],
@@ -53,7 +54,7 @@ class Download(Subcommand):
         )
         self.parser.add_argument(
             "--model-id",
-            choices=[x.sku.value for x in models],
+            choices=[x.descriptor() for x in models],
             required=True,
         )
         self.parser.add_argument(
@@ -80,12 +81,12 @@ safetensors files to avoid downloading duplicate weights.
 """,
         )

-    def _hf_download(self, model: ModelDefinition, hf_token: str, ignore_patterns: str):
-        repo_id = model.huggingface_id
+    def _hf_download(self, model: Model, hf_token: str, ignore_patterns: str):
+        repo_id = model.huggingface_repo
         if repo_id is None:
-            raise ValueError(f"No repo id found for model {model.sku.value}")
+            raise ValueError(f"No repo id found for model {model.descriptor()}")

-        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.sku.value
+        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
         os.makedirs(output_dir, exist_ok=True)
         try:
             true_output_dir = snapshot_download(
@@ -111,43 +112,37 @@ safetensors files to avoid downloading duplicate weights.

         print(f"Successfully downloaded model to {true_output_dir}")

-    def _meta_download(self, model: ModelDefinition, meta_url: str):
-        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.sku.value
+    def _meta_download(self, model: Model, meta_url: str):
+        output_dir = Path(DEFAULT_CHECKPOINT_DIR) / model.descriptor()
         os.makedirs(output_dir, exist_ok=True)

-        gpus = model.hardware_requirements.gpu_count
-        files = [
-            "tokenizer.model",
-            "params.json",
-        ]
-        if model.quantization_format == CheckpointQuantizationFormat.fp8_mixed:
-            files.extend([f"fp8_scales_{i}.pt" for i in range(gpus)])
-        files.extend([f"consolidated.{i:02d}.pth" for i in range(gpus)])
-
-        folder_path = llama_meta_folder_path(model)
-        pth_size = llama_meta_pth_size(model)
+        info = llama_meta_net_info(model)

         # I believe we can use some concurrency here if needed but not sure it is worth it
-        for f in files:
+        for f in info.files:
             output_file = str(output_dir / f)
-            url = meta_url.replace("*", f"{folder_path}/{f}")
-            total_size = pth_size if "consolidated" in f else 0
+            url = meta_url.replace("*", f"{info.folder}/{f}")
+            total_size = info.pth_size if "consolidated" in f else 0
+            cprint(f"Downloading `{f}`...", "white")
             downloader = ResumableDownloader(url, output_file, total_size)
             asyncio.run(downloader.download())

     def _run_download_cmd(self, args: argparse.Namespace):
-        by_id = {model.sku.value: model for model in llama3_1_model_list()}
-        assert args.model_id in by_id, f"Unexpected model id {args.model_id}"
+        model = resolve_model(args.model_id)
+        if model is None:
+            self.parser.error(f"Model {args.model_id} not found")
+            return

-        model = by_id[args.model_id]
         if args.source == "huggingface":
             self._hf_download(model, args.hf_token, args.ignore_patterns)
         else:
-            if not args.meta_url:
-                self.parser.error(
-                    "Please provide a meta url to download the model from llama.meta.com"
-                )
-            self._meta_download(model, args.meta_url)
+            meta_url = args.meta_url
+            if not meta_url:
+                meta_url = input(
+                    "Please provide the signed URL you received via email (e.g., https://llama3-1.llamameta.net/*?Policy...): "
+                )
+            assert meta_url is not None and "llama3-1.llamameta.net" in meta_url
+            self._meta_download(model, meta_url)


 class ResumableDownloader:
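One detail worth spelling out from `_meta_download` above: the signed URL from llama.meta.com contains a `*` wildcard, and the command substitutes `{info.folder}/{file}` into it for every file reported by `llama_meta_net_info(model)`. A standalone sketch of that substitution with hypothetical values (the real folder and file names come from the model's net info):

# Hypothetical example values; in the CLI these come from llama_meta_net_info(model)
# and from the signed URL the user pastes in.
meta_url = "https://llama3-1.llamameta.net/*?Policy=abc&Signature=xyz"
folder = "example-model-folder"   # stand-in for info.folder
filename = "consolidated.00.pth"  # one entry of info.files

url = meta_url.replace("*", f"{folder}/{filename}")
print(url)
# https://llama3-1.llamameta.net/example-model-folder/consolidated.00.pth?Policy=abc&Signature=xyz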
@@ -170,7 +165,10 @@ class ResumableDownloader:
         if self.total_size > 0:
             return

-        response = await client.head(self.url, follow_redirects=True)
+        # Force disable compression when trying to retrieve file size
+        response = await client.head(
+            self.url, follow_redirects=True, headers={"Accept-Encoding": "identity"}
+        )
         response.raise_for_status()
         self.url = str(response.url)  # Update URL in case of redirects
         self.total_size = int(response.headers.get("Content-Length", 0))
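The `Accept-Encoding: identity` header added above matters because a compressed HEAD response may report a `Content-Length` for the compressed body (or omit it entirely), which would give the resumable downloader a wrong total size. A self-contained sketch of the same probe using only the public `httpx` API:

import asyncio

import httpx


async def probe_size(url: str) -> int:
    async with httpx.AsyncClient() as client:
        # Request the uncompressed representation so Content-Length reflects the
        # real file size, and follow redirects to the final download location.
        response = await client.head(
            url, follow_redirects=True, headers={"Accept-Encoding": "identity"}
        )
        response.raise_for_status()
        return int(response.headers.get("Content-Length", 0))


if __name__ == "__main__":
    # Placeholder URL for illustration.
    print(asyncio.run(probe_size("https://example.com/file.bin")))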
@@ -9,7 +9,7 @@ import json

 from enum import Enum

-from llama_models.sku_list import llama3_1_model_list
+from llama_models.sku_list import resolve_model

 from termcolor import colored

@@ -47,20 +47,13 @@ class ModelDescribe(Subcommand):
         )

     def _run_model_describe_cmd(self, args: argparse.Namespace) -> None:
-        models = llama3_1_model_list()
-        by_id = {model.sku.value: model for model in models}
-
-        if args.model_id not in by_id:
-            print(
+        model = resolve_model(args.model_id)
+        if model is None:
+            self.parser.error(
                 f"Model {args.model_id} not found; try 'llama model list' for a list of available models."
             )
             return

-        model = by_id[args.model_id]
-
-        sampling_params = model.recommended_sampling_params.dict()
-        for k in ("max_tokens", "repetition_penalty"):
-            del sampling_params[k]
         rows = [
             (
                 colored("Model", "white", attrs=["bold"]),
@@ -70,12 +63,19 @@ class ModelDescribe(Subcommand):
             ("Description", model.description_markdown),
             ("Context Length", f"{model.max_seq_length // 1024}K tokens"),
             ("Weights format", model.quantization_format.value),
-            (
-                "Recommended sampling params",
-                json.dumps(sampling_params, cls=EnumEncoder, indent=4),
-            ),
-            ("Model params.json", json.dumps(model.model_args, indent=4)),
-        ]
+            ("Model params.json", json.dumps(model.model_args, indent=4)),
+        ]
+
+        if model.recommended_sampling_params is not None:
+            sampling_params = model.recommended_sampling_params.dict()
+            for k in ("max_tokens", "repetition_penalty"):
+                del sampling_params[k]
+            rows.append(
+                (
+                    "Recommended sampling params",
+                    json.dumps(sampling_params, cls=EnumEncoder, indent=4),
+                )
+            )

         print_table(
             rows,
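The `EnumEncoder` passed to `json.dumps` above is needed because the recommended sampling params contain enum-valued fields that the default JSON encoder rejects. The actual `EnumEncoder` is defined elsewhere in the toolchain; a typical implementation of that pattern looks like the sketch below (the `SamplingStrategy` enum and the field names are illustrative placeholders, not the real types):

import json
from enum import Enum


class SamplingStrategy(Enum):
    # Placeholder standing in for whatever enum fields SamplingParams actually uses.
    greedy = "greedy"
    top_p = "top_p"


class EnumEncoder(json.JSONEncoder):
    # Serialize Enum members by their value; defer everything else to the default.
    def default(self, obj):
        if isinstance(obj, Enum):
            return obj.value
        return super().default(obj)


sampling_params = {"strategy": SamplingStrategy.top_p, "temperature": 1.0, "top_p": 0.9}
print(json.dumps(sampling_params, cls=EnumEncoder, indent=4))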
@@ -6,7 +6,7 @@

 import argparse

-from llama_models.sku_list import llama3_1_model_list
+from llama_models.sku_list import all_registered_models

 from llama_toolchain.cli.subcommand import Subcommand
 from llama_toolchain.cli.table import print_table
@@ -30,21 +30,22 @@ class ModelList(Subcommand):
         pass

     def _run_model_list_cmd(self, args: argparse.Namespace) -> None:
-        models = llama3_1_model_list()
         headers = [
-            "Model ID",
-            "HuggingFace ID",
+            "Model Descriptor",
+            "HuggingFace Repo",
             "Context Length",
             "Hardware Requirements",
         ]

         rows = []
-        for model in models:
+        for model in all_registered_models():
             req = model.hardware_requirements
+
+            descriptor = model.descriptor()
             rows.append(
                 [
-                    model.sku.value,
-                    model.huggingface_id,
+                    descriptor,
+                    model.huggingface_repo,
                     f"{model.max_seq_length // 1024}K",
                     f"{req.gpu_count} GPU{'s' if req.gpu_count > 1 else ''}, each >= {req.memory_gb_per_gpu}GB VRAM",
                 ]
@@ -13,7 +13,7 @@ from pyopenapi import webmethod

 @json_schema_type
 class CompletionRequest(BaseModel):
-    model: PretrainedModel
+    model: str
     content: InterleavedTextAttachment
     sampling_params: Optional[SamplingParams] = SamplingParams()

@@ -39,7 +39,7 @@ class CompletionResponseStreamChunk(BaseModel):

 @json_schema_type
 class BatchCompletionRequest(BaseModel):
-    model: PretrainedModel
+    model: str
     content_batch: List[InterleavedTextAttachment]
     sampling_params: Optional[SamplingParams] = SamplingParams()
     logprobs: Optional[LogProbConfig] = None
@@ -53,7 +53,7 @@ class BatchCompletionResponse(BaseModel):

 @json_schema_type
 class ChatCompletionRequest(BaseModel):
-    model: InstructModel
+    model: str
     messages: List[Message]
     sampling_params: Optional[SamplingParams] = SamplingParams()

@@ -80,7 +80,7 @@ class ChatCompletionResponse(BaseModel):

 @json_schema_type
 class BatchChatCompletionRequest(BaseModel):
-    model: InstructModel
+    model: str
     messages_batch: List[List[Message]]
     sampling_params: Optional[SamplingParams] = SamplingParams()

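Across the API schemas above, the typed enums (`PretrainedModel`, `InstructModel`, `RewardModel`) are replaced by a plain `model: str`, so requests carry a model descriptor string instead of an enum member. A stripped-down sketch of the pattern; only the `model` field matches the real schema, the other fields are simplified placeholders:

from typing import List, Optional

from pydantic import BaseModel


class ChatCompletionRequestSketch(BaseModel):
    # The model is now identified by its descriptor string rather than an enum member.
    model: str
    messages: List[str]                  # placeholder for List[Message]
    temperature: Optional[float] = None  # placeholder for SamplingParams


request = ChatCompletionRequestSketch(
    model="Meta-Llama-3.1-8B-Instruct",
    messages=["Hello!"],
)
print(request)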
@@ -18,7 +18,6 @@ from .api import (
     ChatCompletionResponseStreamChunk,
     CompletionRequest,
     Inference,
-    InstructModel,
     UserMessage,
 )
 from .event_logger import EventLogger
@@ -67,7 +66,7 @@ async def run_main(host: str, port: int, stream: bool):
     cprint(f"User>{message.content}", "green")
     iterator = client.chat_completion(
         ChatCompletionRequest(
-            model=InstructModel.llama3_8b_chat,
+            model="Meta-Llama-3.1-8B-Instruct",
             messages=[message],
             stream=stream,
         )
@@ -25,7 +25,7 @@ class PostTrainingSFTRequest(BaseModel):

     job_uuid: str

-    model: PretrainedModel
+    model: str
     dataset: TrainEvalDataset
     validation_dataset: TrainEvalDataset

@@ -15,7 +15,7 @@ class RewardScoringRequest(BaseModel):
     """Request to score a reward function. A list of prompts and a list of responses per prompt."""

     dialog_generations: List[DialogGenerations]
-    model: RewardModel
+    model: str


 @json_schema_type
@@ -22,7 +22,7 @@ class SyntheticDataGenerationRequest(BaseModel):

     dialogs: List[Message]
     filtering_function: FilteringFunction = FilteringFunction.none
-    model: Optional[RewardModel] = None
+    model: Optional[str] = None


 @json_schema_type