Merge branch 'main' into eval_api_final

Xi Yan 2025-03-26 12:29:45 -07:00
commit bc0cd07008
79 changed files with 3257 additions and 2358 deletions

View file

@@ -818,14 +818,7 @@
"delete": {
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/FileResponse"
}
}
}
"description": "OK"
},
"400": {
"$ref": "#/components/responses/BadRequest400"
@@ -4002,22 +3995,33 @@
"type": "object",
"properties": {
"strategy": {
"$ref": "#/components/schemas/SamplingStrategy"
"$ref": "#/components/schemas/SamplingStrategy",
"description": "The sampling strategy."
},
"max_tokens": {
"type": "integer",
"default": 0
"default": 0,
"description": "The maximum number of tokens that can be generated in the completion. The token count of your prompt plus max_tokens cannot exceed the model's context length."
},
"repetition_penalty": {
"type": "number",
"default": 1.0
"default": 1.0,
"description": "Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics."
},
"stop": {
"type": "array",
"items": {
"type": "string"
},
"description": "Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence."
}
},
"additionalProperties": false,
"required": [
"strategy"
],
"title": "SamplingParams"
"title": "SamplingParams",
"description": "Sampling parameters."
},
"SamplingStrategy": {
"oneOf": [
@@ -6078,46 +6082,6 @@
"title": "FileUploadResponse",
"description": "Response after initiating a file upload session."
},
"FileResponse": {
"type": "object",
"properties": {
"bucket": {
"type": "string",
"description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
},
"key": {
"type": "string",
"description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
},
"mime_type": {
"type": "string",
"description": "MIME type of the file"
},
"url": {
"type": "string",
"description": "Upload URL for the file contents"
},
"bytes": {
"type": "integer",
"description": "Size of the file in bytes"
},
"created_at": {
"type": "integer",
"description": "Timestamp of when the file was created"
}
},
"additionalProperties": false,
"required": [
"bucket",
"key",
"mime_type",
"url",
"bytes",
"created_at"
],
"title": "FileResponse",
"description": "Response representing a file entry."
},
"EmbeddingsRequest": {
"type": "object",
"properties": {
@@ -6498,52 +6462,47 @@
"title": "URIDataSource",
"description": "A dataset that can be obtained from a URI."
},
"EqualityGrader": {
"FileResponse": {
"type": "object",
"properties": {
"type": {
"bucket": {
"type": "string",
"const": "equality",
"default": "equality"
"description": "Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)"
},
"key": {
"type": "string",
"description": "Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)"
},
"mime_type": {
"type": "string",
"description": "MIME type of the file"
},
"url": {
"type": "string",
"description": "Upload URL for the file contents"
},
"bytes": {
"type": "integer",
"description": "Size of the file in bytes"
},
"created_at": {
"type": "integer",
"description": "Timestamp of when the file was created"
}
},
"additionalProperties": false,
"required": [
"type"
"bucket",
"key",
"mime_type",
"url",
"bytes",
"created_at"
],
"title": "EqualityGrader"
"title": "FileResponse",
"description": "Response representing a file entry."
},
"FactualityGrader": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "factuality",
"default": "factuality"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "FactualityGrader"
},
"FaithfulnessGrader": {
"type": "object",
"properties": {
"type": {
"type": "string",
"const": "faithfulness",
"default": "faithfulness"
}
},
"additionalProperties": false,
"required": [
"type"
],
"title": "FaithfulnessGrader"
},
"Grader": {
"Model": {
"type": "object",
"properties": {
"identifier": {
@@ -7895,6 +7854,31 @@
"title": "IterrowsResponse",
"description": "A paginated list of rows from a dataset."
},
"Job": {
"type": "object",
"properties": {
"job_id": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"completed",
"in_progress",
"failed",
"scheduled",
"cancelled"
],
"title": "JobStatus"
}
},
"additionalProperties": false,
"required": [
"job_id",
"status"
],
"title": "Job"
},
"ListAgentSessionsResponse": {
"type": "object",
"properties": {

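The `SamplingParams` hunks above add descriptions for `strategy`, `max_tokens`, `repetition_penalty`, and `stop`. A minimal sketch of a params object shaped like that schema, for orientation only (the greedy strategy literal follows the referenced `GreedySamplingStrategy`; how the object is passed to an inference call is not part of this diff):

```python
# Illustrative only: field names and defaults mirror the SamplingParams schema above.
sampling_params = {
    "strategy": {"type": "greedy"},  # SamplingStrategy $ref (greedy variant)
    "max_tokens": 512,               # prompt tokens + max_tokens must fit the model's context length
    "repetition_penalty": 1.0,       # schema default
    "stop": ["Observation:"],        # up to 4 stop sequences; not included in the returned text
}
```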
View file

@@ -557,10 +557,6 @@ paths:
responses:
'200':
description: OK
content:
application/json:
schema:
$ref: '#/components/schemas/FileResponse'
'400':
$ref: '#/components/responses/BadRequest400'
'429':
@@ -2764,16 +2760,33 @@ components:
properties:
strategy:
$ref: '#/components/schemas/SamplingStrategy'
description: The sampling strategy.
max_tokens:
type: integer
default: 0
description: >-
The maximum number of tokens that can be generated in the completion.
The token count of your prompt plus max_tokens cannot exceed the model's
context length.
repetition_penalty:
type: number
default: 1.0
description: >-
Number between -2.0 and 2.0. Positive values penalize new tokens based
on whether they appear in the text so far, increasing the model's likelihood
to talk about new topics.
stop:
type: array
items:
type: string
description: >-
Up to 4 sequences where the API will stop generating further tokens. The
returned text will not contain the stop sequence.
additionalProperties: false
required:
- strategy
title: SamplingParams
description: Sampling parameters.
SamplingStrategy:
oneOf:
- $ref: '#/components/schemas/GreedySamplingStrategy'
@@ -4246,39 +4259,6 @@ components:
title: FileUploadResponse
description: >-
Response after initiating a file upload session.
FileResponse:
type: object
properties:
bucket:
type: string
description: >-
Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
key:
type: string
description: >-
Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
mime_type:
type: string
description: MIME type of the file
url:
type: string
description: Upload URL for the file contents
bytes:
type: integer
description: Size of the file in bytes
created_at:
type: integer
description: Timestamp of when the file was created
additionalProperties: false
required:
- bucket
- key
- mime_type
- url
- bytes
- created_at
title: FileResponse
description: Response representing a file entry.
EmbeddingsRequest:
type: object
properties:
@@ -4550,40 +4530,40 @@ components:
title: URIDataSource
description: >-
A dataset that can be obtained from a URI.
EqualityGrader:
FileResponse:
type: object
properties:
type:
bucket:
type: string
const: equality
default: equality
description: >-
Bucket under which the file is stored (valid chars: a-zA-Z0-9_-)
key:
type: string
description: >-
Key under which the file is stored (valid chars: a-zA-Z0-9_-/.)
mime_type:
type: string
description: MIME type of the file
url:
type: string
description: Upload URL for the file contents
bytes:
type: integer
description: Size of the file in bytes
created_at:
type: integer
description: Timestamp of when the file was created
additionalProperties: false
required:
- type
title: EqualityGrader
FactualityGrader:
type: object
properties:
type:
type: string
const: factuality
default: factuality
additionalProperties: false
required:
- type
title: FactualityGrader
FaithfulnessGrader:
type: object
properties:
type:
type: string
const: faithfulness
default: faithfulness
additionalProperties: false
required:
- type
title: FaithfulnessGrader
Grader:
- bucket
- key
- mime_type
- url
- bytes
- created_at
title: FileResponse
description: Response representing a file entry.
Model:
type: object
properties:
identifier:
@@ -5453,6 +5433,25 @@ components:
- data
title: IterrowsResponse
description: A paginated list of rows from a dataset.
Job:
type: object
properties:
job_id:
type: string
status:
type: string
enum:
- completed
- in_progress
- failed
- scheduled
- cancelled
title: JobStatus
additionalProperties: false
required:
- job_id
- status
title: Job
ListAgentSessionsResponse:
type: object
properties:

File diff suppressed because one or more lines are too long

View file

@@ -963,16 +963,19 @@
"\n",
"client.benchmarks.register(\n",
" benchmark_id=\"meta-reference::mmmu\",\n",
" # Note: we can use any value as `dataset_id` because we'll be using the `evaluate_rows` API which accepts the \n",
" # `input_rows` argument and does not fetch data from the dataset.\n",
" dataset_id=f\"mmmu-{subset}-{split}\",\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
" # Note: for the same reason as above, we can use any value as `scoring_functions`.\n",
" scoring_functions=[],\n",
")\n",
"\n",
"response = client.eval.evaluate_rows_alpha(\n",
"response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::mmmu\",\n",
" input_rows=eval_rows,\n",
" # Note: Here we define the actual scoring functions.\n",
" scoring_functions=[\"basic::regex_parser_multiple_choice_answer\"],\n",
" benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1139,12 +1142,11 @@
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
")\n",
"\n",
"response = client.eval.evaluate_rows_alpha(\n",
"response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.data,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n",
" \"type\": \"model\",\n",
" \"model\": \"meta-llama/Llama-3.2-90B-Vision-Instruct\",\n",
@@ -1288,12 +1290,11 @@
" \"enable_session_persistence\": False,\n",
"}\n",
"\n",
"response = client.eval.evaluate_rows_alpha(\n",
"response = client.eval.evaluate_rows(\n",
" benchmark_id=\"meta-reference::simpleqa\",\n",
" input_rows=eval_rows.data,\n",
" scoring_functions=[\"llm-as-judge::405b-simpleqa\"],\n",
" benchmark_config={\n",
" \"type\": \"benchmark\",\n",
" \"eval_candidate\": {\n",
" \"type\": \"agent\",\n",
" \"config\": agent_config,\n",

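Across the notebook hunks above, the eval calls drop the `_alpha` suffix, remove the `"type": "benchmark"` key from `benchmark_config`, and register the benchmark with an empty `scoring_functions` list, supplying the real scoring functions at evaluation time instead. A condensed sketch of the resulting call pattern, assuming a configured `client`, the benchmark registered as shown, and `eval_rows` prepared earlier in the notebook:

```python
# Condensed from the notebook diff above; the remaining benchmark_config
# fields are as defined in the notebook and are omitted here.
response = client.eval.evaluate_rows(
    benchmark_id="meta-reference::mmmu",
    input_rows=eval_rows,
    scoring_functions=["basic::regex_parser_multiple_choice_answer"],
    benchmark_config={
        "eval_candidate": {
            "type": "model",
            "model": "meta-llama/Llama-3.2-90B-Vision-Instruct",
            # ... sampling params and other candidate fields as in the notebook
        },
    },
)
```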
View file

@@ -22,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -1395,6 +1395,349 @@
"pprint(session_response.turns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### 3.1 Improved RAG with Long Context\n",
"\n",
"- Instead of performing reteival tool, we send documents as attachments to the agent and let it use the entire document context. \n",
"- Note how that the model is able to understand the entire context from documentation and answers the question with better factuality with improved retrieval. "
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> What precision formats does torchtune support?\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;36mQuestion:\u001b[0m What precision formats does torchtune support?\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> Torchtune supports two precision formats: `fp32` <span style=\"font-weight: bold\">(</span>full-precision<span style=\"font-weight: bold\">)</span> and `bfloat16` <span style=\"font-weight: bold\">(</span>half-precision<span style=\"font-weight: bold\">)</span>. \n",
"The `bfloat16` format uses <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">2</span> bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
"training speed.\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mAgent Answer:\u001b[0m Torchtune supports two precision formats: `fp32` \u001b[1m(\u001b[0mfull-precision\u001b[1m)\u001b[0m and `bfloat16` \u001b[1m(\u001b[0mhalf-precision\u001b[1m)\u001b[0m. \n",
"The `bfloat16` format uses \u001b[1;36m2\u001b[0m bytes per model parameter, which is half the memory of `fp32`, and also improves \n",
"training speed.\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> What does DoRA stand for in torchtune?\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;36mQuestion:\u001b[0m What does DoRA stand for in torchtune?\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA <span style=\"font-weight: bold\">(</span>Low-Rank Adaptation<span style=\"font-weight: bold\">)</span> \n",
"that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
"component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
"decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
"addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
"ranks.\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mAgent Answer:\u001b[0m DoRA stands for Weight-Decomposed Low-Rank Adaptation. It is a variant of LoRA \u001b[1m(\u001b[0mLow-Rank Adaptation\u001b[1m)\u001b[0m \n",
"that further decomposes the pre-trained weights into two components: magnitude and direction. The magnitude \n",
"component is a scalar vector that adjusts the scale, while the direction component corresponds to the original LoRA\n",
"decomposition and updates the orientation of weights. DoRA adds a small overhead to LoRA training due to the \n",
"addition of the magnitude parameter, but it has been shown to improve the performance of LoRA, particularly at low \n",
"ranks.\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> How does the CPUOffloadOptimizer reduce GPU memory usage?\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;36mQuestion:\u001b[0m How does the CPUOffloadOptimizer reduce GPU memory usage?\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
"CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
"RAM and training speed.\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mAgent Answer:\u001b[0m The CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the \n",
"CPU, and performing optimizer steps on the CPU. This can significantly reduce GPU memory usage at the cost of CPU \n",
"RAM and training speed.\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">Question:</span> How do I ensure only LoRA parameters are trainable when fine-tuning?\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;36mQuestion:\u001b[0m How do I ensure only LoRA parameters are trainable when fine-tuning?\n"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #808000; text-decoration-color: #808000; font-weight: bold\">Agent Answer:</span> To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n",
"function from `torchtune.modules.peft.peft_utils` to set the `requires_grad` attribute of the LoRA parameters to \n",
"`<span style=\"color: #00ff00; text-decoration-color: #00ff00; font-style: italic\">True</span>` and the `requires_grad` attribute of the other parameters to `<span style=\"color: #ff0000; text-decoration-color: #ff0000; font-style: italic\">False</span>`.\n",
"\n",
"Here is an example:\n",
"```python\n",
"from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
"\n",
"# Get the LoRA parameters\n",
"lora_params = <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">get_adapter_params</span><span style=\"font-weight: bold\">(</span>model<span style=\"font-weight: bold\">)</span>\n",
"\n",
"# Set the LoRA parameters to trainable and the other parameters to non-trainable\n",
"<span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">set_trainable_params</span><span style=\"font-weight: bold\">(</span>model, lora_params<span style=\"font-weight: bold\">)</span>\n",
"```\n",
"This will ensure that only the LoRA parameters are updated during fine-tuning, while the other parameters remain \n",
"frozen.\n",
"\n",
"Alternatively, you can also use the `lora_finetune` recipe in torchtune, which automatically sets the LoRA \n",
"parameters to trainable and the other parameters to non-trainable. You can run the recipe using the following \n",
"command:\n",
"```bash\n",
"tune run lora_finetune --config llama2/7B_lora\n",
"```\n",
"This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
"file to change the hyperparameters or the model architecture.\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;33mAgent Answer:\u001b[0m To ensure only LoRA parameters are trainable when fine-tuning, you can use the `set_trainable_params`\n",
"function from `torchtune.modules.peft.peft_utils` to set the `requires_grad` attribute of the LoRA parameters to \n",
"`\u001b[3;92mTrue\u001b[0m` and the `requires_grad` attribute of the other parameters to `\u001b[3;91mFalse\u001b[0m`.\n",
"\n",
"Here is an example:\n",
"```python\n",
"from torchtune.modules.peft.peft_utils import get_adapter_params, set_trainable_params\n",
"\n",
"# Get the LoRA parameters\n",
"lora_params = \u001b[1;35mget_adapter_params\u001b[0m\u001b[1m(\u001b[0mmodel\u001b[1m)\u001b[0m\n",
"\n",
"# Set the LoRA parameters to trainable and the other parameters to non-trainable\n",
"\u001b[1;35mset_trainable_params\u001b[0m\u001b[1m(\u001b[0mmodel, lora_params\u001b[1m)\u001b[0m\n",
"```\n",
"This will ensure that only the LoRA parameters are updated during fine-tuning, while the other parameters remain \n",
"frozen.\n",
"\n",
"Alternatively, you can also use the `lora_finetune` recipe in torchtune, which automatically sets the LoRA \n",
"parameters to trainable and the other parameters to non-trainable. You can run the recipe using the following \n",
"command:\n",
"```bash\n",
"tune run lora_finetune --config llama2/7B_lora\n",
"```\n",
"This will fine-tune the LoRA parameters of the Llama2 model using the default settings. You can modify the config \n",
"file to change the hyperparameters or the model architecture.\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"urls = [\n",
" \"memory_optimizations.rst\",\n",
" \"chat.rst\",\n",
" \"llama3.rst\",\n",
" \"datasets.rst\",\n",
" \"qat_finetune.rst\",\n",
" \"lora_finetune.rst\",\n",
"]\n",
"\n",
"attachments = [\n",
" {\n",
" \"content\": {\n",
" \"uri\": f\"https://raw.githubusercontent.com/pytorch/torchtune/main/docs/source/tutorials/{url}\",\n",
" },\n",
" \"mime_type\": \"text/plain\",\n",
" }\n",
"\n",
" for i, url in enumerate(urls)\n",
"]\n",
"\n",
"rag_attachment_agent = Agent(\n",
" client,\n",
" model=MODEL_ID,\n",
" instructions=\"You are a helpful assistant that can answer questions about the Torchtune project. Use context from attached documentation for Torchtune to answer questions.\",\n",
")\n",
"\n",
"for example in examples:\n",
" session_id = rag_attachment_agent.create_session(session_name=f\"rag_attachment_session_{uuid.uuid4()}\")\n",
" response = rag_attachment_agent.create_turn(\n",
" messages=[\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": example[\"input_query\"]\n",
" }\n",
" ],\n",
" session_id=session_id,\n",
" documents=attachments,\n",
" stream=False\n",
" )\n",
" rich.print(f\"[bold cyan]Question:[/bold cyan] {example['input_query']}\")\n",
" rich.print(f\"[bold yellow]Agent Answer:[/bold yellow] {response.output_message.content}\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ScoringScoreResponse</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"color: #808000; text-decoration-color: #808000\">results</span>=<span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'braintrust::factuality'</span>: <span style=\"color: #800080; text-decoration-color: #800080; font-weight: bold\">ScoringResult</span><span style=\"font-weight: bold\">(</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">aggregated_results</span>=<span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'average'</span>: <span style=\"font-weight: bold\">{</span><span style=\"color: #008000; text-decoration-color: #008000\">'average'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span><span style=\"font-weight: bold\">}}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"color: #808000; text-decoration-color: #808000\">score_rows</span>=<span style=\"font-weight: bold\">[</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` (full-precision) and `bfloat16` (half-precision).\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter offload_gradients=True.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.'</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'score'</span>: <span style=\"color: #008080; text-decoration-color: #008080; font-weight: bold\">0.6</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'metadata'</span>: <span style=\"font-weight: bold\">{</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'choice'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">'B'</span>,\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ │ </span><span style=\"color: #008000; text-decoration-color: #008000\">'rationale'</span>: <span style=\"color: #008000; text-decoration-color: #008000\">\"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params(lora_model)` and setting them as trainable with `set_trainable_params(lora_model, lora_params)`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a similar method using `set_trainable_params` to set the `requires_grad` attribute of LoRA parameters to `True` and other parameters to `False`. It includes a code example demonstrating this process. Additionally, it mentions using the `lora_finetune` recipe in torchtune, which automatically sets the LoRA parameters to trainable.\\n\\n3. **Comparison**: The submitted answer includes all the details from the expert answer regarding the use of `get_adapter_params` and `set_trainable_params`. It also provides additional information about setting the `requires_grad` attribute and using the `lora_finetune` recipe, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and additional details. There is no conflict between the two answers, and the additional information in the submission is consistent with the expert's explanation.\\n\\nTherefore, the correct choice is (B) The submitted answer is a superset of the expert answer and is fully consistent with it.\"</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ │ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ │ </span><span style=\"font-weight: bold\">]</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ │ </span><span style=\"font-weight: bold\">)</span>\n",
"<span style=\"color: #7fbf7f; text-decoration-color: #7fbf7f\">│ </span><span style=\"font-weight: bold\">}</span>\n",
"<span style=\"font-weight: bold\">)</span>\n",
"</pre>\n"
],
"text/plain": [
"\u001b[1;35mScoringScoreResponse\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[33mresults\u001b[0m=\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[32m'braintrust::factuality'\u001b[0m: \u001b[1;35mScoringResult\u001b[0m\u001b[1m(\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33maggregated_results\u001b[0m=\u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1m{\u001b[0m\u001b[32m'average'\u001b[0m: \u001b[1;36m0.6\u001b[0m\u001b[1m}\u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[33mscore_rows\u001b[0m=\u001b[1m[\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. Both the expert and the submitted answers mention that Torchtune supports two precision formats: `fp32` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mfull-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m and `bfloat16` \u001b[0m\u001b[32m(\u001b[0m\u001b[32mhalf-precision\u001b[0m\u001b[32m)\u001b[0m\u001b[32m.\\n2. The expert answer specifies that `fp32` uses 4 bytes per model and optimizer parameter, while `bfloat16` uses 2 bytes per model and optimizer parameter.\\n3. The submitted answer also mentions that `bfloat16` uses 2 bytes per model parameter, which is consistent with the expert answer.\\n4. The submitted answer adds that `bfloat16` improves training speed, which is additional information not present in the expert answer.\\n5. There is no conflict between the submitted answer and the expert answer; the submitted answer simply provides more information.\\n\\nBased on this analysis, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer provides the definition of DoRA as \"Weight-Decomposed Low-Rank Adaptation.\"\\n2. The submitted answer also states that DoRA stands for \"Weight-Decomposed Low-Rank Adaptation,\" which matches the expert answer.\\n3. The submitted answer includes additional information about DoRA, explaining that it is a variant of LoRA and describing how it decomposes pre-trained weights into magnitude and direction components.\\n4. The submitted answer further explains the role of the magnitude component and the direction component, and mentions the performance improvement and overhead associated with DoRA.\\n5. The additional details in the submitted answer do not contradict the expert answer; instead, they expand upon it.\\n6. Therefore, the submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m'1. The expert answer states that the CPUOffloadOptimizer reduces GPU memory usage by keeping optimizer states on CPU and performing optimizer steps on CPU. It also mentions the optional offloading of gradients to CPU with the parameter \u001b[0m\u001b[32moffload_gradients\u001b[0m\u001b[32m=\u001b[0m\u001b[32mTrue\u001b[0m\u001b[32m.\\n\\n2. The submitted answer states that the CPUOffloadOptimizer reduces GPU memory usage by offloading optimizer states and gradients to the CPU, and performing optimizer steps on the CPU. It adds that this can significantly reduce GPU memory usage at the cost of CPU RAM and training speed.\\n\\n3. Comparing both answers:\\n - Both answers agree on offloading optimizer states to the CPU and performing optimizer steps on the CPU.\\n - Both mention the offloading of gradients to the CPU, but the expert answer specifies it as optional with a parameter, while the submission does not specify this detail.\\n - The submission adds additional information about the trade-off involving CPU RAM and training speed, which is not mentioned in the expert answer.\\n\\n4. The submitted answer includes all the details from the expert answer and adds more information about the trade-offs, making it a superset of the expert answer.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.'\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'score'\u001b[0m: \u001b[1;36m0.6\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[32m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'choice'\u001b[0m: \u001b[32m'B'\u001b[0m,\n",
"\u001b[2;32m│ │ │ │ │ │ \u001b[0m\u001b[32m'rationale'\u001b[0m: \u001b[32m\"1. **Expert Answer Analysis**: The expert answer provides a method to ensure only LoRA parameters are trainable by using torchtune's utility functions. It mentions fetching LoRA parameters with `get_adapter_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model\u001b[0m\u001b[32m)\u001b[0m\u001b[32m` and setting them as trainable with `set_trainable_params\u001b[0m\u001b[32m(\u001b[0m\u001b[32mlora_model, lora_params\u001b[0m\u001b[32m)\u001b[0m\u001b[32m`. It also notes that the LoRA recipe handles this automatically.\\n\\n2. **Submitted Answer Analysis**: The submitted answer provides a similar method using `set_trainable_params` to set the `requires_grad` attribute of LoRA parameters to `True` and other parameters to `False`. It includes a code example demonstrating this process. Additionally, it mentions using the `lora_finetune` recipe in torchtune, which automatically sets the LoRA parameters to trainable.\\n\\n3. **Comparison**: The submitted answer includes all the details from the expert answer regarding the use of `get_adapter_params` and `set_trainable_params`. It also provides additional information about setting the `requires_grad` attribute and using the `lora_finetune` recipe, which is not mentioned in the expert answer.\\n\\n4. **Conclusion**: The submitted answer is a superset of the expert answer as it contains all the information from the expert answer and additional details. There is no conflict between the two answers, and the additional information in the submission is consistent with the expert's explanation.\\n\\nTherefore, the correct choice is \u001b[0m\u001b[32m(\u001b[0m\u001b[32mB\u001b[0m\u001b[32m)\u001b[0m\u001b[32m The submitted answer is a superset of the expert answer and is fully consistent with it.\"\u001b[0m\n",
"\u001b[2;32m│ │ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ │ │ │ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[2;32m│ │ │ \u001b[0m\u001b[1m]\u001b[0m\n",
"\u001b[2;32m│ │ \u001b[0m\u001b[1m)\u001b[0m\n",
"\u001b[2;32m│ \u001b[0m\u001b[1m}\u001b[0m\n",
"\u001b[1m)\u001b[0m\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"eval_rows = []\n",
"for i, session_id in enumerate(rag_attachment_agent.sessions):\n",
" session_response = client.agents.session.retrieve(agent_id=rag_attachment_agent.agent_id, session_id=session_id)\n",
" for turn in session_response.turns:\n",
" eval_rows.append({\n",
" \"input_query\": examples[i][\"input_query\"],\n",
" \"expected_answer\": examples[i][\"expected_answer\"],\n",
" \"generated_answer\": turn.output_message.content,\n",
" })\n",
"\n",
"scoring_params = {\n",
" \"braintrust::factuality\": None,\n",
"}\n",
"scoring_response = client.scoring.score(\n",
" input_rows=eval_rows,\n",
" scoring_functions=scoring_params,\n",
")\n",
"pprint(scoring_response)"
]
},
{
"cell_type": "markdown",
"metadata": {},

View file

@@ -21,7 +21,7 @@ from llama_stack.distribution.stack import LlamaStack # noqa: E402
from .pyopenapi.options import Options # noqa: E402
from .pyopenapi.specification import Info, Server # noqa: E402
from .pyopenapi.utility import Specification, validate_api_method_return_types # noqa: E402
from .pyopenapi.utility import Specification, validate_api # noqa: E402
def str_presenter(dumper, data):
@@ -40,8 +40,7 @@ def main(output_dir: str):
raise ValueError(f"Directory {output_dir} does not exist")
# Validate API protocols before generating spec
print("Validating API method return types...")
return_type_errors = validate_api_method_return_types()
return_type_errors = validate_api()
if return_type_errors:
print("\nAPI Method Return Type Validation Errors:\n")
for error in return_type_errors:

View file

@@ -7,10 +7,9 @@
import json
import typing
import inspect
import os
from pathlib import Path
from typing import TextIO
from typing import Any, Dict, List, Optional, Protocol, Type, Union, get_type_hints, get_origin, get_args
from typing import Any, List, Optional, Union, get_type_hints, get_origin, get_args
from llama_stack.strong_typing.schema import object_to_json, StrictJsonType
from llama_stack.distribution.resolver import api_protocol_map
@@ -125,29 +124,59 @@ def is_optional_type(type_: Any) -> bool:
return origin is Optional or (origin is Union and type(None) in args)
def validate_api_method_return_types() -> List[str]:
"""Validate that all API methods have proper return types."""
def _validate_api_method_return_type(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if is_optional_type(return_type):
return "returns Optional type"
def _validate_api_delete_method_returns_none(method) -> str | None:
hints = get_type_hints(method)
if 'return' not in hints:
return "has no return type annotation"
return_type = hints['return']
if return_type is not None and return_type is not type(None):
return "does not return None"
_VALIDATORS = {
"GET": [
_validate_api_method_return_type,
],
"DELETE": [
_validate_api_delete_method_returns_none,
],
}
def _get_methods_by_type(protocol, method_type: str):
members = inspect.getmembers(protocol, predicate=inspect.isfunction)
return {
method_name: method
for method_name, method in members
if (webmethod := getattr(method, '__webmethod__', None))
if webmethod and webmethod.method == method_type
}
def validate_api() -> List[str]:
"""Validate the API protocols."""
errors = []
protocols = api_protocol_map()
for protocol_name, protocol in protocols.items():
methods = inspect.getmembers(protocol, predicate=inspect.isfunction)
for method_name, method in methods:
if not hasattr(method, '__webmethod__'):
continue
# Only check GET methods
if method.__webmethod__.method != "GET":
continue
hints = get_type_hints(method)
if 'return' not in hints:
errors.append(f"Method {protocol_name}.{method_name} has no return type annotation")
else:
return_type = hints['return']
if is_optional_type(return_type):
errors.append(f"Method {protocol_name}.{method_name} returns Optional type")
for target, validators in _VALIDATORS.items():
for protocol_name, protocol in protocols.items():
for validator in validators:
for method_name, method in _get_methods_by_type(protocol, target).items():
err = validator(method)
if err:
errors.append(f"Method {protocol_name}.{method_name} {err}")
return errors
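The refactor above replaces the single GET-only loop with small validator functions keyed by HTTP method in `_VALIDATORS`. As a hypothetical illustration of the pattern (not part of this commit), a further check could be registered like so:

```python
# Hypothetical addition following the validator signature used above:
# each validator takes a method object and returns an error string or None.
def _validate_api_post_method_has_return_annotation(method) -> str | None:
    hints = get_type_hints(method)
    if "return" not in hints:
        return "has no return type annotation"
    return None

_VALIDATORS["POST"] = [_validate_api_post_method_has_return_annotation]
```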

View file

@@ -8,7 +8,7 @@ First, create a local Kubernetes cluster via Kind:
kind create cluster --image kindest/node:v1.32.0 --name llama-stack-test
```
Start vLLM server as a Kubernetes Pod and Service:
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
```bash
cat <<EOF |kubectl apply -f -
@@ -31,7 +31,12 @@ metadata:
type: Opaque
data:
token: $(HF_TOKEN)
---
```
Next, start the vLLM server as a Kubernetes Deployment and Service:
```bash
cat <<EOF |kubectl apply -f -
apiVersion: apps/v1
kind: Deployment
metadata:
@@ -47,28 +52,23 @@ spec:
app.kubernetes.io/name: vllm
spec:
containers:
- name: llama-stack
image: $(VLLM_IMAGE)
command:
- bash
- -c
- |
MODEL="meta-llama/Llama-3.2-1B-Instruct"
MODEL_PATH=/app/model/$(basename $MODEL)
huggingface-cli login --token $HUGGING_FACE_HUB_TOKEN
huggingface-cli download $MODEL --local-dir $MODEL_PATH --cache-dir $MODEL_PATH
python3 -m vllm.entrypoints.openai.api_server --model $MODEL_PATH --served-model-name $MODEL --port 8000
- name: vllm
image: vllm/vllm-openai:latest
command: ["/bin/sh", "-c"]
args: [
"vllm serve meta-llama/Llama-3.2-1B-Instruct"
]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /app/model
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage
persistentVolumeClaim:

View file

@@ -8,6 +8,7 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
| agents | `inline::meta-reference` |
| datasetio | `inline::localfs` |
| inference | `remote::nvidia` |
| post_training | `remote::nvidia` |
| safety | `remote::nvidia` |
| telemetry | `inline::meta-reference` |
| tool_runtime | `inline::rag-runtime` |
@@ -19,6 +20,12 @@ The `llamastack/distribution-nvidia` distribution consists of the following prov
The following environment variables can be configured:
- `NVIDIA_API_KEY`: NVIDIA API Key (default: ``)
- `NVIDIA_USER_ID`: NVIDIA User ID (default: `llama-stack-user`)
- `NVIDIA_DATASET_NAMESPACE`: NVIDIA Dataset Namespace (default: `default`)
- `NVIDIA_ACCESS_POLICIES`: NVIDIA Access Policies (default: `{}`)
- `NVIDIA_PROJECT_ID`: NVIDIA Project ID (default: `test-project`)
- `NVIDIA_CUSTOMIZER_URL`: NVIDIA Customizer URL (default: `https://customizer.api.nvidia.com`)
- `NVIDIA_OUTPUT_MODEL_DIR`: NVIDIA Output Model Directory (default: `test-example-model@v1`)
- `GUARDRAILS_SERVICE_URL`: URL for the NeMo Guardrails Service (default: `http://0.0.0.0:7331`)
- `INFERENCE_MODEL`: Inference model (default: `Llama3.1-8B-Instruct`)
- `SAFETY_MODEL`: Name of the model to use for safety (default: `meta/llama-3.1-8b-instruct`)