[BUGFIX] Fix #9199 - incorrect api token parameter in create_pretrained_tokenizer

Fix the `Tokenizer.from_pretrained` call to use the correct parameter name for the API token (`token` instead of `auth_token`).
Do not pass the token parameter to `Tokenizer.from_pretrained` at all when it is empty or `None`.
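
For orientation, a minimal usage sketch of the fixed helper. It assumes `create_pretrained_tokenizer` and `token_counter` are importable from the top-level `litellm` package (the same helpers the updated test below exercises); the public model id is only an illustrative placeholder.

```python
# Sketch only: assumes litellm exposes create_pretrained_tokenizer and
# token_counter at the package top level; model ids are illustrative.
import os

from litellm import create_pretrained_tokenizer, token_counter

# No token set -> Tokenizer.from_pretrained is called without any token kwarg.
public_tokenizer = create_pretrained_tokenizer("hf-internal-testing/llama-tokenizer")

# Token set -> forwarded to tokenizers as `token=` (not the old `auth_token=`),
# with a fallback to an unauthenticated download if that call fails.
gated_tokenizer = create_pretrained_tokenizer(
    "meta-llama/Llama-3.1-70B", auth_token=os.getenv("HUGGINGFACE_API_KEY")
)

print(token_counter(custom_tokenizer=gated_tokenizer, text="sample text"))
```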
Vladislav Vinogradov 2025-03-31 09:58:33 +03:00 committed by Vladislav Vinogradov
parent 33ead69c0a
commit 4324b0c142
2 changed files with 21 additions and 8 deletions


@@ -1705,15 +1705,18 @@ def create_pretrained_tokenizer(
         dict: A dictionary with the tokenizer and its type.
     """
-    try:
-        tokenizer = Tokenizer.from_pretrained(
-            identifier, revision=revision, auth_token=auth_token  # type: ignore
-        )
-    except Exception as e:
-        verbose_logger.error(
-            f"Error creating pretrained tokenizer: {e}. Defaulting to version without 'auth_token'."
-        )
+    if not auth_token:
         tokenizer = Tokenizer.from_pretrained(identifier, revision=revision)
+    else:
+        try:
+            tokenizer = Tokenizer.from_pretrained(
+                identifier, revision=revision, token=auth_token  # type: ignore
+            )
+        except Exception as e:
+            verbose_logger.error(
+                f"Error creating pretrained tokenizer: {e}. Defaulting to version without 'token'."
+            )
+            tokenizer = Tokenizer.from_pretrained(identifier, revision=revision)
     return {"type": "huggingface_tokenizer", "tokenizer": tokenizer}
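
The same guard can also be expressed by building the keyword arguments conditionally. The following is an equivalent sketch, not the code merged above; it uses only the `tokenizers` calls that appear in the diff, and `load_hf_tokenizer` is a hypothetical helper name.

```python
# Equivalent formulation of the guard above (illustration only, not the merged code).
from typing import Optional

from tokenizers import Tokenizer


def load_hf_tokenizer(
    identifier: str, revision: str = "main", auth_token: Optional[str] = None
) -> Tokenizer:
    kwargs = {"revision": revision}
    if auth_token:
        # Use the `token` keyword the commit switches to, instead of the old `auth_token`.
        kwargs["token"] = auth_token
    try:
        return Tokenizer.from_pretrained(identifier, **kwargs)
    except Exception:
        # Same fallback as the merged code: retry without authentication.
        return Tokenizer.from_pretrained(identifier, revision=revision)
```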


@@ -151,6 +151,16 @@ def test_tokenizers():
             llama3_tokens_1 == llama3_tokens_2
         ), "Custom tokenizer is not being used! It has been configured to use the same tokenizer as the built in llama3 tokenizer and the results should be the same."
+        if hf_api_key := os.getenv("HUGGINGFACE_API_KEY"):
+            private_tokenizer = create_pretrained_tokenizer(
+                "meta-llama/Llama-3.1-70B", auth_token=hf_api_key
+            )
+            private_tokens = token_counter(
+                custom_tokenizer=private_tokenizer, text=sample_text
+            )
+            print(f"private model tokens: {private_tokens}")
+            assert private_tokens > 0
         print("test tokenizer: It worked!")
     except Exception as e:
         pytest.fail(f"An exception occured: {e}")