Skip to content

Commit 889c2aa

Browse files
committed
f
1 parent 05c82f4 commit 889c2aa

File tree

2 files changed

+30
-1
lines changed

2 files changed

+30
-1
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,4 @@ book/*
3636
hacktricks-preprocessor.log
3737
hacktricks-preprocessor-error.log
3838
searchindex.js
39+
**.pyc

scripts/translator.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,16 @@
1818
DISALLOWED_SPECIAL = "<|endoftext|>"
1919
REPLACEMENT_TOKEN = "<END_OF_TEXT>"
2020

21+
# Ordered (model-name prefix, tiktoken encoding name) pairs consulted by
# _get_encoding_for_model() when tiktoken raises KeyError for a model name.
# The first prefix that the lower-cased model name starts with wins, so the
# more specific "gpt-4o" and "gpt-4.1" entries must stay ahead of "gpt-4".
TOKENIZER_FALLBACKS = [
22+
("gpt-5", "o200k_base"),
23+
("gpt-4o", "o200k_base"),
24+
("gpt-4.1", "o200k_base"),
25+
("gpt-4", "cl100k_base"),
26+
("gpt-3.5", "cl100k_base"),
27+
]
28+
29+
# Encoding name used when no prefix in TOKENIZER_FALLBACKS matches the model.
FINAL_TOKENIZER_FALLBACK = "o200k_base"
30+
2131
def run_git_command_with_retry(cmd, max_retries=1, delay=5, **kwargs):
2232
"""
2333
Run a git command with retry logic.
@@ -59,8 +69,24 @@ def _sanitize(text: str) -> str:
5969
"""
6070
return text.replace(DISALLOWED_SPECIAL, REPLACEMENT_TOKEN)
6171

72+
def _get_encoding_for_model(model: str):
    """
    Resolve the tiktoken encoding to use for ``model``.

    tiktoken is asked first. If it raises ``KeyError`` for the model name,
    the lower-cased name is matched against the prefixes listed in
    ``TOKENIZER_FALLBACKS`` (first match wins); when none matches,
    ``FINAL_TOKENIZER_FALLBACK`` is used. The chosen fallback is printed.

    Args:
        model: Model name as passed to ``tiktoken.encoding_for_model``.

    Returns:
        The encoding object returned by tiktoken.
    """
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        # Prefix matching is case-insensitive on the model side only; the
        # table entries are compared exactly as written.
        model_key = model.lower()
        encoding_name = next(
            (
                candidate
                for prefix, candidate in TOKENIZER_FALLBACKS
                if model_key.startswith(prefix)
            ),
            FINAL_TOKENIZER_FALLBACK,
        )
        print(f"Tokenizer for model {model} not found. Falling back to {encoding_name}.")
        return tiktoken.get_encoding(encoding_name)
87+
6288
def reportTokens(prompt, model):
63-
encoding = tiktoken.encoding_for_model(model)
89+
encoding = _get_encoding_for_model(model)
6490
# print number of tokens in light gray, with first 50 characters of prompt in green. if truncated, show that it is truncated
6591
#print("\033[37m" + str(len(encoding.encode(prompt))) + " tokens\033[0m" + " in prompt: " + "\033[92m" + prompt[:50] + "\033[0m" + ("..." if len(prompt) > 50 else ""))
6692
prompt = _sanitize(prompt)
@@ -316,6 +342,7 @@ def copy_dirs(source_path, dest_path, folder_names):
316342
print(f"Error: {source_folder} does not exist.")
317343
else:
318344
# Copy the theme folder
345+
os.makedirs(os.path.dirname(destination_folder.rstrip(os.sep)) or dest_path, exist_ok=True)
319346
shutil.copytree(source_folder, destination_folder)
320347
print(f"Copied {folder_name} folder from {source_folder} to {destination_folder}")
321348

@@ -326,6 +353,7 @@ def move_files_to_push(source_path, dest_path, relative_file_paths):
326353
if not os.path.exists(source_filepath):
327354
print(f"Error: {source_filepath} does not exist.")
328355
else:
356+
os.makedirs(os.path.dirname(dest_filepath), exist_ok=True)
329357
shutil.copy2(source_filepath, dest_filepath)
330358
print(f"[+] Copied {file_path}")
331359

0 commit comments

Comments
 (0)