Update the translation script to use Gemini instead of Google Translate

This code itself was also AI generated.
This commit is contained in:
krzys-h
2025-09-13 19:43:39 +02:00
parent 46e44ada30
commit 0dd8bd72ff

View File

@@ -15,32 +15,37 @@
# Description: # Description:
# #
# Python script to automatically translate the strings.xml file to each supported # Python script to automatically translate the strings.xml file to each supported
# language using Google Translate. # language. Rewritten by krzys_h with the help of AI to use Gemini instead of
# Google Translate for more contextual translations.
# #
# Python installation: # Requirements:
# pip install beautifulsoup4 # pip install google-genai beautifulsoup4 lxml
# pip install deep-translator
# NB. For XML formatting:
# pip install lxml
# #
# References: # Env:
# * https://www.crummy.com/software/BeautifulSoup/bs4/doc/ # export GEMINI_API_KEY="YOUR_API_KEY"
# * https://realpython.com/beautiful-soup-web-scraper-python/ #
# * https://www.crummy.com/software/BeautifulSoup/bs4/doc/#parsing-xml # To get your own API key, go to:
# * https://www.crummy.com/software/BeautifulSoup/bs4/doc/#xml # https://aistudio.google.com/app/apikey
# #
#################################################################################### ####################################################################################
import os import os
import json
from typing import Dict, List, Tuple
from bs4 import BeautifulSoup from google import genai
from bs4 import Comment from bs4 import BeautifulSoup, Comment
from deep_translator import GoogleTranslator
# List of tuples in the form os: # ---------------- Configuration ----------------
# * Garmin IQ language three letter mnemonic,
# * Google Translate language mnemonic, # Gemini model name
# * Language familiar name (mainly for reference) MODEL_NAME = "gemini-2.5-flash"
languages: list[tuple[str, str, str]] = [
# Language definitions:
# * Garmin IQ language three-letter mnemonic (used in resources-XXX folder),
# * Unused Google mnemonic kept for reference,
# * Human-readable language name for prompts
languages: List[Tuple[str, str, str]] = [
("ara", "ar", "Arabic"), ("ara", "ar", "Arabic"),
("bul", "bg", "Bulgarian"), ("bul", "bg", "Bulgarian"),
("zhs", "zh-CN", "Chinese (Simplified)"), ("zhs", "zh-CN", "Chinese (Simplified)"),
@@ -79,75 +84,269 @@ languages: list[tuple[str, str, str]] = [
("vie", "vi", "Vietnamese"), ("vie", "vi", "Vietnamese"),
] ]
langLength = len(languages) exceptionIds: List[str] = ["AppName", "AppVersionTitle"]
exceptionIds: list[str] = ["AppName", "AppVersionTitle"] # ---------------- Helpers ----------------
titleIds: list[str] = []
def load_xml_as_soup(path: str) -> BeautifulSoup:
    """Parse the XML file at ``path`` into a BeautifulSoup document.

    A missing file is not an error: an empty XML document is returned instead,
    so callers can handle "file absent" and "file without strings" the same
    way. Carriage returns are removed from the text before parsing.

    Args:
        path: Location of the XML file to read (decoded as UTF-8).

    Returns:
        The parsed document, or an empty document when the file does not exist.

    Raises:
        OSError: For any failure other than the file being absent (for example
            a permission problem), so that an unreadable file is never
            mistaken for a missing one.
    """
    # EAFP: open directly instead of checking existence first, which avoids a
    # window between the check and the open.
    try:
        with open(path, "r", encoding="utf-8") as f:
            xml_text = f.read()
    except FileNotFoundError:
        return BeautifulSoup("", features="xml")
    return BeautifulSoup(xml_text.replace("\r", ""), features="xml")
def extract_strings(soup: BeautifulSoup) -> Dict[str, str]:
    """Map every ``<string id="...">`` under ``<strings>`` to its text.

    Args:
        soup: Parsed strings XML document.

    Returns:
        A dict from string id to string value. It is empty when the document
        has no ``<strings>`` element. Elements without an ``id`` attribute are
        left out.
    """
    root = soup.find(name="strings")
    if not root:
        return {}

    result: Dict[str, str] = {}
    for node in root.find_all(name="string"):
        string_id = node.get("id")
        if string_id:
            # Prefer the element's single string; when there is none, fall
            # back to its text content.
            text = node.string
            if text is None:
                text = node.get_text()
            result[string_id] = "" if text is None else text
    return result
def extract_comments_in_order(soup: BeautifulSoup) -> List[str]:
    """Return the text of each XML comment inside ``<strings>``.

    Args:
        soup: Parsed strings XML document.

    Returns:
        The comment texts in the order ``find_all`` yields them, or an empty
        list when the document has no ``<strings>`` element.
    """
    root = soup.find(name="strings")
    if not root:
        return []
    comment_nodes = root.find_all(string=lambda text: isinstance(text, Comment))
    return [str(node) for node in comment_nodes]
def replace_comments_in_order(soup: BeautifulSoup, translated_comments: List[str]) -> None:
    """Swap the comments inside ``<strings>`` for their translations, in place.

    Comments are paired with ``translated_comments`` by position. When the
    list is shorter than the number of comments, the remaining comments are
    left untouched; surplus list items are ignored.

    Args:
        soup: Document to modify.
        translated_comments: Replacement comment texts, in document order.
    """
    root = soup.find(name="strings")
    if not root:
        return

    comment_nodes = root.find_all(string=lambda text: isinstance(text, Comment))
    # zip stops at the shorter sequence, which gives the positional pairing
    # described above.
    for node, replacement in zip(comment_nodes, translated_comments):
        node.insert_before(" ")
        node.replace_with(Comment(replacement))
def build_translation_prompt(
    language_name: str,
    english_full: Dict[str, str],
    existing_translations: Dict[str, str],
    to_translate: Dict[str, str],
    english_comments: List[str],
    existing_translated_comments: List[str],
    generator_comment_en: str,
) -> str:
    """Assemble the prompt asking the model to translate strings and comments.

    Args:
        language_name: Human-readable target language, named in the prompt.
        english_full: All English strings by id, given as context.
        existing_translations: Translations already available for the target
            language, given for terminology consistency.
        to_translate: The id-to-English subset that needs new translations.
        english_comments: English XML comments, in document order.
        existing_translated_comments: Previously translated comments, in
            document order.
        generator_comment_en: English "generated by" line to translate.

    Returns:
        The prompt text. Each data argument is embedded as JSON with
        non-ASCII characters kept as-is, and the prompt ends with the JSON
        structure the model is asked to return.
    """

    def pretty(value) -> str:
        # Indented JSON; ensure_ascii=False keeps non-ASCII text readable.
        return json.dumps(value, ensure_ascii=False, indent=2)

    prompt_lines = [
        f"You are a professional localizer for a smartwatch UI. Translate UI strings into {language_name}.",
        "Rules:",
        "- Preserve placeholders EXACTLY and do not translate them:",
        "- printf style: %s, %d, %f, %1$s, %2$d, etc.",
        "- brace placeholders: {0}, {1}, {name}, {value}",
        "- dollar placeholders: $1, $2",
        '- Never translate app/product names; keep them unchanged, e.g., "Home Assistant".',
        "- Do not change punctuation, spacing, or add extra punctuation unless natural in the target language.",
        "- Keep any whitespace at the beginning or end of string unchanged.",
        "- Keep meaning accurate and UI-appropriate (short, natural, consistent).",
        "- Use consistent terminology aligned with existing translations for this language.",
        "- Do NOT translate the string IDs themselves.",
        "Comments handling:",
        "- You are given comments from the English XML (in order) and the current translations (same order where available).",
        "- If a given English comment has not changed since the last revision and a current translation exists at the same index, return the existing translation unchanged.",
        "- If you believe an existing translation is already correct for the provided English, keep it unchanged; otherwise provide an improved translation.",
        "- Also translate the generator comment line shown below. We will store both the English and translated lines inside a single XML comment.",
        "Here are the complete English strings for context:",
        pretty(english_full),
        "Here are existing translations for this language (do not modify these; use for terminology/style consistency):",
        pretty(existing_translations),
        "Here are the ONLY strings that need new translations (translate the values):",
        pretty(to_translate),
        "Comments to translate (same order as in the XML):",
        pretty(english_comments),
        "Existing translated comments (same order; may be fewer items):",
        pretty(existing_translated_comments),
        "Generator comment (English; translate this too):",
        json.dumps(generator_comment_en, ensure_ascii=False),
        "Return only valid JSON with this exact structure and nothing else (no markdown fences, no prose):",
        "{",
        '"translations": { "<STRING_ID>": "<translated string>", ... },',
        '"translated_comments": ["<translated comment 1>", "<translated comment 2>", ...],',
        '"generator_comment_translated": "<translated generator comment line>"',
        "}",
        '- "translations" must have exactly the keys provided in "to_translate".',
        '- "translated_comments" must have the same number of items and order as the input comments list.',
        "- For comments that should remain unchanged based on the rules above, return the existing translation verbatim.",
    ]
    return "\n".join(prompt_lines)
# ---------------- Main translation logic ----------------
def _response_to_dict(resp) -> dict:
    """Return the JSON object carried by a model response.

    Uses the response's ``parsed`` attribute when it is set; otherwise decodes
    the response text, taken from ``text`` or, failing that, from the first
    part of the first candidate.

    Raises:
        RuntimeError: If no text is available or the payload is not a JSON
            object.
        json.JSONDecodeError: If the text is not valid JSON.
    """
    data = getattr(resp, "parsed", None)
    if data is None:
        txt = getattr(resp, "text", None)
        if not txt:
            # Only the failures that a missing or empty candidate structure
            # can produce are treated as "no text"; anything else propagates.
            try:
                txt = resp.candidates[0].content.parts[0].text
            except (AttributeError, IndexError, TypeError):
                txt = ""
        if not txt or not txt.strip():
            raise RuntimeError("Empty response from model; cannot parse translations.")
        data = json.loads(txt)
    if not isinstance(data, dict):
        raise RuntimeError(
            f"Unexpected response from model: expected a JSON object, got {type(data).__name__}."
        )
    return data


def translate_language(
    client: genai.Client,
    lang_tuple: Tuple[str, str, str],
    english_soup: BeautifulSoup,
    english_strings: Dict[str, str],
) -> None:
    """Create or update ``strings.xml`` for one target language.

    Values are chosen per string id in this order: the English text for ids
    listed in ``exceptionIds``, then the language's ``corrections.xml``, then
    the language's previous ``strings.xml``, and finally a new translation
    requested from the model. When no id needs a new translation the function
    returns without calling the model or writing anything.

    Args:
        client: Gemini client used for the single generate_content request.
        lang_tuple: (Garmin resource code, unused mnemonic, language name).
        english_soup: Parsed English ``strings.xml``; copied, never modified.
        english_strings: English strings by id, sent to the model as context.

    Raises:
        RuntimeError: If the model response is empty, is not a JSON object, or
            does not contain a string translation for every requested id.
            Nothing is written in that case, so the untranslated ids are
            requested again on the next run instead of being stored as
            English text and then mistaken for existing translations.
    """
    garmin_code, _unused, language_name = lang_tuple

    # Ensure output directory exists
    out_dir = f"./resources-{garmin_code}/strings/"
    os.makedirs(out_dir, exist_ok=True)

    # Load previous translations and corrections
    prev_soup = load_xml_as_soup(os.path.join(out_dir, "strings.xml"))
    corrections_soup = load_xml_as_soup(os.path.join(out_dir, "corrections.xml"))
    prev_map = extract_strings(prev_soup)
    corrections_map = extract_strings(corrections_soup)

    # Build a fresh soup for this language from English source
    soup = BeautifulSoup(str(english_soup), features="xml")

    # Collect comments
    english_comments = extract_comments_in_order(english_soup)
    existing_translated_comments = extract_comments_in_order(prev_soup)

    # Detect any mention of Google Translate anywhere in the previous XML
    mentions_google_translate = any(
        "google translate" in str(c).lower()
        for c in prev_soup.find_all(string=lambda t: isinstance(t, Comment))
    )

    # Build generator comment English line (the translated line will be returned by the API)
    if mentions_google_translate:
        generator_comment_en = f"Generated by Google Translate and {MODEL_NAME} from English to {language_name}"
    else:
        generator_comment_en = f"Generated by {MODEL_NAME} from English to {language_name}"

    # Decide which strings need translation (not in corrections, not in previous)
    to_translate_map: Dict[str, str] = {}
    final_values: Dict[str, str] = {}
    for s in soup.find_all(name="string"):
        sid = s.get("id")
        if not sid:
            continue
        if sid in exceptionIds:
            # Keep English as-is for exception IDs
            final_values[sid] = s.get_text()
        elif sid in corrections_map:
            final_values[sid] = corrections_map[sid]
        elif sid in prev_map:
            final_values[sid] = prev_map[sid]
        else:
            to_translate_map[sid] = s.get_text()

    # If there are no new strings to translate, skip this language entirely
    if not to_translate_map:
        print(f" Skipping {language_name}: no new strings to translate.")
        return

    # Context for terminology consistency; corrections override previous output
    existing_translations = dict(prev_map)
    existing_translations.update(corrections_map)

    # Translate all at once; force JSON output but do not enforce a schema
    prompt = build_translation_prompt(
        language_name=language_name,
        english_full=english_strings,
        existing_translations=existing_translations,
        to_translate=to_translate_map,
        english_comments=english_comments,
        existing_translated_comments=existing_translated_comments,
        generator_comment_en=generator_comment_en,
    )
    config = genai.types.GenerateContentConfig(
        temperature=0,
        response_mime_type="application/json",
    )
    resp = client.models.generate_content(
        model=MODEL_NAME,
        contents=prompt,
        config=config,
    )
    data = _response_to_dict(resp)

    # Validate the translations before touching the document.
    translations = data.get("translations") or {}
    if not isinstance(translations, dict):
        raise RuntimeError('Model response field "translations" is not a JSON object.')
    missing = [sid for sid in to_translate_map if not isinstance(translations.get(sid), str)]
    if missing:
        raise RuntimeError(
            "Model response has no usable translation for: " + ", ".join(missing)
        )
    # Only requested ids are taken; anything extra in the response is ignored.
    for sid in to_translate_map:
        final_values[sid] = translations[sid]

    translated_comments_all = data.get("translated_comments") or []
    if not isinstance(translated_comments_all, list) or not all(
        isinstance(c, str) for c in translated_comments_all
    ):
        raise RuntimeError('Model response field "translated_comments" is not a list of strings.')
    if len(translated_comments_all) != len(english_comments):
        # Comments are replaced by position, so a count mismatch leaves some
        # comments in English or drops surplus translations.
        print(
            f" Warning: expected {len(english_comments)} translated comments for "
            f"{language_name}, got {len(translated_comments_all)}."
        )

    generator_comment_translated = data.get("generator_comment_translated") or ""
    if not isinstance(generator_comment_translated, str):
        generator_comment_translated = ""

    # Apply final values to the soup
    for s in soup.find_all(name="string"):
        sid = s.get("id")
        if sid and sid in final_values:
            s.insert_before(" ")
            s.string = final_values[sid]

    # Replace comments with translated versions (order-preserving)
    if translated_comments_all:
        replace_comments_in_order(soup, translated_comments_all)

    # Insert the generator comment (English + translated) before <strings>
    strings_node = soup.find(name="strings")
    if strings_node:
        combined = f"\n {generator_comment_en}\n"
        if generator_comment_translated:
            # Skipped when empty so the comment has no whitespace-only line.
            combined += f" {generator_comment_translated}\n"
        strings_node.insert_before("\n\n")
        strings_node.insert_before(Comment(combined))
        strings_node.insert_before("\n\n")

    # Write output
    out_path = os.path.join(out_dir, "strings.xml")
    with open(out_path, "wb") as w:
        w.write(soup.encode("utf-8") + b"\n")
def main():
    """Translate the English strings file into every configured language.

    Creates the Gemini client, reads ``./resources/strings/strings.xml`` and
    calls ``translate_language`` once per entry in ``languages``. A failure
    for one language is printed and the loop moves on to the next language.

    Raises:
        FileNotFoundError: If the English source file does not exist.
    """
    # Init client
    gemini_client = genai.Client()

    # Load English source
    source_path = "./resources/strings/strings.xml"
    if not os.path.exists(source_path):
        raise FileNotFoundError(f"Missing source file: {source_path}")
    with open(source_path, "r", encoding="utf-8") as source_file:
        source_xml = source_file.read().replace("\r", "")
    source_soup = BeautifulSoup(source_xml, features="xml")
    source_strings = extract_strings(source_soup)

    language_count = len(languages)
    for position, language in enumerate(languages, start=1):
        display_name = language[2]
        print(f"{position} of {language_count}: Translating English to {display_name}")
        try:
            translate_language(gemini_client, language, source_soup, source_strings)
        except Exception as e:
            # Best effort per language: report the problem and keep going.
            print(f" Error translating {display_name}: {e}")


if __name__ == "__main__":
    main()