Update the translation script to use Gemini instead of Google Translate

This code itself was also AI generated.
2025-12-14 18:18:16 +00:00 · 2025-09-13 19:43:39 +02:00
parent 46e44ada30
commit 0dd8bd72ff
1 changed files with 283 additions and 84 deletions
--- a/translate.py
+++ b/translate.py
@@ -15,32 +15,37 @@
 # Description:
 #
 # Python script to automatically translate the strings.xml file to each supported
-# language using Google Translate.
+# language. Rewritten by krzys_h with the help of AI to use Gemini instead of
+# Google Translate for more contextual translations.
 #
-# Python installation:
-#   pip install beautifulsoup4
-#   pip install deep-translator
-# NB. For XML formatting:
-#   pip install lxml
+# Requirements:
+#   pip install google-genai beautifulsoup4 lxml
 #
-# References:
-#  * https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-#  * https://realpython.com/beautiful-soup-web-scraper-python/
-#  * https://www.crummy.com/software/BeautifulSoup/bs4/doc/#parsing-xml
-#  * https://www.crummy.com/software/BeautifulSoup/bs4/doc/#xml
+# Env:
+#   export GEMINI_API_KEY="YOUR_API_KEY"
+#
+# To get your own API key, go to:
+# https://aistudio.google.com/app/apikey
 #
 ####################################################################################
+
 import os
+import json
+from typing import Dict, List, Tuple

-from bs4 import BeautifulSoup
-from bs4 import Comment
-from deep_translator import GoogleTranslator
+from google import genai
+from bs4 import BeautifulSoup, Comment

-# List of tuples in the form os:
-#  * Garmin IQ language three letter mnemonic,
-#  * Google Translate language mnemonic,
-#  * Language familiar name (mainly for reference)
-languages: list[tuple[str, str, str]] = [
+# ---------------- Configuration ----------------
+
+# Gemini model name
+MODEL_NAME = "gemini-2.5-flash"
+
+# Language definitions:
+#  * Garmin IQ language three-letter mnemonic (used in resources-XXX folder),
+#  * Unused Google mnemonic kept for reference,
+#  * Human-readable language name for prompts
+languages: List[Tuple[str, str, str]] = [
    ("ara", "ar", "Arabic"),
    ("bul", "bg", "Bulgarian"),
    ("zhs", "zh-CN", "Chinese (Simplified)"),
@@ -79,75 +84,269 @@ languages: list[tuple[str, str, str]] = [
    ("vie", "vi", "Vietnamese"),
 ]

-langLength = len(languages)
+exceptionIds: List[str] = ["AppName", "AppVersionTitle"]

-exceptionIds: list[str] = ["AppName", "AppVersionTitle"]
-titleIds: list[str] = []
+# ---------------- Helpers ----------------

-# def merge(curr: BeautifulSoup, prev: BeautifulSoup) -> BeautifulSoup:
-#     """
-#     Merge the current strings.xml with the previous one, overwriting
-#     the previous strings with the current ones if they exist.
-#     """
-#     out = prev.__copy__()
-#     for s in curr.find(name="strings").find_all(name="string"):
-#         s_prev = out.find(name="string", attrs={"id": s["id"]})
-#         if s_prev:
-#             s_prev.string = s.string
-#         else:
-#             out.find(name="strings").append(s)
-#     return out
+def load_xml_as_soup(path: str) -> BeautifulSoup:
+    if not os.path.exists(path):
+        return BeautifulSoup("", features="xml")
+    with open(path, "r", encoding="utf-8") as f:
+        return BeautifulSoup(f.read().replace("\r", ""), features="xml")

-i = 1
-with open("./resources/strings/strings.xml", "r") as f:
-    c = f.read().replace("\r", "")
-    for l in languages:
-        os.makedirs(f"./resources-{l[0]}/strings/", exist_ok=True)
-        # Old translations will not be automatically updated/removed, use removeTranslations.py
-        try:
-            with open(f"./resources-{l[0]}/strings/strings.xml", "r", encoding="utf-8") as r:
-                prev = BeautifulSoup(r.read().replace("\r", ""), features="xml")
-        except FileNotFoundError:
-            prev = BeautifulSoup("", features="xml")
-        try:
-            with open(f"./resources-{l[0]}/strings/corrections.xml", "r", encoding="utf-8") as r:
-                curr = BeautifulSoup(r.read().replace("\r", ""), features="xml")
-        except FileNotFoundError:
-            curr = BeautifulSoup("", features=["xml"])
-        print(f"{i} of {langLength}: Translating English to {l[2]}")
-        soup = BeautifulSoup(c, features="xml")
-        translator = GoogleTranslator(source="en", target=l[1])
-        soup.find(name="strings").insert_before("\n\n")
-        soup.find(name="strings").insert_before(
-            Comment(
-                f"\n  Generated by Google Translate: English to {l[2]}\n  " +
-                    translator.translate("Generated by Google Translate from English") + "\n"))
-        soup.find(name="strings").insert_before("\n\n")
+def extract_strings(soup: BeautifulSoup) -> Dict[str, str]:
+    out = {}
+    strings_node = soup.find(name="strings")
+    if not strings_node:
+        return out
+    for s in strings_node.find_all(name="string"):
+        sid = s.get("id")
+        if not sid:
+            continue
+        value = s.string if s.string is not None else s.get_text()
+        out[sid] = value if value is not None else ""
+    return out

-        for s in soup.find(name="strings").find_all(name="string"):
+def extract_comments_in_order(soup: BeautifulSoup) -> List[str]:
+    comments = []
+    strings_node = soup.find(name="strings")
+    if not strings_node:
+        return comments
+    for c in strings_node.find_all(string=lambda text: isinstance(text, Comment)):
+        comments.append(str(c))
+    return comments
+
+def replace_comments_in_order(soup: BeautifulSoup, translated_comments: List[str]) -> None:
+    strings_node = soup.find(name="strings")
+    if not strings_node:
+        return
+    idx = 0
+    for c in strings_node.find_all(string=lambda text: isinstance(text, Comment)):
+        if idx < len(translated_comments):
+            c.insert_before("  ")
+            c.replace_with(Comment(translated_comments[idx]))
+        idx += 1
+
+def build_translation_prompt(
+    language_name: str,
+    english_full: Dict[str, str],
+    existing_translations: Dict[str, str],
+    to_translate: Dict[str, str],
+    english_comments: List[str],
+    existing_translated_comments: List[str],
+    generator_comment_en: str,
+) -> str:
+    return f"""
+You are a professional localizer for a smartwatch UI. Translate UI strings into {language_name}.
+
+Rules:
+- Preserve placeholders EXACTLY and do not translate them:
+  - printf style: %s, %d, %f, %1$s, %2$d, etc.
+  - brace placeholders: {{0}}, {{1}}, {{name}}, {{value}}
+  - dollar placeholders: $1, $2
+- Never translate app/product names; keep them unchanged, e.g., "Home Assistant".
+- Do not change punctuation, spacing, or add extra punctuation unless natural in the target language.
+- Keep any whitespace at the beginning or end of string unchanged.
+- Keep meaning accurate and UI-appropriate (short, natural, consistent).
+- Use consistent terminology aligned with existing translations for this language.
+- Do NOT translate the string IDs themselves.
+
+Comments handling:
+- You are given comments from the English XML (in order) and the current translations (same order where available).
+- If a given English comment has not changed since the last revision and a current translation exists at the same index, return the existing translation unchanged.
+- If you believe an existing translation is already correct for the provided English, keep it unchanged; otherwise provide an improved translation.
+- Also translate the generator comment line shown below. We will store both the English and translated lines inside a single XML comment.
+
+Here are the complete English strings for context:
+{json.dumps(english_full, ensure_ascii=False, indent=2)}
+
+Here are existing translations for this language (do not modify these; use for terminology/style consistency):
+{json.dumps(existing_translations, ensure_ascii=False, indent=2)}
+
+Here are the ONLY strings that need new translations (translate the values):
+{json.dumps(to_translate, ensure_ascii=False, indent=2)}
+
+Comments to translate (same order as in the XML):
+{json.dumps(english_comments, ensure_ascii=False, indent=2)}
+
+Existing translated comments (same order; may be fewer items):
+{json.dumps(existing_translated_comments, ensure_ascii=False, indent=2)}
+
+Generator comment (English; translate this too):
+{json.dumps(generator_comment_en, ensure_ascii=False)}
+
+Return only valid JSON with this exact structure and nothing else (no markdown fences, no prose):
+{{
+  "translations": {{ "<STRING_ID>": "<translated string>", ... }},
+  "translated_comments": ["<translated comment 1>", "<translated comment 2>", ...],
+  "generator_comment_translated": "<translated generator comment line>"
+}}
+- "translations" must have exactly the keys provided in "to_translate".
+- "translated_comments" must have the same number of items and order as the input comments list.
+- For comments that should remain unchanged based on the rules above, return the existing translation verbatim.
+""".strip()
+
+# ---------------- Main translation logic ----------------
+
+def translate_language(
+    client: genai.Client,
+    lang_tuple: Tuple[str, str, str],
+    english_soup: BeautifulSoup,
+    english_strings: Dict[str, str],
+) -> None:
+    garmin_code, _unused, language_name = lang_tuple
+
+    # Ensure output directory exists
+    out_dir = f"./resources-{garmin_code}/strings/"
+    os.makedirs(out_dir, exist_ok=True)
+
+    # Load previous translations and corrections
+    prev_soup = load_xml_as_soup(os.path.join(out_dir, "strings.xml"))
+    corrections_soup = load_xml_as_soup(os.path.join(out_dir, "corrections.xml"))
+
+    prev_map = extract_strings(prev_soup)
+    corrections_map = extract_strings(corrections_soup)
+
+    # Build a fresh soup for this language from English source
+    soup = BeautifulSoup(str(english_soup), features="xml")
+
+    # Collect comments
+    english_comments = extract_comments_in_order(english_soup)
+    existing_translated_comments = extract_comments_in_order(prev_soup)
+
+    # Detect any mention of Google Translate anywhere in the previous XML
+    all_comments_text_prev = [
+        str(c) for c in prev_soup.find_all(string=lambda t: isinstance(t, Comment))
+    ]
+    mentions_google_translate = any("google translate" in c.lower() for c in all_comments_text_prev)
+
+    # Build generator comment English line (the translated line will be returned by the API)
+    if mentions_google_translate:
+        generator_comment_en = f"Generated by Google Translate and {MODEL_NAME} from English to {language_name}"
+    else:
+        generator_comment_en = f"Generated by {MODEL_NAME} from English to {language_name}"
+
+    # Decide which strings need translation (not in corrections, not in previous)
+    to_translate_map: Dict[str, str] = {}
+    final_values: Dict[str, str] = {}
+
+    for s in soup.find_all(name="string"):
+        sid = s.get("id")
+        if not sid:
+            continue
+        if sid in exceptionIds:
+            # Keep English as-is for exception IDs
+            final_values[sid] = s.get_text()
+            continue
+
+        if sid in corrections_map and corrections_map[sid] is not None:
+            final_values[sid] = corrections_map[sid]
+        elif sid in prev_map and prev_map[sid] is not None:
+            final_values[sid] = prev_map[sid]
+        else:
+            to_translate_map[sid] = s.get_text()
+
+    # If there are no new strings to translate, skip this language entirely
+    if not to_translate_map:
+        print(f"  Skipping {language_name}: no new strings to translate.")
+        return
+
+    # Prepare context (always include full English strings)
+    english_context = english_strings
+    existing_translations = {k: v for k, v in prev_map.items()}
+    if corrections_map:
+        existing_translations.update(corrections_map)
+
+    # Translate all at once; force JSON output but do not enforce a schema
+    prompt = build_translation_prompt(
+        language_name=language_name,
+        english_full=english_context,
+        existing_translations=existing_translations,
+        to_translate=to_translate_map,
+        english_comments=english_comments,
+        existing_translated_comments=existing_translated_comments,
+        generator_comment_en=generator_comment_en,
+    )
+
+    config = genai.types.GenerateContentConfig(
+        temperature=0,
+        response_mime_type="application/json",
+    )
+
+    resp = client.models.generate_content(
+        model=MODEL_NAME,
+        contents=prompt,
+        config=config,
+    )
+
+    data = getattr(resp, "parsed", None)
+    if data is None:
+        txt = getattr(resp, "text", None)
+        if not txt:
+            try:
+                txt = resp.candidates[0].content.parts[0].text
+            except Exception:
+                txt = ""
+        if not txt.strip():
+            raise RuntimeError("Empty response from model; cannot parse translations.")
+        data = json.loads(txt)
+
+    translations = data.get("translations", {}) or {}
+    for sid, translated in translations.items():
+        if sid in to_translate_map:
+            final_values[sid] = translated
+
+    translated_comments_all: List[str] = data.get("translated_comments", []) or []
+    generator_comment_translated: str = data.get("generator_comment_translated", "") or ""
+
+    # Apply final values to the soup
+    for s in soup.find_all(name="string"):
+        sid = s.get("id")
+        if not sid:
+            continue
+        if sid in final_values:
+            val = final_values[sid]
            s.insert_before("  ")
-            if s["id"] in exceptionIds:
-                continue
+            s.string = val

-            s_curr = curr.find(name="string", attrs={"id": s["id"]})
-            if s_curr:
-                s.string = s_curr.string
-            else:
-                s_prev = prev.find(name="string", attrs={"id": s["id"]})
-                if s_prev:
-                    s.string = s_prev.string
-                else:
-                    a = translator.translate(s.string)
-                    if s["id"] in titleIds:
-                        s.string = a.title()
-                    else:
-                        s.string = a
-        for s in soup.find(name="strings").find_all(
-                string=lambda text: isinstance(text, Comment)):
-            s.insert_before("  ")
-            s.replace_with(Comment(" " + translator.translate(s) + " "))
+    # Replace comments with translated versions (order-preserving)
+    if translated_comments_all:
+        replace_comments_in_order(soup, translated_comments_all)

-        # print(str(soup))
-        with open(f"./resources-{l[0]}/strings/strings.xml", "wb") as w:
-            w.write(soup.encode("utf-8") + b"\n")
-        i += 1
+    # Insert the generator comment (English + translated) before <strings>
+    strings_node = soup.find(name="strings")
+    if strings_node:
+        strings_node.insert_before("\n\n")
+        combined = f"\n  {generator_comment_en}\n  {generator_comment_translated}\n"
+        strings_node.insert_before(Comment(combined))
+        strings_node.insert_before("\n\n")
+
+    # Write output
+    out_path = os.path.join(out_dir, "strings.xml")
+    with open(out_path, "wb") as w:
+        w.write(soup.encode("utf-8") + b"\n")
+
+def main():
+    # Init client
+    client = genai.Client()
+
+    # Load English source
+    src_path = "./resources/strings/strings.xml"
+    if not os.path.exists(src_path):
+        raise FileNotFoundError(f"Missing source file: {src_path}")
+
+    with open(src_path, "r", encoding="utf-8") as f:
+        english_xml = f.read().replace("\r", "")
+    english_soup = BeautifulSoup(english_xml, features="xml")
+    english_strings = extract_strings(english_soup)
+
+    total_langs = len(languages)
+    for i, lang in enumerate(languages, start=1):
+        print(f"{i} of {total_langs}: Translating English to {lang[2]}")
+        try:
+            translate_language(client, lang, english_soup, english_strings)
+        except Exception as e:
+            print(f"  Error translating {lang[2]}: {e}")
+
+if __name__ == "__main__":
+    main()