From 9c24ce2b08e3190aeede47ba792707b606eb51c4 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:39:01 +0800 Subject: [PATCH 01/18] Add files via upload --- .../sync-en-cloud-toc-changes-to-zh.py | 616 ++++++++++++++++++ 1 file changed, 616 insertions(+) create mode 100644 .github/workflows/sync-en-cloud-toc-changes-to-zh.py diff --git a/.github/workflows/sync-en-cloud-toc-changes-to-zh.py b/.github/workflows/sync-en-cloud-toc-changes-to-zh.py new file mode 100644 index 0000000000000..da2c16b10da20 --- /dev/null +++ b/.github/workflows/sync-en-cloud-toc-changes-to-zh.py @@ -0,0 +1,616 @@ +# This script is used to sync the changes from the English TOC files to the Chinese TOC files. Detailed steps are as follows: +# 1. The script automatically gets the latest commit of the English TOC file from GitHub and the earlier commit of the English TOC file from the Chinese TOC file in the same repository. +# 2. It compares two English commits and performs the following operations: +# - If the commit numbers are the same, skip the update for that TOC file. +# - If the commit numbers are different, update the Chinese TOC with the following operations: +# a. Updates the Chinese TOC according to the English diff. +# b. Generates bilingual terms based on the old version of the Chinese and English TOC files. +# c. Update the modified English lines in the Chinese TOC with Chinese based on the bilingual terms. +# d. Translate the remaining English in the Chinese TOC using AI. + +import re +import os +import sys +import json +import logging +from urllib.request import urlopen, Request +from urllib.error import URLError, HTTPError +from google import genai + +REPO_OWNER = "qiancai" +REPO_NAME = "docs" +EN_BRANCH = "release-8.5" +ZH_BRANCH = "i18n-zh-release-8.5" +TOC_FILE_NAMES = ["TOC-tidb-cloud-starter.md", "TOC-tidb-cloud-essential.md", "TOC-tidb-cloud.md"] +TOC_HEADER_LINE_COUNT = 3 # The Starting line to create bilingual terms +TEMP_TOC_FILENAME = "en_cloud_toc.md" # The filename of the temporary English TOC content + + +# ========== Logging Configuration ========== +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# ========== AI Configuration ========== +MODEL_NAME = "gemini-2.0-flash" +genai_token = os.getenv("GEMINI_API_TOKEN") +if not genai_token: + logger.error("GEMINI_API_TOKEN environment variable must be set") + sys.exit(1) + +client = genai.Client(api_key=genai_token) + +def read_file_from_repo(file_path): + """Read a file from the current repository""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except IOError as e: + logger.error(f"Error reading file {file_path}: {e}") + return None + +def write_file_to_repo(file_path, content): + """Write content to a file in the current repository""" + try: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + return True + except IOError as e: + logger.error(f"Error writing file {file_path}: {e}") + return False + +def extract_commit_from_target_file(target_file): + """Extract the EN commit SHA from the target TOC file comment""" + try: + content = read_file_from_repo(target_file) + if not content: + return None + + lines = content.split('\n') + for i, line in enumerate(lines): + if i > 10: # Only check first 10 lines + break + + # Look for the pattern: + if "EN commit:" in line: + # Extract commit SHA using regex + match = re.search(r'EN commit:\s*([a-f0-9]{40})', line) + if match: + commit_sha = 
match.group(1) + logger.info(f"Found earlier EN commit in target file: {commit_sha}") + return commit_sha + + logger.error("No EN commit comment found in target file") + return None + + except Exception as e: + logger.error(f"Error reading target file for commit extraction: {e}") + return None + +def get_latest_commit_sha(repo_owner, repo_name, branch, toc_file_name): + """Get the latest commit SHA for a specific file on GitHub""" + try: + # Use GitHub API to get commits for the specific file + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/commits" + params = f"?sha={branch}&path={toc_file_name}&per_page=1" + full_url = url + params + headers = { + "User-Agent": "tidb-docs-sync/1.0", + "Accept": "application/vnd.github.v3+json", + } + gh_token = os.getenv("GITHUB_TOKEN") + if gh_token: + headers["Authorization"] = f"Bearer {gh_token}" + req = Request(full_url, headers=headers) + + with urlopen(req) as resp: + data = json.loads(resp.read().decode('utf-8')) + + if data and len(data) > 0: + latest_commit = data[0]['sha'] + logger.info(f"Latest commit: {latest_commit}") + return latest_commit + else: + logger.warning("No commits found for the specified file") + return None + + except (URLError, HTTPError, json.JSONDecodeError) as e: + logger.error(f"Error fetching latest commit: {e}") + return None + +def get_github_compare_diff(base_commit, head_commit): + """Fetch unified diff from GitHub compare endpoint (.diff) for the repo {REPO_OWNER}/{REPO_NAME}""" + try: + url = f"https://github.com/{REPO_OWNER}/{REPO_NAME}/compare/{base_commit}...{head_commit}.diff" + logger.info(f"Fetching compare diff from: {url}") + headers = { + "User-Agent": "tidb-docs-sync/1.0", + "Accept": "application/vnd.github.v3.diff", + } + gh_token = os.getenv("GITHUB_TOKEN") + if gh_token: + headers["Authorization"] = f"Bearer {gh_token}" + req = Request(url, headers=headers) + with urlopen(req, timeout=20) as resp: + content_bytes = resp.read() + # GitHub serves UTF-8 + return content_bytes.decode("utf-8", errors="replace") + except (URLError, HTTPError) as e: + logger.error(f"Error fetching GitHub compare diff: {e}") + return None + +def parse_github_diff_for_file(diff_text, target_rel_path): + """Parse the multi-file unified diff and return hunks for the specified file. + + Returns a list of hunks: {old_start, old_count, new_start, new_count, lines} + where lines are the raw hunk lines starting with ' ', '+', or '-'. 
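+
+    Illustrative example (assumed values): a hunk header line such as
+    "@@ -10,2 +10,3 @@" is parsed into {"old_start": 10, "old_count": 2,
+    "new_start": 10, "new_count": 3, "lines": [...]}; a count omitted in the
+    header defaults to 1.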
+ """ + if not diff_text: + return [] + + lines = diff_text.splitlines() + hunks = [] + collecting_for_file = False + current_hunk = None + current_file_path = None + + # Normalize target path to compare by suffix + target_suffix = target_rel_path.strip("/") + + for line in lines: + if line.startswith("diff --git "): + # finalize any open hunk + if current_hunk is not None and collecting_for_file: + hunks.append(current_hunk) + current_hunk = None + collecting_for_file = False + current_file_path = None + continue + + if line.startswith("+++ "): + path = line[4:].strip() + # Expected formats: 'b/path/to/file' or '/dev/null' + if path == "/dev/null": + current_file_path = None + collecting_for_file = False + else: + # strip the leading 'a/' or 'b/' + if path.startswith("a/") or path.startswith("b/"): + path_clean = path[2:] + else: + path_clean = path + current_file_path = path_clean + collecting_for_file = path_clean.endswith(target_suffix) + continue + + if not collecting_for_file: + continue + + # Within the target file section, parse hunks + if line.startswith("@@ "): + # finalize previous hunk + if current_hunk is not None: + hunks.append(current_hunk) + + m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@", line) + if not m: + continue + old_start = int(m.group(1)) + old_count = int(m.group(2)) if m.group(2) else 1 + new_start = int(m.group(3)) + new_count = int(m.group(4)) if m.group(4) else 1 + + current_hunk = { + "old_start": old_start, + "old_count": old_count, + "new_start": new_start, + "new_count": new_count, + "lines": [], + } + continue + + # Collect hunk body lines + if current_hunk is not None and (line.startswith(" ") or line.startswith("+") or line.startswith("-")): + current_hunk["lines"].append(line) + + # finalize last hunk if any + if current_hunk is not None and collecting_for_file: + hunks.append(current_hunk) + + return hunks + +def apply_hunks_by_line_numbers(target_file, hunks, earlier_commit, latest_commit): + """Apply unified-diff hunks to target file strictly by old line numbers. + + Only change the lines marked as deletions ('-') and additions ('+'). + Context lines (' ') are used for positioning but are left untouched in the target. 
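+
+    Returns a (success, modified_lines) tuple. modified_lines maps the 1-based line
+    numbers of newly inserted lines in the updated target file to their English text,
+    for example (assumed values): {12: "- [New Page](/tidb-cloud/new-page.md)"}.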
+ """ + try: + content = read_file_from_repo(target_file) + if not content: + return False, {} + lines = content.splitlines() + + modified = list(lines) + line_offset_delta = 0 + modified_lines = {} + + for hunk_index, hunk in enumerate(hunks): + cursor = hunk["old_start"] - 1 + line_offset_delta + + if cursor < 0: + print(f"Hunk {hunk_index+1}: start cursor {cursor} adjusted to 0") + cursor = 0 + if cursor > len(modified): + print(f"Hunk {hunk_index+1}: start cursor {cursor} beyond EOF {len(modified)}; clamping to EOF") + cursor = len(modified) + + #print(f"Applying hunk {hunk_index+1} at approx line {cursor+1}") + + for raw in hunk["lines"]: + if not raw: + continue + marker = raw[0] + text = raw[1:] + + if marker == ' ': # context: advance cursor, keep original content + cursor += 1 + elif marker == '-': # deletion: remove line at cursor + if cursor < len(modified): + deleted = modified.pop(cursor) + line_offset_delta -= 1 + else: + print(f"Hunk {hunk_index+1}: deletion cursor {cursor} at/after EOF; skipping deletion") + elif marker == '+': # addition: insert line at cursor + modified.insert(cursor, text) + modified_lines[cursor+1] = text + #print(f"Inserted line at line {cursor+1}: {text}") + cursor += 1 + line_offset_delta += 1 + else: + # Unknown marker; ignore + pass + + # replace the earlier commit with the latest commit + for i, line in enumerate(modified): + if "EN commit:" in line and earlier_commit in line: + modified[i] = line.replace(earlier_commit, latest_commit) + break + modified_content = "\n".join(modified) + "\n" + + success = write_file_to_repo(target_file, modified_content) + if not success: + return False, {} + + logger.info(f"Successfully applied {len(hunks)} hunks to {target_file}") + return True, modified_lines + except Exception as e: + logger.error(f"Error applying hunks: {e}") + return False, {} + +def sync_toc_files_using_github_compare(commit1, commit2, source_file, target_file): + """Sync by fetching compare diff from GitHub and applying hunks by line numbers.""" + logger.info(f"Fetching GitHub compare diff between {commit1} and {commit2}...") + diff_text = get_github_compare_diff(commit1, commit2) + if not diff_text: + logger.warning("No diff content retrieved from GitHub") + return False, {} + + logger.info("Parsing diff for target file hunks...") + hunks = parse_github_diff_for_file(diff_text, source_file) + if not hunks: + logger.info(f"No hunks found for file: {source_file}") + return False, {} + + logger.info(f"Found {len(hunks)} hunks for {source_file}. 
Applying to {target_file} by line numbers...") + sync_status, modified_lines = apply_hunks_by_line_numbers(target_file, hunks, commit1, commit2) + return sync_status, modified_lines + +def create_bilingual_comparison(target_toc_file): + """Create bilingual comparison list from TOC files""" + bilingual_list = [] + + # Read both files + zh_content = read_file_from_repo(target_toc_file) + en_content = read_file_from_repo(TEMP_TOC_FILENAME) + + if not zh_content or not en_content: + return [] + + zh_lines = zh_content.splitlines(True) + en_lines = en_content.splitlines(True) + + # Process from line 4 onwards (index 3) + start_line = TOC_HEADER_LINE_COUNT + + # Ensure both files have the same number of lines + min_lines = min(len(zh_lines), len(en_lines)) + + logger.info(f"Processing {min_lines - start_line} lines starting from line {start_line + 1}") + + for i in range(start_line, min_lines): + zh_line = zh_lines[i].rstrip('\n\r') + en_line = en_lines[i].rstrip('\n\r') + + # Skip empty lines + if not zh_line.strip() and not en_line.strip(): + continue + + # Clean the lines consistently using the same pattern as replace function + zh_toc_pattern = re.match(r'^\s*-\s', zh_line) + en_toc_pattern = re.match(r'^\s*-\s', en_line) + + zh_cleaned = zh_line[zh_toc_pattern.end():].rstrip() if zh_toc_pattern else zh_line.rstrip() + en_cleaned = en_line[en_toc_pattern.end():].rstrip() if en_toc_pattern else en_line.rstrip() + + # Only add non-empty cleaned lines + if zh_cleaned.strip() and en_cleaned.strip(): + bilingual_list.append([zh_cleaned, en_cleaned, i + 1]) + logger.debug(f"Bilingual items: Line {i + 1}: '{en_cleaned}' -> '{zh_cleaned}'") + + logger.info(f"Created bilingual list with {len(bilingual_list)} entries") + return bilingual_list + +def replace_content_with_translation(bilingual_list, modified_lines, target_toc_file): + """Replace English content with existing Chinese translations, return unmatched lines""" + # Read the target file + content = read_file_from_repo(target_toc_file) + if not content: + return modified_lines + target_lines = content.splitlines(True) + + # Optimize lookup by creating a dictionary for O(1) lookups + bilingual_map = {en_text: zh_text for zh_text, en_text, _ in bilingual_list} + + replaced_count = 0 + matched_lines = set() + + logger.info(f"Found {len(modified_lines)} modified lines to process.") + logger.debug(f"Modified lines: {list(modified_lines.keys())}") + + # Process each modified line + for line_number in modified_lines.keys(): + line_index = line_number - 1 # Convert to 0-based + + if 0 <= line_index < len(target_lines): + line_content = target_lines[line_index].rstrip('\n\r') + + # Clean the line content for matching + toc_pattern = re.match(r'^\s*-\s', line_content) + if toc_pattern: + prefix = toc_pattern.group(0) + cleaned_content = line_content[toc_pattern.end():].rstrip() + else: + prefix = '' + cleaned_content = line_content.rstrip() + + # Try to find exact match in bilingual map (O(1) lookup) + if cleaned_content in bilingual_map: + # Found match! 
Replace with Chinese translation + zh_text = bilingual_map[cleaned_content] + new_line = prefix + zh_text + target_lines[line_index] = new_line + '\n' + replaced_count += 1 + matched_lines.add(line_number) + logger.debug(f"Matched line {line_number}: '{cleaned_content}' -> '{zh_text}'") + + # Write back the updated content + if replaced_count > 0: + updated_content = ''.join(target_lines) + write_file_to_repo(target_toc_file, updated_content) + logger.info(f"Applied {replaced_count} existing translations.") + + # Return unmatched lines for AI translation + unmatched_lines = {k: v for k, v in modified_lines.items() if k not in matched_lines} + logger.info(f"Lines needing AI translation: {len(unmatched_lines)}") + + return unmatched_lines + +def translate_content(modified_lines, target_file): + """Translate English content to Chinese using Gemini API with JSON format""" + if not modified_lines: + logger.info("No content to translate.") + return {} + + logger.info(f"Translating {len(modified_lines)} lines using Gemini API...") + + # Read the target file to get original formatted lines + content = read_file_from_repo(target_file) + if not content: + return {} + target_lines = content.splitlines(True) + + # Create JSON input with original formatted lines + translation_json = {} + for line_num in modified_lines.keys(): + line_index = line_num - 1 + if 0 <= line_index < len(target_lines): + original_line = target_lines[line_index] + translation_json[str(line_num)] = original_line + + if not translation_json: + logger.warning("No valid content to translate after processing.") + return {} + + # Create JSON string for the prompt + json_input = json.dumps(translation_json, ensure_ascii=False, indent=2) + logger.debug(f"Translation JSON input: {json_input}") + + # Create translation prompt + prompt = f"""Please translate the following TOC (Table of Contents) entries from English to Chinese. +These are navigation items for TiDB Cloud documentation with original formatting. + +IMPORTANT: +1. Return the result in the EXACT SAME JSON format with the same keys (line numbers) +2. Keep ALL original formatting: indentation, spaces, dashes, brackets, etc. +3. Only translate the English text content to Chinese, preserve everything else exactly +4. Maintain technical terms appropriately (like "TiDB Cloud", "HTAP", "CLI", etc.) 
+ +Input JSON: +{json_input} + +Return only the JSON with Chinese translations that preserve all original formatting.""" + + try: + logger.info("Sending translation request to Gemini API...") + response = client.models.generate_content( + model=MODEL_NAME, contents=prompt + ) + + if response.text: + # Extract JSON from response + response_text = response.text.strip() + logger.debug(f"Translation JSON response: {response_text}") + + # Try to find and parse JSON from the response + try: + # Use regex to find JSON block more robustly + json_text = response_text + match = re.search(r"```json\s*([\s\S]*?)\s*```", response_text) + if match: + json_text = match.group(1).strip() + elif '```' in response_text: + start = response_text.find('```') + 3 + end = response_text.find('```', start) + json_text = response_text[start:end].strip() + + # Parse the JSON + translated_json = json.loads(json_text) + + # Convert back to integer keys and return + zh_modified_lines = {} + for line_num_str, translated_text in translated_json.items(): + line_num = int(line_num_str) + zh_modified_lines[line_num] = translated_text + original_text = modified_lines.get(line_num, "") + logger.debug(f"Line {line_num}: '{original_text}' -> '{translated_text}'") + + logger.info(f"Translation completed. Processed {len(zh_modified_lines)} lines.") + return zh_modified_lines + + except (json.JSONDecodeError, ValueError) as e: + logger.error(f"Error parsing JSON response: {e}") + logger.error(f"Response was: {response_text}") + # Fallback: return empty dict to prevent writing untranslated content + return {} + else: + logger.error("Empty response from Gemini API") + return {} + + except Exception as e: + logger.error(f"Error during translation: {e}") + # Fallback: return empty dict to prevent writing untranslated content + return {} + +def update_toc_file(zh_modified_lines, target_file): + """Apply translated content to specific lines in the target TOC file""" + if not zh_modified_lines: + logger.info("No translated content to apply.") + return + + logger.info(f"Applying {len(zh_modified_lines)} translated lines to {target_file}...") + + try: + # Read the target file + content = read_file_from_repo(target_file) + if not content: + logger.error(f"Could not read target file {target_file}") + return + target_lines = content.splitlines(True) + + # Apply translations to specific lines + applied_count = 0 + for line_num, translated_content in zh_modified_lines.items(): + # Convert to 0-based index + line_index = line_num - 1 + + if 0 <= line_index < len(target_lines): + # AI has already provided the complete formatted line, use it directly + target_lines[line_index] = translated_content + applied_count += 1 + else: + logger.warning(f"Line number {line_num} is out of range (file has {len(target_lines)} lines)") + + # Write the updated content back to the file + updated_content = ''.join(target_lines) + write_file_to_repo(target_file, updated_content) + + logger.info(f"Successfully applied {applied_count} translations to {target_file}") + + except Exception as e: + logger.error(f"Error updating TOC file: {e}") + raise + +def cleanup_temp_files(): + """Clean up temporary files""" + try: + if os.path.exists(TEMP_TOC_FILENAME): + os.remove(TEMP_TOC_FILENAME) + logger.info(f"Cleaned up temporary file: {TEMP_TOC_FILENAME}") + except Exception as e: + logger.warning(f"Could not clean up temporary files: {e}") + +def process_toc_file(toc_file_name): + """Process a single TOC file for synchronization""" + target_toc_file = toc_file_name + + 
logger.info("-" * 50) + logger.info(f"Processing {toc_file_name}...") + + logger.info("Extracting EN commit SHA from target file...") + earlier_commit = extract_commit_from_target_file(target_toc_file) + + logger.info("Fetching latest commit SHA for TOC file...") + latest_commit = get_latest_commit_sha(REPO_OWNER, REPO_NAME, EN_BRANCH, toc_file_name) + + # If earlier_commit is different from latest_commit, sync the TOC file. + if earlier_commit and latest_commit and earlier_commit != latest_commit: + # Download the EN TOC content from the earlier commit for comparison + en_toc_path = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{earlier_commit}/{toc_file_name}" + logger.info(f"Downloading EN TOC content from: {en_toc_path}") + en_toc_content = urlopen(en_toc_path).read().decode("utf-8") + + # Write en_toc_content to a file for bilingual comparison + write_file_to_repo(TEMP_TOC_FILENAME, en_toc_content) + + logger.info("Creating bilingual comparison...") + bilingual_list = create_bilingual_comparison(target_toc_file) + + logger.info("Running TOC sync using GitHub compare diff...") + sync_status, modified_lines = sync_toc_files_using_github_compare( + earlier_commit, + latest_commit, + toc_file_name, + target_toc_file, + ) + + if sync_status: + logger.info("TOC file sync completed successfully!") + + # Match with existing bilingual translations + unmatched_lines = replace_content_with_translation(bilingual_list, modified_lines, target_toc_file) + + # Use AI to translate remaining unmatched lines + if unmatched_lines: + logger.info(f"Using AI to translate {len(unmatched_lines)} unmatched lines...") + zh_modified_lines = translate_content(unmatched_lines, target_toc_file) + update_toc_file(zh_modified_lines, target_toc_file) + logger.info("AI translations have been applied successfully!") + else: + logger.info("All lines were matched with existing translations. No AI translation needed.") + else: + logger.error("TOC file sync failed!") + else: + if earlier_commit == latest_commit: + logger.info(f"Earlier commit is the same as latest commit. No sync needed for {toc_file_name}.") + else: + logger.warning(f"Skipping sync for {toc_file_name} due to missing commit information. 
Check logs for errors.") + +if __name__ == "__main__": + logger.info("Starting TOC synchronization process...") + + for toc_file_name in TOC_FILE_NAMES: + process_toc_file(toc_file_name) + + # Clean up temporary files + cleanup_temp_files() + logger.info("Script execution completed.") From 5b0f5a461d35a2e1fa87db4d774e2f4cd7b80b6e Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:40:04 +0800 Subject: [PATCH 02/18] Add files via upload --- .github/workflows/sync-cloud-zh-toc.yml | 119 ++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 .github/workflows/sync-cloud-zh-toc.yml diff --git a/.github/workflows/sync-cloud-zh-toc.yml b/.github/workflows/sync-cloud-zh-toc.yml new file mode 100644 index 0000000000000..688852d2a4220 --- /dev/null +++ b/.github/workflows/sync-cloud-zh-toc.yml @@ -0,0 +1,119 @@ +name: Sync Cloud ZH TOC Files + +on: + workflow_dispatch: + +permissions: + contents: write + pull-requests: write + +env: + EN_BRANCH: release-8.5 + ZH_BRANCH: i18n-zh-release-8.5 + +jobs: + sync-toc: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + name: Checkout ZH branch + with: + ref: "${{ env.ZH_BRANCH }}" + path: "docs" + token: ${{ github.token }} + + - name: Copy script from main branch (temporary) + run: | + mkdir -p temp_scripts + curl -H "Authorization: Bearer ${{ github.token }}" \ + -H "Accept: application/vnd.github.v3.raw" \ + -L "https://api.github.com/repos/${{ github.repository }}/contents/scripts/sync-en-cloud-toc-changes-to-zh.py" \ + -o temp_scripts/sync-en-cloud-toc-changes-to-zh.py + + - uses: actions/setup-python@v5 + name: Setup Python 3.11 + with: + python-version: '3.11' + + - name: Install Python dependencies + run: | + pip install "google-genai>=0.3,<1" + + - name: Configure Git + run: | + cd docs + git config user.name github-actions + git config user.email github-actions@github.com + + - name: Run TOC sync script + env: + GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }} + GITHUB_TOKEN: ${{ github.token }} + run: | + cp temp_scripts/sync-en-cloud-toc-changes-to-zh.py docs/ + cd docs + python sync-en-cloud-toc-changes-to-zh.py + rm sync-en-cloud-toc-changes-to-zh.py # Remove the script file so it won't be included in PR + + - name: Clean up temporary files + run: | + rm -rf temp_scripts + + - name: Check for changes + id: check_changes + run: | + cd docs + if git diff --quiet; then + echo "changes=false" >> $GITHUB_OUTPUT + echo "No changes detected" + else + echo "changes=true" >> $GITHUB_OUTPUT + echo "Changes detected" + fi + + - name: Set build ID + id: build_id + if: steps.check_changes.outputs.changes == 'true' + run: echo "id=$(date +'%Y%m%d')-$(date +%s)" >> $GITHUB_OUTPUT + + - name: Create PR + if: steps.check_changes.outputs.changes == 'true' + uses: peter-evans/create-pull-request@v7 + with: + path: "docs" + token: ${{ github.token }} + branch: sync-zh-toc-${{ steps.build_id.outputs.id }} + base: ${{ env.ZH_BRANCH }} + title: "i18n-zh-release-8.5: sync ZH TOC changes ${{ steps.build_id.outputs.id }}" + body: | + ### What is changed, added or deleted? (Required) + + Sync Chinese Cloud TOC files based on English TOC updates from ${{ env.EN_BRANCH }} branch. + + ### Which TiDB version(s) do your changes apply to? (Required) + + + + **Tips for choosing the affected version(s):** + + By default, **CHOOSE MASTER ONLY** so your changes will be applied to the next TiDB major or minor releases. 
If your PR involves a product feature behavior change or a compatibility change, **CHOOSE THE AFFECTED RELEASE BRANCH(ES) AND MASTER**. + + For details, see [tips for choosing the affected versions](https://github.com/pingcap/docs/blob/master/CONTRIBUTING.md#guideline-for-choosing-the-affected-versions). + + - [x] ${{ env.ZH_BRANCH }} + + ### What is the related PR or file link(s)? + + + + - This PR syncs TOC changes from: ${{ env.EN_BRANCH }} + - Other reference link(s): + + ### Do your changes match any of the following descriptions? + + - [ ] Delete files + - [ ] Change aliases + - [ ] Need modification after applied to another branch + - [ ] Might cause conflicts after applied to another branch + delete-branch: true \ No newline at end of file From fb28ac3306d38f20ebf17552feb6dcedf1bae75f Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:40:13 +0800 Subject: [PATCH 03/18] Add files via upload From a38b9ec7a592b2fa954f50d538a84645c0a41775 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:51:20 +0800 Subject: [PATCH 04/18] Update sync-cloud-zh-toc.yml --- .github/workflows/sync-cloud-zh-toc.yml | 35 ++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/.github/workflows/sync-cloud-zh-toc.yml b/.github/workflows/sync-cloud-zh-toc.yml index 688852d2a4220..e9772a769b2c2 100644 --- a/.github/workflows/sync-cloud-zh-toc.yml +++ b/.github/workflows/sync-cloud-zh-toc.yml @@ -38,7 +38,11 @@ jobs: - name: Install Python dependencies run: | + echo "Installing Python dependencies..." pip install "google-genai>=0.3,<1" + echo "Verifying installed packages..." + pip list | grep google + echo "Dependencies installed successfully" - name: Configure Git run: | @@ -51,10 +55,35 @@ jobs: GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }} GITHUB_TOKEN: ${{ github.token }} run: | + echo "Copying script to docs directory..." cp temp_scripts/sync-en-cloud-toc-changes-to-zh.py docs/ + echo "Script copied successfully" + + echo "Entering docs directory..." cd docs - python sync-en-cloud-toc-changes-to-zh.py - rm sync-en-cloud-toc-changes-to-zh.py # Remove the script file so it won't be included in PR + + echo "Checking Python version..." + python --version + python3 --version + + echo "Checking script permissions and existence..." + ls -la sync-en-cloud-toc-changes-to-zh.py + + echo "Checking environment variables..." + echo "GEMINI_API_TOKEN is set: $([ -n "$GEMINI_API_TOKEN" ] && echo 'Yes' || echo 'No')" + echo "GITHUB_TOKEN is set: $([ -n "$GITHUB_TOKEN" ] && echo 'Yes' || echo 'No')" + + echo "Starting TOC sync script execution..." + python3 sync-en-cloud-toc-changes-to-zh.py || { + echo "Script execution failed with exit code $?" + echo "Checking for any error logs..." + ls -la + exit 1 + } + + echo "Script execution completed, cleaning up..." 
+ rm sync-en-cloud-toc-changes-to-zh.py + echo "Cleanup completed" - name: Clean up temporary files run: | @@ -116,4 +145,4 @@ jobs: - [ ] Change aliases - [ ] Need modification after applied to another branch - [ ] Might cause conflicts after applied to another branch - delete-branch: true \ No newline at end of file + delete-branch: true From 07ca99b675e140cdc17dcdd52426c82bc983ae3b Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:55:56 +0800 Subject: [PATCH 05/18] Add files via upload --- scripts/sync-en-cloud-toc-changes-to-zh.py | 616 +++++++++++++++++++++ 1 file changed, 616 insertions(+) create mode 100644 scripts/sync-en-cloud-toc-changes-to-zh.py diff --git a/scripts/sync-en-cloud-toc-changes-to-zh.py b/scripts/sync-en-cloud-toc-changes-to-zh.py new file mode 100644 index 0000000000000..da2c16b10da20 --- /dev/null +++ b/scripts/sync-en-cloud-toc-changes-to-zh.py @@ -0,0 +1,616 @@ +# This script is used to sync the changes from the English TOC files to the Chinese TOC files. Detailed steps are as follows: +# 1. The script automatically gets the latest commit of the English TOC file from GitHub and the earlier commit of the English TOC file from the Chinese TOC file in the same repository. +# 2. It compares two English commits and performs the following operations: +# - If the commit numbers are the same, skip the update for that TOC file. +# - If the commit numbers are different, update the Chinese TOC with the following operations: +# a. Updates the Chinese TOC according to the English diff. +# b. Generates bilingual terms based on the old version of the Chinese and English TOC files. +# c. Update the modified English lines in the Chinese TOC with Chinese based on the bilingual terms. +# d. Translate the remaining English in the Chinese TOC using AI. 
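+#
+# Illustrative sketch of the data involved (example values only, not taken from real files):
+# - Each Chinese TOC file records the English commit it was last synced from in a comment
+#   line near the top of the file that contains "EN commit: <40-character SHA>".
+# - The bilingual terms built in step (b) map cleaned English TOC entries to their existing
+#   Chinese counterparts, conceptually:
+#     {"[Overview](/tidb-cloud/overview.md)": "[概述](/tidb-cloud/overview.md)"}
+# - Step (c) replaces newly added English lines that hit this mapping; step (d) sends the
+#   remaining lines to the Gemini API for translation.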
+ +import re +import os +import sys +import json +import logging +from urllib.request import urlopen, Request +from urllib.error import URLError, HTTPError +from google import genai + +REPO_OWNER = "qiancai" +REPO_NAME = "docs" +EN_BRANCH = "release-8.5" +ZH_BRANCH = "i18n-zh-release-8.5" +TOC_FILE_NAMES = ["TOC-tidb-cloud-starter.md", "TOC-tidb-cloud-essential.md", "TOC-tidb-cloud.md"] +TOC_HEADER_LINE_COUNT = 3 # The Starting line to create bilingual terms +TEMP_TOC_FILENAME = "en_cloud_toc.md" # The filename of the temporary English TOC content + + +# ========== Logging Configuration ========== +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +# ========== AI Configuration ========== +MODEL_NAME = "gemini-2.0-flash" +genai_token = os.getenv("GEMINI_API_TOKEN") +if not genai_token: + logger.error("GEMINI_API_TOKEN environment variable must be set") + sys.exit(1) + +client = genai.Client(api_key=genai_token) + +def read_file_from_repo(file_path): + """Read a file from the current repository""" + try: + with open(file_path, 'r', encoding='utf-8') as f: + return f.read() + except IOError as e: + logger.error(f"Error reading file {file_path}: {e}") + return None + +def write_file_to_repo(file_path, content): + """Write content to a file in the current repository""" + try: + with open(file_path, 'w', encoding='utf-8') as f: + f.write(content) + return True + except IOError as e: + logger.error(f"Error writing file {file_path}: {e}") + return False + +def extract_commit_from_target_file(target_file): + """Extract the EN commit SHA from the target TOC file comment""" + try: + content = read_file_from_repo(target_file) + if not content: + return None + + lines = content.split('\n') + for i, line in enumerate(lines): + if i > 10: # Only check first 10 lines + break + + # Look for the pattern: + if "EN commit:" in line: + # Extract commit SHA using regex + match = re.search(r'EN commit:\s*([a-f0-9]{40})', line) + if match: + commit_sha = match.group(1) + logger.info(f"Found earlier EN commit in target file: {commit_sha}") + return commit_sha + + logger.error("No EN commit comment found in target file") + return None + + except Exception as e: + logger.error(f"Error reading target file for commit extraction: {e}") + return None + +def get_latest_commit_sha(repo_owner, repo_name, branch, toc_file_name): + """Get the latest commit SHA for a specific file on GitHub""" + try: + # Use GitHub API to get commits for the specific file + url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/commits" + params = f"?sha={branch}&path={toc_file_name}&per_page=1" + full_url = url + params + headers = { + "User-Agent": "tidb-docs-sync/1.0", + "Accept": "application/vnd.github.v3+json", + } + gh_token = os.getenv("GITHUB_TOKEN") + if gh_token: + headers["Authorization"] = f"Bearer {gh_token}" + req = Request(full_url, headers=headers) + + with urlopen(req) as resp: + data = json.loads(resp.read().decode('utf-8')) + + if data and len(data) > 0: + latest_commit = data[0]['sha'] + logger.info(f"Latest commit: {latest_commit}") + return latest_commit + else: + logger.warning("No commits found for the specified file") + return None + + except (URLError, HTTPError, json.JSONDecodeError) as e: + logger.error(f"Error fetching latest commit: {e}") + return None + +def get_github_compare_diff(base_commit, head_commit): + """Fetch unified diff from GitHub compare endpoint (.diff) for the repo {REPO_OWNER}/{REPO_NAME}""" + try: + url = 
f"https://github.com/{REPO_OWNER}/{REPO_NAME}/compare/{base_commit}...{head_commit}.diff" + logger.info(f"Fetching compare diff from: {url}") + headers = { + "User-Agent": "tidb-docs-sync/1.0", + "Accept": "application/vnd.github.v3.diff", + } + gh_token = os.getenv("GITHUB_TOKEN") + if gh_token: + headers["Authorization"] = f"Bearer {gh_token}" + req = Request(url, headers=headers) + with urlopen(req, timeout=20) as resp: + content_bytes = resp.read() + # GitHub serves UTF-8 + return content_bytes.decode("utf-8", errors="replace") + except (URLError, HTTPError) as e: + logger.error(f"Error fetching GitHub compare diff: {e}") + return None + +def parse_github_diff_for_file(diff_text, target_rel_path): + """Parse the multi-file unified diff and return hunks for the specified file. + + Returns a list of hunks: {old_start, old_count, new_start, new_count, lines} + where lines are the raw hunk lines starting with ' ', '+', or '-'. + """ + if not diff_text: + return [] + + lines = diff_text.splitlines() + hunks = [] + collecting_for_file = False + current_hunk = None + current_file_path = None + + # Normalize target path to compare by suffix + target_suffix = target_rel_path.strip("/") + + for line in lines: + if line.startswith("diff --git "): + # finalize any open hunk + if current_hunk is not None and collecting_for_file: + hunks.append(current_hunk) + current_hunk = None + collecting_for_file = False + current_file_path = None + continue + + if line.startswith("+++ "): + path = line[4:].strip() + # Expected formats: 'b/path/to/file' or '/dev/null' + if path == "/dev/null": + current_file_path = None + collecting_for_file = False + else: + # strip the leading 'a/' or 'b/' + if path.startswith("a/") or path.startswith("b/"): + path_clean = path[2:] + else: + path_clean = path + current_file_path = path_clean + collecting_for_file = path_clean.endswith(target_suffix) + continue + + if not collecting_for_file: + continue + + # Within the target file section, parse hunks + if line.startswith("@@ "): + # finalize previous hunk + if current_hunk is not None: + hunks.append(current_hunk) + + m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@", line) + if not m: + continue + old_start = int(m.group(1)) + old_count = int(m.group(2)) if m.group(2) else 1 + new_start = int(m.group(3)) + new_count = int(m.group(4)) if m.group(4) else 1 + + current_hunk = { + "old_start": old_start, + "old_count": old_count, + "new_start": new_start, + "new_count": new_count, + "lines": [], + } + continue + + # Collect hunk body lines + if current_hunk is not None and (line.startswith(" ") or line.startswith("+") or line.startswith("-")): + current_hunk["lines"].append(line) + + # finalize last hunk if any + if current_hunk is not None and collecting_for_file: + hunks.append(current_hunk) + + return hunks + +def apply_hunks_by_line_numbers(target_file, hunks, earlier_commit, latest_commit): + """Apply unified-diff hunks to target file strictly by old line numbers. + + Only change the lines marked as deletions ('-') and additions ('+'). + Context lines (' ') are used for positioning but are left untouched in the target. 
+ """ + try: + content = read_file_from_repo(target_file) + if not content: + return False, {} + lines = content.splitlines() + + modified = list(lines) + line_offset_delta = 0 + modified_lines = {} + + for hunk_index, hunk in enumerate(hunks): + cursor = hunk["old_start"] - 1 + line_offset_delta + + if cursor < 0: + print(f"Hunk {hunk_index+1}: start cursor {cursor} adjusted to 0") + cursor = 0 + if cursor > len(modified): + print(f"Hunk {hunk_index+1}: start cursor {cursor} beyond EOF {len(modified)}; clamping to EOF") + cursor = len(modified) + + #print(f"Applying hunk {hunk_index+1} at approx line {cursor+1}") + + for raw in hunk["lines"]: + if not raw: + continue + marker = raw[0] + text = raw[1:] + + if marker == ' ': # context: advance cursor, keep original content + cursor += 1 + elif marker == '-': # deletion: remove line at cursor + if cursor < len(modified): + deleted = modified.pop(cursor) + line_offset_delta -= 1 + else: + print(f"Hunk {hunk_index+1}: deletion cursor {cursor} at/after EOF; skipping deletion") + elif marker == '+': # addition: insert line at cursor + modified.insert(cursor, text) + modified_lines[cursor+1] = text + #print(f"Inserted line at line {cursor+1}: {text}") + cursor += 1 + line_offset_delta += 1 + else: + # Unknown marker; ignore + pass + + # replace the earlier commit with the latest commit + for i, line in enumerate(modified): + if "EN commit:" in line and earlier_commit in line: + modified[i] = line.replace(earlier_commit, latest_commit) + break + modified_content = "\n".join(modified) + "\n" + + success = write_file_to_repo(target_file, modified_content) + if not success: + return False, {} + + logger.info(f"Successfully applied {len(hunks)} hunks to {target_file}") + return True, modified_lines + except Exception as e: + logger.error(f"Error applying hunks: {e}") + return False, {} + +def sync_toc_files_using_github_compare(commit1, commit2, source_file, target_file): + """Sync by fetching compare diff from GitHub and applying hunks by line numbers.""" + logger.info(f"Fetching GitHub compare diff between {commit1} and {commit2}...") + diff_text = get_github_compare_diff(commit1, commit2) + if not diff_text: + logger.warning("No diff content retrieved from GitHub") + return False, {} + + logger.info("Parsing diff for target file hunks...") + hunks = parse_github_diff_for_file(diff_text, source_file) + if not hunks: + logger.info(f"No hunks found for file: {source_file}") + return False, {} + + logger.info(f"Found {len(hunks)} hunks for {source_file}. 
Applying to {target_file} by line numbers...") + sync_status, modified_lines = apply_hunks_by_line_numbers(target_file, hunks, commit1, commit2) + return sync_status, modified_lines + +def create_bilingual_comparison(target_toc_file): + """Create bilingual comparison list from TOC files""" + bilingual_list = [] + + # Read both files + zh_content = read_file_from_repo(target_toc_file) + en_content = read_file_from_repo(TEMP_TOC_FILENAME) + + if not zh_content or not en_content: + return [] + + zh_lines = zh_content.splitlines(True) + en_lines = en_content.splitlines(True) + + # Process from line 4 onwards (index 3) + start_line = TOC_HEADER_LINE_COUNT + + # Ensure both files have the same number of lines + min_lines = min(len(zh_lines), len(en_lines)) + + logger.info(f"Processing {min_lines - start_line} lines starting from line {start_line + 1}") + + for i in range(start_line, min_lines): + zh_line = zh_lines[i].rstrip('\n\r') + en_line = en_lines[i].rstrip('\n\r') + + # Skip empty lines + if not zh_line.strip() and not en_line.strip(): + continue + + # Clean the lines consistently using the same pattern as replace function + zh_toc_pattern = re.match(r'^\s*-\s', zh_line) + en_toc_pattern = re.match(r'^\s*-\s', en_line) + + zh_cleaned = zh_line[zh_toc_pattern.end():].rstrip() if zh_toc_pattern else zh_line.rstrip() + en_cleaned = en_line[en_toc_pattern.end():].rstrip() if en_toc_pattern else en_line.rstrip() + + # Only add non-empty cleaned lines + if zh_cleaned.strip() and en_cleaned.strip(): + bilingual_list.append([zh_cleaned, en_cleaned, i + 1]) + logger.debug(f"Bilingual items: Line {i + 1}: '{en_cleaned}' -> '{zh_cleaned}'") + + logger.info(f"Created bilingual list with {len(bilingual_list)} entries") + return bilingual_list + +def replace_content_with_translation(bilingual_list, modified_lines, target_toc_file): + """Replace English content with existing Chinese translations, return unmatched lines""" + # Read the target file + content = read_file_from_repo(target_toc_file) + if not content: + return modified_lines + target_lines = content.splitlines(True) + + # Optimize lookup by creating a dictionary for O(1) lookups + bilingual_map = {en_text: zh_text for zh_text, en_text, _ in bilingual_list} + + replaced_count = 0 + matched_lines = set() + + logger.info(f"Found {len(modified_lines)} modified lines to process.") + logger.debug(f"Modified lines: {list(modified_lines.keys())}") + + # Process each modified line + for line_number in modified_lines.keys(): + line_index = line_number - 1 # Convert to 0-based + + if 0 <= line_index < len(target_lines): + line_content = target_lines[line_index].rstrip('\n\r') + + # Clean the line content for matching + toc_pattern = re.match(r'^\s*-\s', line_content) + if toc_pattern: + prefix = toc_pattern.group(0) + cleaned_content = line_content[toc_pattern.end():].rstrip() + else: + prefix = '' + cleaned_content = line_content.rstrip() + + # Try to find exact match in bilingual map (O(1) lookup) + if cleaned_content in bilingual_map: + # Found match! 
Replace with Chinese translation + zh_text = bilingual_map[cleaned_content] + new_line = prefix + zh_text + target_lines[line_index] = new_line + '\n' + replaced_count += 1 + matched_lines.add(line_number) + logger.debug(f"Matched line {line_number}: '{cleaned_content}' -> '{zh_text}'") + + # Write back the updated content + if replaced_count > 0: + updated_content = ''.join(target_lines) + write_file_to_repo(target_toc_file, updated_content) + logger.info(f"Applied {replaced_count} existing translations.") + + # Return unmatched lines for AI translation + unmatched_lines = {k: v for k, v in modified_lines.items() if k not in matched_lines} + logger.info(f"Lines needing AI translation: {len(unmatched_lines)}") + + return unmatched_lines + +def translate_content(modified_lines, target_file): + """Translate English content to Chinese using Gemini API with JSON format""" + if not modified_lines: + logger.info("No content to translate.") + return {} + + logger.info(f"Translating {len(modified_lines)} lines using Gemini API...") + + # Read the target file to get original formatted lines + content = read_file_from_repo(target_file) + if not content: + return {} + target_lines = content.splitlines(True) + + # Create JSON input with original formatted lines + translation_json = {} + for line_num in modified_lines.keys(): + line_index = line_num - 1 + if 0 <= line_index < len(target_lines): + original_line = target_lines[line_index] + translation_json[str(line_num)] = original_line + + if not translation_json: + logger.warning("No valid content to translate after processing.") + return {} + + # Create JSON string for the prompt + json_input = json.dumps(translation_json, ensure_ascii=False, indent=2) + logger.debug(f"Translation JSON input: {json_input}") + + # Create translation prompt + prompt = f"""Please translate the following TOC (Table of Contents) entries from English to Chinese. +These are navigation items for TiDB Cloud documentation with original formatting. + +IMPORTANT: +1. Return the result in the EXACT SAME JSON format with the same keys (line numbers) +2. Keep ALL original formatting: indentation, spaces, dashes, brackets, etc. +3. Only translate the English text content to Chinese, preserve everything else exactly +4. Maintain technical terms appropriately (like "TiDB Cloud", "HTAP", "CLI", etc.) 
+ +Input JSON: +{json_input} + +Return only the JSON with Chinese translations that preserve all original formatting.""" + + try: + logger.info("Sending translation request to Gemini API...") + response = client.models.generate_content( + model=MODEL_NAME, contents=prompt + ) + + if response.text: + # Extract JSON from response + response_text = response.text.strip() + logger.debug(f"Translation JSON response: {response_text}") + + # Try to find and parse JSON from the response + try: + # Use regex to find JSON block more robustly + json_text = response_text + match = re.search(r"```json\s*([\s\S]*?)\s*```", response_text) + if match: + json_text = match.group(1).strip() + elif '```' in response_text: + start = response_text.find('```') + 3 + end = response_text.find('```', start) + json_text = response_text[start:end].strip() + + # Parse the JSON + translated_json = json.loads(json_text) + + # Convert back to integer keys and return + zh_modified_lines = {} + for line_num_str, translated_text in translated_json.items(): + line_num = int(line_num_str) + zh_modified_lines[line_num] = translated_text + original_text = modified_lines.get(line_num, "") + logger.debug(f"Line {line_num}: '{original_text}' -> '{translated_text}'") + + logger.info(f"Translation completed. Processed {len(zh_modified_lines)} lines.") + return zh_modified_lines + + except (json.JSONDecodeError, ValueError) as e: + logger.error(f"Error parsing JSON response: {e}") + logger.error(f"Response was: {response_text}") + # Fallback: return empty dict to prevent writing untranslated content + return {} + else: + logger.error("Empty response from Gemini API") + return {} + + except Exception as e: + logger.error(f"Error during translation: {e}") + # Fallback: return empty dict to prevent writing untranslated content + return {} + +def update_toc_file(zh_modified_lines, target_file): + """Apply translated content to specific lines in the target TOC file""" + if not zh_modified_lines: + logger.info("No translated content to apply.") + return + + logger.info(f"Applying {len(zh_modified_lines)} translated lines to {target_file}...") + + try: + # Read the target file + content = read_file_from_repo(target_file) + if not content: + logger.error(f"Could not read target file {target_file}") + return + target_lines = content.splitlines(True) + + # Apply translations to specific lines + applied_count = 0 + for line_num, translated_content in zh_modified_lines.items(): + # Convert to 0-based index + line_index = line_num - 1 + + if 0 <= line_index < len(target_lines): + # AI has already provided the complete formatted line, use it directly + target_lines[line_index] = translated_content + applied_count += 1 + else: + logger.warning(f"Line number {line_num} is out of range (file has {len(target_lines)} lines)") + + # Write the updated content back to the file + updated_content = ''.join(target_lines) + write_file_to_repo(target_file, updated_content) + + logger.info(f"Successfully applied {applied_count} translations to {target_file}") + + except Exception as e: + logger.error(f"Error updating TOC file: {e}") + raise + +def cleanup_temp_files(): + """Clean up temporary files""" + try: + if os.path.exists(TEMP_TOC_FILENAME): + os.remove(TEMP_TOC_FILENAME) + logger.info(f"Cleaned up temporary file: {TEMP_TOC_FILENAME}") + except Exception as e: + logger.warning(f"Could not clean up temporary files: {e}") + +def process_toc_file(toc_file_name): + """Process a single TOC file for synchronization""" + target_toc_file = toc_file_name + + 
logger.info("-" * 50) + logger.info(f"Processing {toc_file_name}...") + + logger.info("Extracting EN commit SHA from target file...") + earlier_commit = extract_commit_from_target_file(target_toc_file) + + logger.info("Fetching latest commit SHA for TOC file...") + latest_commit = get_latest_commit_sha(REPO_OWNER, REPO_NAME, EN_BRANCH, toc_file_name) + + # If earlier_commit is different from latest_commit, sync the TOC file. + if earlier_commit and latest_commit and earlier_commit != latest_commit: + # Download the EN TOC content from the earlier commit for comparison + en_toc_path = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{earlier_commit}/{toc_file_name}" + logger.info(f"Downloading EN TOC content from: {en_toc_path}") + en_toc_content = urlopen(en_toc_path).read().decode("utf-8") + + # Write en_toc_content to a file for bilingual comparison + write_file_to_repo(TEMP_TOC_FILENAME, en_toc_content) + + logger.info("Creating bilingual comparison...") + bilingual_list = create_bilingual_comparison(target_toc_file) + + logger.info("Running TOC sync using GitHub compare diff...") + sync_status, modified_lines = sync_toc_files_using_github_compare( + earlier_commit, + latest_commit, + toc_file_name, + target_toc_file, + ) + + if sync_status: + logger.info("TOC file sync completed successfully!") + + # Match with existing bilingual translations + unmatched_lines = replace_content_with_translation(bilingual_list, modified_lines, target_toc_file) + + # Use AI to translate remaining unmatched lines + if unmatched_lines: + logger.info(f"Using AI to translate {len(unmatched_lines)} unmatched lines...") + zh_modified_lines = translate_content(unmatched_lines, target_toc_file) + update_toc_file(zh_modified_lines, target_toc_file) + logger.info("AI translations have been applied successfully!") + else: + logger.info("All lines were matched with existing translations. No AI translation needed.") + else: + logger.error("TOC file sync failed!") + else: + if earlier_commit == latest_commit: + logger.info(f"Earlier commit is the same as latest commit. No sync needed for {toc_file_name}.") + else: + logger.warning(f"Skipping sync for {toc_file_name} due to missing commit information. Check logs for errors.") + +if __name__ == "__main__": + logger.info("Starting TOC synchronization process...") + + for toc_file_name in TOC_FILE_NAMES: + process_toc_file(toc_file_name) + + # Clean up temporary files + cleanup_temp_files() + logger.info("Script execution completed.") From 37213930983841385a26ac4c6983458d96172848 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 2 Sep 2025 17:58:26 +0800 Subject: [PATCH 06/18] Update sync-cloud-zh-toc.yml --- .github/workflows/sync-cloud-zh-toc.yml | 33 ++----------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/.github/workflows/sync-cloud-zh-toc.yml b/.github/workflows/sync-cloud-zh-toc.yml index e9772a769b2c2..b6aca47db3573 100644 --- a/.github/workflows/sync-cloud-zh-toc.yml +++ b/.github/workflows/sync-cloud-zh-toc.yml @@ -38,11 +38,7 @@ jobs: - name: Install Python dependencies run: | - echo "Installing Python dependencies..." pip install "google-genai>=0.3,<1" - echo "Verifying installed packages..." - pip list | grep google - echo "Dependencies installed successfully" - name: Configure Git run: | @@ -55,35 +51,10 @@ jobs: GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }} GITHUB_TOKEN: ${{ github.token }} run: | - echo "Copying script to docs directory..." 
cp temp_scripts/sync-en-cloud-toc-changes-to-zh.py docs/ - echo "Script copied successfully" - - echo "Entering docs directory..." cd docs - - echo "Checking Python version..." - python --version - python3 --version - - echo "Checking script permissions and existence..." - ls -la sync-en-cloud-toc-changes-to-zh.py - - echo "Checking environment variables..." - echo "GEMINI_API_TOKEN is set: $([ -n "$GEMINI_API_TOKEN" ] && echo 'Yes' || echo 'No')" - echo "GITHUB_TOKEN is set: $([ -n "$GITHUB_TOKEN" ] && echo 'Yes' || echo 'No')" - - echo "Starting TOC sync script execution..." - python3 sync-en-cloud-toc-changes-to-zh.py || { - echo "Script execution failed with exit code $?" - echo "Checking for any error logs..." - ls -la - exit 1 - } - - echo "Script execution completed, cleaning up..." - rm sync-en-cloud-toc-changes-to-zh.py - echo "Cleanup completed" + python sync-en-cloud-toc-changes-to-zh.py + rm sync-en-cloud-toc-changes-to-zh.py # Remove the script file so it won't be included in PR - name: Clean up temporary files run: | From 2aadfeb568d98ede9b8fce2add646bb32d7b7039 Mon Sep 17 00:00:00 2001 From: qiancai Date: Thu, 25 Sep 2025 17:56:02 +0800 Subject: [PATCH 07/18] add files --- .github/workflows/sync-docs-cn-to-en.yml | 134 ++ scripts/translate_doc_pr/__init__.py | 22 + scripts/translate_doc_pr/file_adder.py | 193 +++ scripts/translate_doc_pr/file_deleter.py | 45 + scripts/translate_doc_pr/file_updater.py | 1692 +++++++++++++++++++ scripts/translate_doc_pr/main_workflow.py | 691 ++++++++ scripts/translate_doc_pr/pr_analyzer.py | 1447 ++++++++++++++++ scripts/translate_doc_pr/requirements.txt | 4 + scripts/translate_doc_pr/section_matcher.py | 973 +++++++++++ scripts/translate_doc_pr/toc_processor.py | 434 +++++ 10 files changed, 5635 insertions(+) create mode 100644 .github/workflows/sync-docs-cn-to-en.yml create mode 100644 scripts/translate_doc_pr/__init__.py create mode 100644 scripts/translate_doc_pr/file_adder.py create mode 100644 scripts/translate_doc_pr/file_deleter.py create mode 100644 scripts/translate_doc_pr/file_updater.py create mode 100644 scripts/translate_doc_pr/main_workflow.py create mode 100644 scripts/translate_doc_pr/pr_analyzer.py create mode 100644 scripts/translate_doc_pr/requirements.txt create mode 100644 scripts/translate_doc_pr/section_matcher.py create mode 100644 scripts/translate_doc_pr/toc_processor.py diff --git a/.github/workflows/sync-docs-cn-to-en.yml b/.github/workflows/sync-docs-cn-to-en.yml new file mode 100644 index 0000000000000..5fe0aa9e3913b --- /dev/null +++ b/.github/workflows/sync-docs-cn-to-en.yml @@ -0,0 +1,134 @@ +name: Sync Docs Changes from ZH PR to EN PR + +on: + workflow_dispatch: + inputs: + source_pr_url: + description: 'Source PR URL (Chinese docs repository)' + required: true + type: string + default: '' + target_pr_url: + description: 'Target PR URL (English docs repository)' + required: true + type: string + default: '' + ai_provider: + description: 'AI Provider to use for translation' + required: false + type: choice + options: + - deepseek + - gemini + default: 'gemini' + +jobs: + sync-docs: + runs-on: ubuntu-latest + + steps: + - name: Checkout current repository + uses: actions/checkout@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r scripts/translate_doc_pr/requirements.txt + + - name: Extract PR information + id: 
extract_info + run: | + # Extract source repo info + SOURCE_URL="${{ github.event.inputs.source_pr_url }}" + SOURCE_OWNER=$(echo $SOURCE_URL | cut -d'/' -f4) + SOURCE_REPO=$(echo $SOURCE_URL | cut -d'/' -f5) + SOURCE_PR=$(echo $SOURCE_URL | cut -d'/' -f7) + + # Extract target repo info + TARGET_URL="${{ github.event.inputs.target_pr_url }}" + TARGET_OWNER=$(echo $TARGET_URL | cut -d'/' -f4) + TARGET_REPO=$(echo $TARGET_URL | cut -d'/' -f5) + TARGET_PR=$(echo $TARGET_URL | cut -d'/' -f7) + + echo "source_owner=${SOURCE_OWNER}" >> $GITHUB_OUTPUT + echo "source_repo=${SOURCE_REPO}" >> $GITHUB_OUTPUT + echo "source_pr=${SOURCE_PR}" >> $GITHUB_OUTPUT + echo "target_owner=${TARGET_OWNER}" >> $GITHUB_OUTPUT + echo "target_repo=${TARGET_REPO}" >> $GITHUB_OUTPUT + echo "target_pr=${TARGET_PR}" >> $GITHUB_OUTPUT + + echo "Source: ${SOURCE_OWNER}/${SOURCE_REPO}#${SOURCE_PR}" + echo "Target: ${TARGET_OWNER}/${TARGET_REPO}#${TARGET_PR}" + + - name: Get target PR branch info + id: target_branch + run: | + # Get target PR branch name + TARGET_BRANCH=$(curl -s \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/pulls/${{ steps.extract_info.outputs.target_pr }}" \ + | jq -r '.head.ref') + + echo "target_branch=${TARGET_BRANCH}" >> $GITHUB_OUTPUT + echo "Target branch: ${TARGET_BRANCH}" + + - name: Clone target repository + run: | + # Clone target repository with the PR branch + git clone https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}.git target_repo + cd target_repo + git checkout ${{ steps.target_branch.outputs.target_branch }} + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Run sync script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + DEEPSEEK_API_TOKEN: ${{ secrets.DEEPSEEK_API_TOKEN }} + GEMINI_API_TOKEN: ${{ secrets.GEMINI_API_TOKEN }} + SOURCE_PR_URL: ${{ github.event.inputs.source_pr_url }} + TARGET_PR_URL: ${{ github.event.inputs.target_pr_url }} + AI_PROVIDER: ${{ github.event.inputs.ai_provider }} + TARGET_REPO_PATH: ${{ github.workspace }}/target_repo + run: | + cd scripts/translate_doc_pr + python main_workflow.py + + - name: Commit and push changes + run: | + cd target_repo + git add . 
+ if git diff --staged --quiet; then + echo "No changes to commit" + else + git commit -m "Auto-sync: Update English docs from Chinese PR ${{ github.event.inputs.source_pr_url }} + + Synced from: ${{ github.event.inputs.source_pr_url }} + Target PR: ${{ github.event.inputs.target_pr_url }} + AI Provider: ${{ github.event.inputs.ai_provider }} + + Co-authored-by: github-actions[bot] " + + git push origin ${{ steps.target_branch.outputs.target_branch }} + echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}" + fi + + - name: Add comment to target PR + run: | + # Add a comment to the target PR about the sync + curl -X POST \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \ + -d "{ + \"body\": \"šŸ¤– **Auto-sync completed**\\n\\nšŸ“„ **Source PR**: ${{ github.event.inputs.source_pr_url }}\\nšŸŽÆ **Target PR**: ${{ github.event.inputs.target_pr_url }}\\nāœ… English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\" + }" diff --git a/scripts/translate_doc_pr/__init__.py b/scripts/translate_doc_pr/__init__.py new file mode 100644 index 0000000000000..b272696e2e394 --- /dev/null +++ b/scripts/translate_doc_pr/__init__.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 +""" +Auto-Sync PR Changes - Refactored Modular Version + +This package contains the refactored version of the auto-sync-pr-changes script, +split into logical modules for better maintainability and testing. + +Modules: +- pr_analyzer: PR analysis, diff parsing, content getting, hierarchy building +- section_matcher: Section matching (direct matching + AI matching) +- file_adder: New file processing and translation +- file_deleter: Deleted file processing +- file_updater: Updated file processing and translation +- toc_processor: TOC file special processing +- main: Main orchestration function +""" + +# Import main functionality for easy access +from main import main + +# Make main function available at package level +__all__ = ["main"] diff --git a/scripts/translate_doc_pr/file_adder.py b/scripts/translate_doc_pr/file_adder.py new file mode 100644 index 0000000000000..57e93b2fb1c63 --- /dev/null +++ b/scripts/translate_doc_pr/file_adder.py @@ -0,0 +1,193 @@ +""" +File Adder Module +Handles processing and translation of newly added files +""" + +import os +import re +import json +import threading +from github import Github +from openai import OpenAI + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def create_section_batches(file_content, max_lines_per_batch=200): + """Create batches of file content for translation, respecting section boundaries""" + lines = file_content.split('\n') + + # Find all section headers + section_starts = [] + for i, line in enumerate(lines): + line = line.strip() + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + section_starts.append(i + 1) # 1-based line numbers + + # If no sections found, just batch by line count + if not section_starts: + batches = [] + for i in range(0, len(lines), max_lines_per_batch): + batch_lines = lines[i:i + max_lines_per_batch] + batches.append('\n'.join(batch_lines)) + return batches + 
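For reference, `create_section_batches` decides batch boundaries from ATX headers matched by the `^(#{1,10})\s+(.+)` pattern above. A quick standalone check of that pattern, using a made-up sample document:

    import re

    HEADER_RE = re.compile(r'^(#{1,10})\s+(.+)')

    sample = "# Title\n\nIntro text.\n\n## Section A\nBody.\n"
    header_lines = [i + 1 for i, line in enumerate(sample.split('\n'))
                    if HEADER_RE.match(line.strip())]
    print(header_lines)  # [1, 5] -> 1-based start lines, as collected in section_starts
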
+ # Create batches respecting section boundaries + batches = [] + current_batch_start = 0 + + for i, section_start in enumerate(section_starts): + section_start_idx = section_start - 1 # Convert to 0-based + + # Check if adding this section would exceed the line limit + if (section_start_idx - current_batch_start) > max_lines_per_batch: + # Close current batch at the previous section boundary + if current_batch_start < section_start_idx: + batch_lines = lines[current_batch_start:section_start_idx] + batches.append('\n'.join(batch_lines)) + current_batch_start = section_start_idx + + # If this is the last section, or the next section would create a batch too large + if i == len(section_starts) - 1: + # Add remaining content as final batch + batch_lines = lines[current_batch_start:] + batches.append('\n'.join(batch_lines)) + else: + next_section_start = section_starts[i + 1] - 1 # 0-based + if (next_section_start - current_batch_start) > max_lines_per_batch: + # Close current batch at current section boundary + batch_lines = lines[current_batch_start:section_start_idx] + if batch_lines: # Only add non-empty batches + batches.append('\n'.join(batch_lines)) + current_batch_start = section_start_idx + + # Clean up any empty batches + batches = [batch for batch in batches if batch.strip()] + + return batches + +def translate_file_batch(batch_content, ai_client, source_language="English", target_language="Chinese"): + """Translate a single batch of file content using AI""" + if not batch_content.strip(): + return batch_content + + thread_safe_print(f" šŸ¤– Translating batch ({len(batch_content.split())} words)...") + + prompt = f"""You are a professional technical writer. Please translate the following {source_language} content to {target_language}. + +IMPORTANT INSTRUCTIONS: +1. Preserve ALL Markdown formatting (headers, links, code blocks, tables, etc.) +2. Do NOT translate: + - Code examples, SQL queries, configuration values + - Technical terms like "TiDB", "TiKV", "PD", API names, etc. + - File paths, URLs, and command line examples + - Variable names and system configuration parameters +3. Translate only the descriptive text and explanations +4. Maintain the exact structure and indentation +5. 
Keep all special characters and formatting intact + +Content to translate: +{batch_content} + +Please provide the translated content maintaining all formatting and structure.""" + + # Add token estimation + try: + from main import print_token_estimation + print_token_estimation(prompt, "File addition translation") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + print(f" šŸ’° File addition translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + print(f" šŸ’° File addition translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + translated_content = ai_client.chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.1 + ) + thread_safe_print(f" āœ… Batch translation completed") + return translated_content + + except Exception as e: + thread_safe_print(f" āŒ Batch translation failed: {e}") + return batch_content # Return original content if translation fails + +def process_added_files(added_files, pr_url, github_client, ai_client, repo_config): + """Process newly added files by translating and creating them in target repository""" + if not added_files: + thread_safe_print("\nšŸ“„ No new files to process") + return + + thread_safe_print(f"\nšŸ“„ Processing {len(added_files)} newly added files...") + + target_local_path = repo_config['target_local_path'] + source_language = repo_config['source_language'] + target_language = repo_config['target_language'] + + for file_path, file_content in added_files.items(): + thread_safe_print(f"\nšŸ“ Processing new file: {file_path}") + + # Create target file path + target_file_path = os.path.join(target_local_path, file_path) + target_dir = os.path.dirname(target_file_path) + + # Create directory if it doesn't exist + if not os.path.exists(target_dir): + os.makedirs(target_dir, exist_ok=True) + thread_safe_print(f" šŸ“ Created directory: {target_dir}") + + # Check if file already exists + if os.path.exists(target_file_path): + thread_safe_print(f" āš ļø Target file already exists: {target_file_path}") + continue + + # Create section batches for translation + batches = create_section_batches(file_content, max_lines_per_batch=200) + thread_safe_print(f" šŸ“¦ Created {len(batches)} batches for translation") + + # Translate each batch + translated_batches = [] + for i, batch in enumerate(batches): + thread_safe_print(f" šŸ”„ Processing batch {i+1}/{len(batches)}") + translated_batch = translate_file_batch( + batch, + ai_client, + source_language, + target_language + ) + translated_batches.append(translated_batch) + + # Combine translated batches + translated_content = '\n'.join(translated_batches) + + # Write translated content to target file + try: + with open(target_file_path, 'w', encoding='utf-8') as f: + f.write(translated_content) + + thread_safe_print(f" āœ… Created translated file: {target_file_path}") + + except Exception as e: + thread_safe_print(f" āŒ Error creating file {target_file_path}: {e}") + + thread_safe_print(f"\nāœ… Completed processing all new files") diff --git a/scripts/translate_doc_pr/file_deleter.py 
b/scripts/translate_doc_pr/file_deleter.py new file mode 100644 index 0000000000000..c2064fe568cf3 --- /dev/null +++ b/scripts/translate_doc_pr/file_deleter.py @@ -0,0 +1,45 @@ +""" +File Deleter Module +Handles processing of deleted files and deleted sections +""" + +import os +import threading +from github import Github + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def process_deleted_files(deleted_files, github_client, repo_config): + """Process deleted files by removing them from target repository""" + if not deleted_files: + thread_safe_print("\nšŸ—‘ļø No files to delete") + return + + thread_safe_print(f"\nšŸ—‘ļø Processing {len(deleted_files)} deleted files...") + + target_local_path = repo_config['target_local_path'] + + for file_path in deleted_files: + thread_safe_print(f"\nšŸ—‘ļø Processing deleted file: {file_path}") + + # Create target file path + target_file_path = os.path.join(target_local_path, file_path) + + # Check if file exists in target + if os.path.exists(target_file_path): + try: + os.remove(target_file_path) + thread_safe_print(f" āœ… Deleted file: {target_file_path}") + except Exception as e: + thread_safe_print(f" āŒ Error deleting file {target_file_path}: {e}") + else: + thread_safe_print(f" āš ļø Target file not found: {target_file_path}") + + thread_safe_print(f"\nāœ… Completed processing deleted files") + +# Section deletion logic moved to file_updater.py diff --git a/scripts/translate_doc_pr/file_updater.py b/scripts/translate_doc_pr/file_updater.py new file mode 100644 index 0000000000000..82addd7cc6881 --- /dev/null +++ b/scripts/translate_doc_pr/file_updater.py @@ -0,0 +1,1692 @@ +""" +File Updater Module +Handles processing and translation of updated files and sections +""" + +import os +import re +import json +import threading +from concurrent.futures import ThreadPoolExecutor +from github import Github +from openai import OpenAI + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, source_language, target_language, target_file_name=None): + """Use AI to update target sections based on source old content, PR diff, and target sections""" + if not source_old_content_dict or not target_sections: + return {} + + # Filter out deleted sections and prepare source sections from old content + source_sections = {} + for key, old_content in source_old_content_dict.items(): + # Skip deleted sections + if 'deleted' in key: + continue + + # Handle null values by using empty string + content = old_content if old_content is not None else "" + source_sections[key] = content + + # Keep the original order from match_source_diff_to_target.json (no sorting needed) + formatted_source_sections = json.dumps(source_sections, ensure_ascii=False, indent=2) + formatted_target_sections = json.dumps(target_sections, ensure_ascii=False, indent=2) + + thread_safe_print(f" šŸ“Š Source sections: {len(source_sections)} sections") + thread_safe_print(f" šŸ“Š Target sections: {len(target_sections)} sections") + + # Calculate total content size + total_source_chars = sum(len(str(content)) for content in source_sections.values()) + total_target_chars = sum(len(str(content)) for content in target_sections.values()) + thread_safe_print(f" šŸ“ Content size: Source={total_source_chars:,} chars, 
Target={total_target_chars:,} chars") + + thread_safe_print(f" šŸ¤– Getting AI translation for {len(source_sections)} sections...") + + diff_content = source_sections + + prompt = f"""You are a professional technical writer in the Database domain. I will provide you with: + +1. Source sections in {source_language}: +{formatted_source_sections} + +2. GitHub PR changes (Diff): +{pr_diff} + +3. Current target sections in {target_language}: +{formatted_target_sections} + +Task: Update the target sections in {target_language} according to the diff in {source_language}. + +Instructions: +1. Carefully analyze the PR diff to understand what changes were made (additions, deletions, modifications) +2. Find the corresponding positions in the {target_language} sections and make the same changes. Do not change any content that is not modified in the diff, especially the format. +3. Keep the JSON structure unchanged, only modify the section content +4. Ensure the updated {target_language} content is logically consistent with the {source_language} changes +5. Maintain proper technical writing style and terminology in {target_language}. If a sentence in the diff is unchanged in content but only reordered in {source_language}, reuse its existing translation in {target_language}. + +Please return the complete updated JSON in the same format as target sections, without any additional explanatory text.""" + + # Save prompt to file for reference with target file prefix + target_file_prefix = "unknown" + if target_file_name: + # Use provided target file name + target_file_prefix = target_file_name.replace('/', '_').replace('.md', '') + elif target_sections: + # Try to extract filename from the first section key or content + first_key = next(iter(target_sections.keys()), "") + if "_" in first_key: + # If key contains underscore, it might have target file info + parts = first_key.split("_") + if len(parts) > 1: + target_file_prefix = parts[0] + + # Ensure temp_output directory exists + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + prompt_file = os.path.join(temp_dir, f"{target_file_prefix}_prompt-for-ai-translation.txt") + with open(prompt_file, 'w', encoding='utf-8') as f: + f.write(prompt) + + thread_safe_print(f"\nšŸ’¾ Prompt saved to {prompt_file}") + thread_safe_print(f"šŸ“ Prompt length: {len(prompt)} characters") + thread_safe_print(f"šŸ“Š Source sections: {len(source_sections)}") + thread_safe_print(f"šŸ“Š Target sections: {len(target_sections)}") + thread_safe_print(f"šŸ¤– Sending prompt to AI...") + + thread_safe_print(f"\n šŸ“¤ AI Update Prompt ({source_language} → {target_language}):") + thread_safe_print(f" " + "="*80) + thread_safe_print(f" Source Sections: {formatted_source_sections[:500]}...") + thread_safe_print(f" PR Diff (first 500 chars): {pr_diff[:500]}...") + thread_safe_print(f" Target Sections: {formatted_target_sections[:500]}...") + thread_safe_print(f" " + "="*80) + + try: + from main import print_token_estimation + print_token_estimation(prompt, f"Document translation ({source_language} → {target_language})") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + 
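Note that `print_token_estimation` lives in the package's `main` module, which is not part of this diff. Judging from the inline tiktoken fallback used here, a minimal version would look roughly like the following; this is an illustrative sketch, not the actual implementation:

    def print_token_estimation(prompt, label):
        # Exact count via tiktoken when available, otherwise ~4 characters per token.
        try:
            import tiktoken
            tokens = len(tiktoken.get_encoding("cl100k_base").encode(prompt))
            detail = f"Actual tokens: {tokens:,} (using tiktoken cl100k_base)"
        except Exception:
            detail = f"Estimated tokens: ~{len(prompt) // 4:,} (4 chars/token approximation)"
        print(f"šŸ’° {label}")
        print(f"šŸ“ Input: {len(prompt):,} characters")
        print(f"šŸ”¢ {detail}")
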
thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + ai_response = ai_client.chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.1 + ) + thread_safe_print(f" šŸ“ AI translation response received") + thread_safe_print(f" šŸ“‹ AI response (first 500 chars): {ai_response[:500]}...") + + result = parse_updated_sections(ai_response) + thread_safe_print(f" šŸ“Š Parsed {len(result)} sections from AI response") + + # Save AI results to file with target file prefix + ai_results_file = os.path.join(temp_dir, f"{target_file_prefix}_updated_sections_from_ai.json") + with open(ai_results_file, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) + + thread_safe_print(f" šŸ’¾ AI results saved to {ai_results_file}") + return result + + except Exception as e: + thread_safe_print(f" āŒ AI translation failed: {e}") + return {} + +def parse_updated_sections(ai_response): + """Parse AI response and extract JSON (from get-updated-target-sections.py)""" + # Ensure temp_output directory exists for debug files + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + try: + print(f"\n šŸ”§ Parsing AI response...") + print(f" Raw response length: {len(ai_response)} characters") + + # Try to extract JSON from AI response + cleaned_response = ai_response.strip() + + # Remove markdown code blocks if present + if cleaned_response.startswith('```json'): + cleaned_response = cleaned_response[7:] + print(f" šŸ“ Removed '```json' prefix") + elif cleaned_response.startswith('```'): + cleaned_response = cleaned_response[3:] + print(f" šŸ“ Removed '```' prefix") + + if cleaned_response.endswith('```'): + cleaned_response = cleaned_response[:-3] + print(f" šŸ“ Removed '```' suffix") + + cleaned_response = cleaned_response.strip() + + print(f" šŸ“ Cleaned response length: {len(cleaned_response)} characters") + print(f" šŸ“ First 200 chars: {cleaned_response[:200]}...") + print(f" šŸ“ Last 200 chars: ...{cleaned_response[-200:]}") + + # Try to find JSON content between curly braces + start_idx = cleaned_response.find('{') + end_idx = cleaned_response.rfind('}') + + if start_idx != -1 and end_idx != -1 and end_idx > start_idx: + json_content = cleaned_response[start_idx:end_idx+1] + print(f" šŸ“ Extracted JSON content length: {len(json_content)} characters") + + try: + # Parse JSON + updated_sections = json.loads(json_content) + print(f" āœ… Successfully parsed JSON with {len(updated_sections)} sections") + return updated_sections + except json.JSONDecodeError as e: + print(f" āš ļø JSON seems incomplete, trying to fix...") + + # Try to fix incomplete JSON by finding the last complete entry + lines = json_content.split('\n') + fixed_lines = [] + in_value = False + quote_count = 0 + + for line in lines: + if '"' in line: + quote_count += line.count('"') + + fixed_lines.append(line) + + # If we have an even number of quotes, we might have a complete entry + if quote_count % 2 == 0 and (line.strip().endswith(',') or 
line.strip().endswith('"')): + # Try to parse up to this point + potential_json = '\n'.join(fixed_lines) + if not potential_json.rstrip().endswith('}'): + # Remove trailing comma and add closing brace + if potential_json.rstrip().endswith(','): + potential_json = potential_json.rstrip()[:-1] + '\n}' + else: + potential_json += '\n}' + + try: + partial_sections = json.loads(potential_json) + print(f" šŸ”§ Fixed JSON with {len(partial_sections)} sections") + return partial_sections + except: + continue + + # If all else fails, return the original error + raise e + else: + print(f" āŒ Could not find valid JSON structure in response") + return None + + except json.JSONDecodeError as e: + print(f" āŒ Error parsing AI response as JSON: {e}") + print(f" šŸ“ Error at position: {e.pos if hasattr(e, 'pos') else 'unknown'}") + + # Save debug info + debug_file = os.path.join(temp_dir, f"ai_response_debug_{os.getpid()}.txt") + with open(debug_file, 'w', encoding='utf-8') as f: + f.write("Original AI Response:\n") + f.write("="*80 + "\n") + f.write(ai_response) + f.write("\n" + "="*80 + "\n") + f.write("Cleaned Response:\n") + f.write("-"*80 + "\n") + f.write(cleaned_response if 'cleaned_response' in locals() else "Not available") + + print(f" šŸ“ Debug info saved to: {debug_file}") + return None + except Exception as e: + print(f" āŒ Unexpected error parsing AI response: {e}") + return None + + +def replace_frontmatter_content(lines, new_content): + """Replace content from beginning of file to first top-level header""" + # Find the first top-level header + first_header_idx = None + for i, line in enumerate(lines): + if line.strip().startswith('# '): + first_header_idx = i + break + + if first_header_idx is None: + # No top-level header found, replace entire content + return new_content.split('\n') + + # Replace content from start to before first header + new_lines = new_content.split('\n') + return new_lines + lines[first_header_idx:] + + +def replace_toplevel_section_content(lines, target_line_num, new_content): + """Replace content from top-level header to first next-level header""" + start_idx = target_line_num - 1 # Convert to 0-based index + + # Find the end of top-level section (before first ## header) + end_idx = len(lines) + for i in range(start_idx + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): # Found first next-level header + end_idx = i + break + + # Replace the top-level section content (from start_idx to end_idx) + new_lines = new_content.split('\n') + return lines[:start_idx] + new_lines + lines[end_idx:] + + +def update_local_document(file_path, updated_sections, hierarchy_dict, target_local_path): + """Update local document using hierarchy-based section identification (from update-target-doc-v2.py)""" + local_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(local_path): + print(f" āŒ Local file not found: {local_path}") + return False + + try: + # Read document content + with open(local_path, 'r', encoding='utf-8') as f: + document_content = f.read() + + lines = document_content.split('\n') + + replacements_made = [] + + # Use a unified approach: build a complete replacement plan first, then execute it + # This avoids line number shifts during the replacement process + + # Find section boundaries for ALL sections + section_boundaries = find_section_boundaries(lines, hierarchy_dict) + + # Create a comprehensive replacement plan + replacement_plan = [] + + for line_num, new_content in updated_sections.items(): + if line_num == 
"0": + # Special handling for frontmatter + first_header_idx = None + for i, line in enumerate(lines): + if line.strip().startswith('# '): + first_header_idx = i + break + + replacement_plan.append({ + 'type': 'frontmatter', + 'start': 0, + 'end': first_header_idx if first_header_idx else len(lines), + 'new_content': new_content, + 'line_num': line_num + }) + + elif line_num in hierarchy_dict: + hierarchy = hierarchy_dict[line_num] + if ' > ' not in hierarchy: # Top-level section + # Special handling for top-level sections + start_idx = int(line_num) - 1 + end_idx = len(lines) + for i in range(start_idx + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): + end_idx = i + break + + replacement_plan.append({ + 'type': 'toplevel', + 'start': start_idx, + 'end': end_idx, + 'new_content': new_content, + 'line_num': line_num + }) + else: + # Regular section + if line_num in section_boundaries: + boundary = section_boundaries[line_num] + replacement_plan.append({ + 'type': 'regular', + 'start': boundary['start'], + 'end': boundary['end'], + 'new_content': new_content, + 'line_num': line_num, + 'hierarchy': boundary['hierarchy'] + }) + else: + print(f" āš ļø Section at line {line_num} not found in hierarchy") + + # Sort replacement plan: process from bottom to top of the document to avoid line shifts + # Sort by start line in reverse order (highest line number first) + replacement_plan.sort(key=lambda x: -x['start']) + + # Execute replacements in the planned order (from bottom to top) + print(f" šŸ“‹ Executing {len(replacement_plan)} replacements from bottom to top:") + for i, replacement in enumerate(replacement_plan): + print(f" {i+1}. {replacement['type']} (line {replacement.get('line_num', '0')}, start: {replacement['start']})") + + for replacement in replacement_plan: + start = replacement['start'] + end = replacement['end'] + new_content = replacement['new_content'] + new_lines = new_content.split('\n') + + # Replace the content + lines = lines[:start] + new_lines + lines[end:] + + # Record the replacement + original_line_count = end - start + line_diff = len(new_lines) - original_line_count + + replacements_made.append({ + 'type': replacement['type'], + 'line_num': replacement.get('line_num', 'N/A'), + 'hierarchy': replacement.get('hierarchy', 'N/A'), + 'start': start, + 'end': end, + 'original_lines': original_line_count, + 'new_lines': len(new_lines), + 'line_diff': line_diff + }) + + print(f" āœ… Updated {replacement['type']} section: {replacement.get('line_num', 'frontmatter')}") + + # Save updated document + with open(local_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + print(f" āœ… Updated {len(replacements_made)} sections") + for replacement in replacements_made: + print(f" šŸ“ Line {replacement['line_num']}: {replacement['hierarchy']}") + + return True + + except Exception as e: + thread_safe_print(f" āŒ Error updating file: {e}") + return False + +def find_section_boundaries(lines, hierarchy_dict): + """Find the start and end line for each section based on hierarchy (from update-target-doc-v2.py)""" + section_boundaries = {} + + # Sort sections by line number + sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) + + for i, (line_num, hierarchy) in enumerate(sorted_sections): + start_line = int(line_num) - 1 # Convert to 0-based index + + # Find end line (start of next section at same or higher level) + end_line = len(lines) # Default to end of document + + if start_line >= len(lines): + continue + + # Get current 
section level + current_line = lines[start_line].strip() + if not current_line.startswith('#'): + continue + + current_level = len(current_line.split()[0]) # Count # characters + + # Look for next section at same or higher level + for j in range(start_line + 1, len(lines)): + line = lines[j].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + end_line = j + break + + section_boundaries[line_num] = { + 'start': start_line, + 'end': end_line, + 'hierarchy': hierarchy, + 'level': current_level + } + + return section_boundaries + +def insert_sections_into_document(file_path, translated_sections, target_insertion_points, target_local_path): + """Insert translated sections into the target document at specified points""" + + if not translated_sections or not target_insertion_points: + thread_safe_print(f" āš ļø No sections or insertion points provided") + return False + + local_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(local_path): + thread_safe_print(f" āŒ Local file not found: {local_path}") + return False + + try: + # Read document content + with open(local_path, 'r', encoding='utf-8') as f: + document_content = f.read() + + lines = document_content.split('\n') + thread_safe_print(f" šŸ“„ Document has {len(lines)} lines") + + # Sort insertion points by line number in descending order to avoid position shifts + sorted_insertions = sorted( + target_insertion_points.items(), + key=lambda x: x[1]['insertion_after_line'], + reverse=True + ) + + insertions_made = [] + + for group_id, point_data in sorted_insertions: + insertion_after_line = point_data['insertion_after_line'] + new_sections = point_data['new_sections'] + insertion_type = point_data['insertion_type'] + + thread_safe_print(f" šŸ“Œ Inserting {len(new_sections)} sections after line {insertion_after_line}") + + # Convert 1-based line number to 0-based index for insertion point + # insertion_after_line is 1-based, so insertion_index should be insertion_after_line - 1 + insertion_index = insertion_after_line - 1 + + # Prepare new content to insert + new_content_lines = [] + + # Add an empty line before the new sections if not already present + if insertion_index < len(lines) and lines[insertion_index].strip(): + new_content_lines.append("") + + # Add each translated section + for section_line_num in new_sections: + # Find the corresponding translated content + section_hierarchy = None + section_content = None + + # Search for the section in translated_sections by line number or hierarchy + for hierarchy, content in translated_sections.items(): + # Try to match by hierarchy or find the content + if str(section_line_num) in hierarchy or content: # This is a simplified matching + section_hierarchy = hierarchy + section_content = content + break + + if section_content: + # Split content into lines and add to insertion + content_lines = section_content.split('\n') + new_content_lines.extend(content_lines) + + # Add spacing between sections + if section_line_num != new_sections[-1]: # Not the last section + new_content_lines.append("") + + thread_safe_print(f" āœ… Added section: {section_hierarchy}") + else: + thread_safe_print(f" āš ļø Could not find translated content for section at line {section_line_num}") + + # Add an empty line after the new sections if not already present + # Check if the new content already ends with an empty line + if new_content_lines and not new_content_lines[-1].strip(): + # Content already ends with 
empty line, don't add another + pass + elif insertion_index + 1 < len(lines) and lines[insertion_index + 1].strip(): + # Next line has content and our content doesn't end with empty line, add one + new_content_lines.append("") + + # Insert the new content (insert after insertion_index line, before the next line) + # If insertion_after_line is 251, we want to insert at position 252 (0-based index 251) + lines = lines[:insertion_index + 1] + new_content_lines + lines[insertion_index + 1:] + + insertions_made.append({ + 'group_id': group_id, + 'insertion_after_line': insertion_after_line, + 'sections_count': len(new_sections), + 'lines_added': len(new_content_lines), + 'insertion_type': insertion_type + }) + + # Save updated document + with open(local_path, 'w', encoding='utf-8') as f: + f.write('\n'.join(lines)) + + thread_safe_print(f" āœ… Successfully inserted {len(insertions_made)} section groups") + for insertion in insertions_made: + thread_safe_print(f" šŸ“ {insertion['group_id']}: {insertion['sections_count']} sections, {insertion['lines_added']} lines after line {insertion['insertion_after_line']}") + + return True + + except Exception as e: + thread_safe_print(f" āŒ Error inserting sections: {e}") + return False + +def process_modified_sections(modified_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process modified sections with full data structure support""" + results = [] + + for file_path, file_data in modified_sections.items(): + thread_safe_print(f"\nšŸ“„ Processing {file_path}") + + try: + # Call process_single_file with the complete data structure + success, message = process_single_file( + file_path, + file_data, # Pass the complete data structure (includes 'sections', 'original_hierarchy', etc.) 
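Both the replacement plan in `update_local_document` and the insertion logic above process positions from the bottom of the file upward so that an edit never shifts the line numbers of edits still to come. A toy illustration of that ordering (sample data is made up):

    lines = ["a", "b", "c", "d"]
    insertions = {2: ["X"], 4: ["Y"]}  # insert after 1-based lines 2 and 4

    # Highest position first, so earlier indices stay valid while we edit.
    for after_line in sorted(insertions, reverse=True):
        lines = lines[:after_line] + insertions[after_line] + lines[after_line:]

    print(lines)  # ['a', 'b', 'X', 'c', 'd', 'Y']
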
+ pr_diff, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + + if success: + thread_safe_print(f" āœ… Successfully processed {file_path}") + results.append((file_path, True, message)) + else: + thread_safe_print(f" āŒ Failed to process {file_path}: {message}") + results.append((file_path, False, message)) + + except Exception as e: + thread_safe_print(f" āŒ Error processing {file_path}: {e}") + results.append((file_path, False, f"Error processing {file_path}: {e}")) + + return results + +def process_deleted_sections(deleted_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process deleted sections with full data structure support""" + results = [] + + for file_path, source_sections in deleted_sections.items(): + thread_safe_print(f"\nšŸ—‘ļø Processing deleted sections in {file_path}") + + try: + # Call process_single_file_deletion with the complete data structure + success, message = process_single_file_deletion( + file_path, + source_sections, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + + if success: + thread_safe_print(f" āœ… Successfully processed deletions in {file_path}") + results.append((file_path, True, message)) + else: + thread_safe_print(f" āŒ Failed to process deletions in {file_path}: {message}") + results.append((file_path, False, message)) + + except Exception as e: + thread_safe_print(f" āŒ Error processing deletions in {file_path}: {e}") + results.append((file_path, False, f"Error processing deletions in {file_path}: {e}")) + + return results + +def process_single_file_deletion(file_path, source_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process deletion of sections in a single file""" + + # Import needed functions + from pr_analyzer import get_target_hierarchy_and_content + from section_matcher import ( + find_direct_matches_for_special_files, + filter_non_system_sections, + get_corresponding_sections, + is_system_variable_or_config, + clean_title_for_matching, + parse_ai_response, + find_matching_line_numbers + ) + + # Get target file hierarchy and content + target_hierarchy, target_lines = get_target_hierarchy_and_content( + file_path, github_client, repo_config['target_repo'] + ) + + if not target_hierarchy: + return False, f"Could not get target hierarchy for {file_path}" + + # Separate system variables from regular sections for hybrid mapping + system_sections = {} + regular_sections = {} + + for line_num, hierarchy in source_sections.items(): + # Extract title for checking + if ' > ' in hierarchy: + title = hierarchy.split(' > ')[-1] + else: + title = hierarchy + + cleaned_title = clean_title_for_matching(title) + if is_system_variable_or_config(cleaned_title): + system_sections[line_num] = hierarchy + else: + regular_sections[line_num] = hierarchy + + sections_to_delete = [] + + # Process system variables with direct matching + if system_sections: + thread_safe_print(f" šŸŽÆ Direct matching for {len(system_sections)} system sections...") + matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( + system_sections, target_hierarchy, target_lines + ) + + for target_line_num, hierarchy_string in matched_dict.items(): + sections_to_delete.append(int(target_line_num)) + thread_safe_print(f" āœ… Marked system section for deletion: line {target_line_num}") + + if failed_matches: + thread_safe_print(f" āŒ Failed to match {len(failed_matches)} system sections") + for 
failed_line in failed_matches: + thread_safe_print(f" - Line {failed_line}: {system_sections[failed_line]}") + + # Process regular sections with AI matching + if regular_sections: + thread_safe_print(f" šŸ¤– AI matching for {len(regular_sections)} regular sections...") + + # Filter target hierarchy for AI + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + # Check if filtered hierarchy is reasonable for AI + if len(filtered_target_hierarchy) > max_non_system_sections: + thread_safe_print(f" āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}") + else: + # Get AI mapping (convert dict values to lists as expected by the function) + source_list = list(regular_sections.values()) + target_list = list(filtered_target_hierarchy.values()) + + ai_mapping = get_corresponding_sections( + source_list, + target_list, + ai_client, + repo_config['source_language'], + repo_config['target_language'], + max_tokens=20000 # Use default value for now, can be made configurable later + ) + + if ai_mapping: + # Parse AI response and find matching line numbers + ai_sections = parse_ai_response(ai_mapping) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) + + for source_line, target_line in ai_matched.items(): + try: + sections_to_delete.append(int(target_line)) + thread_safe_print(f" āœ… Marked regular section for deletion: line {target_line}") + except ValueError as e: + thread_safe_print(f" āŒ Error converting target_line to int: {target_line}, error: {e}") + # If target_line is not a number, try to find it in target_hierarchy + for line_num, hierarchy in target_hierarchy.items(): + if target_line in hierarchy or hierarchy in target_line: + sections_to_delete.append(int(line_num)) + thread_safe_print(f" āœ… Found matching section at line {line_num}: {hierarchy}") + break + + # Delete the sections from local document + if sections_to_delete: + success = delete_sections_from_document(file_path, sections_to_delete, repo_config['target_local_path']) + if success: + return True, f"Successfully deleted {len(sections_to_delete)} sections from {file_path}" + else: + return False, f"Failed to delete sections from {file_path}" + else: + return False, f"No sections to delete in {file_path}" + +def delete_sections_from_document(file_path, sections_to_delete, target_local_path): + """Delete specified sections from the local document""" + target_file_path = os.path.join(target_local_path, file_path) + + if not os.path.exists(target_file_path): + thread_safe_print(f" āŒ Target file not found: {target_file_path}") + return False + + try: + # Read current file content + with open(target_file_path, 'r', encoding='utf-8') as f: + content = f.read() + + lines = content.split('\n') + + # Import needed function + from pr_analyzer import build_hierarchy_dict + + # Build hierarchy to understand section boundaries + target_hierarchy = build_hierarchy_dict(content) + + # Sort sections to delete in reverse order to maintain line numbers + sections_to_delete.sort(reverse=True) + + thread_safe_print(f" šŸ—‘ļø Deleting {len(sections_to_delete)} sections from {file_path}") + + for section_line in sections_to_delete: + section_start = section_line - 1 # Convert to 0-based index + + if section_start < 0 or section_start >= len(lines): + thread_safe_print(f" āŒ Invalid section line: {section_line}") + continue + + # Find section end + section_end = len(lines) - 1 # Default to end of file + + # Look for next header at same or higher level + 
current_line = lines[section_start].strip() + if current_line.startswith('#'): + current_level = len(current_line.split('#')[1:]) # Count # characters + + for i in range(section_start + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + line_level = len(line.split('#')[1:]) + if line_level <= current_level: + section_end = i - 1 + break + + # Delete section (from section_start to section_end inclusive) + thread_safe_print(f" šŸ—‘ļø Deleting lines {section_start + 1} to {section_end + 1}") + del lines[section_start:section_end + 1] + + # Write updated content back to file + updated_content = '\n'.join(lines) + with open(target_file_path, 'w', encoding='utf-8') as f: + f.write(updated_content) + + thread_safe_print(f" āœ… Updated file: {target_file_path}") + return True + + except Exception as e: + thread_safe_print(f" āŒ Error deleting sections from {target_file_path}: {e}") + return False + +def process_single_file(file_path, source_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process a single file - thread-safe function for parallel processing""" + thread_id = threading.current_thread().name + thread_safe_print(f"\nšŸ“„ [{thread_id}] Processing {file_path}") + + try: + # Check if this is a TOC file with special operations + if isinstance(source_sections, dict) and 'type' in source_sections and source_sections['type'] == 'toc': + from toc_processor import process_toc_file + return process_toc_file(file_path, source_sections, pr_url, github_client, ai_client, repo_config) + + # Check if this is enhanced sections + if isinstance(source_sections, dict) and 'sections' in source_sections: + if source_sections.get('type') == 'enhanced_sections': + # Skip all the matching logic and directly extract data + thread_safe_print(f" [{thread_id}] šŸš€ Using enhanced sections data, skipping matching logic") + enhanced_sections = source_sections['sections'] + + # Extract target sections and source old content from enhanced sections + # Maintain the exact order from match_source_diff_to_target.json + from collections import OrderedDict + target_sections = OrderedDict() + source_old_content_dict = OrderedDict() + + # Process in the exact order they appear in enhanced_sections (which comes from match_source_diff_to_target.json) + for key, section_info in enhanced_sections.items(): + if isinstance(section_info, dict): + operation = section_info.get('source_operation', '') + + # Skip deleted sections - they shouldn't be in the enhanced_sections anyway + if operation == 'deleted': + continue + + # For source sections: use old_content for modified, new_content for added + if operation == 'added': + source_content = section_info.get('source_new_content', '') + else: # modified + source_content = section_info.get('source_old_content', '') + + # For target sections: use target_content for modified, empty string for added + if operation == 'added': + target_content = "" # Added sections have no existing target content + else: # modified + target_content = section_info.get('target_content', '') + + # Add to both dictionaries using the same key from match_source_diff_to_target.json + if source_content is not None: + source_old_content_dict[key] = source_content + target_sections[key] = target_content + + thread_safe_print(f" [{thread_id}] šŸ“Š Extracted: {len(target_sections)} target sections, {len(source_old_content_dict)} source old content entries") + + # Update sections with AI (get-updated-target-sections.py logic) + 
thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") + updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) + if not updated_sections: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") + return False, f"Could not get AI update for {file_path}" + + # Return the AI results for further processing + thread_safe_print(f" [{thread_id}] āœ… Successfully got AI translation results for {file_path}") + return True, updated_sections # Return the actual AI results + + else: + # New format: complete data structure + actual_sections = source_sections['sections'] + + # Regular file processing continues here for old format + # Get target hierarchy and content (get-target-affected-hierarchy.py logic) + from pr_analyzer import get_target_hierarchy_and_content + target_hierarchy, target_lines = get_target_hierarchy_and_content(file_path, github_client, repo_config['target_repo']) + if not target_hierarchy: + thread_safe_print(f" [{thread_id}] āš ļø Could not get target content") + return False, f"Could not get target content for {file_path}" + else: + # Old format: direct dict + actual_sections = source_sections + + # Only do mapping if we don't have enhanced sections + if 'enhanced_sections' not in locals() or not enhanced_sections: + # Separate different types of sections + from section_matcher import is_system_variable_or_config + system_var_sections = {} + toplevel_sections = {} + frontmatter_sections = {} + regular_sections = {} + + for line_num, hierarchy in actual_sections.items(): + if line_num == "0" and hierarchy == "frontmatter": + # Special handling for frontmatter + frontmatter_sections[line_num] = hierarchy + else: + # Extract the leaf title from hierarchy + leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy + + if is_system_variable_or_config(leaf_title): + system_var_sections[line_num] = hierarchy + elif leaf_title.startswith('# '): + # Top-level titles need special handling + toplevel_sections[line_num] = hierarchy + else: + regular_sections[line_num] = hierarchy + + thread_safe_print(f" [{thread_id}] šŸ“Š Found {len(system_var_sections)} system variable/config, {len(toplevel_sections)} top-level, {len(frontmatter_sections)} frontmatter, and {len(regular_sections)} regular sections") + + target_affected = {} + + # Process frontmatter sections with special handling + if frontmatter_sections: + thread_safe_print(f" [{thread_id}] šŸ“„ Processing frontmatter section...") + # For frontmatter, we simply map it to line 0 in target + for line_num, hierarchy in frontmatter_sections.items(): + target_affected[line_num] = hierarchy + thread_safe_print(f" [{thread_id}] āœ… Mapped {len(frontmatter_sections)} frontmatter section") + + # Process top-level titles with special matching + if toplevel_sections: + thread_safe_print(f" [{thread_id}] šŸ” Top-level title matching for {len(toplevel_sections)} sections...") + from section_matcher import find_toplevel_title_matches + toplevel_matched, toplevel_failed, toplevel_skipped = find_toplevel_title_matches(toplevel_sections, target_lines) + + if toplevel_matched: + target_affected.update(toplevel_matched) + thread_safe_print(f" [{thread_id}] āœ… Top-level matched {len(toplevel_matched)} sections") + + if toplevel_failed: + thread_safe_print(f" [{thread_id}] āš ļø {len(toplevel_failed)} top-level sections failed matching") + for failed in toplevel_failed: + 
thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") + + # Process system variables/config sections with direct matching + if system_var_sections: + thread_safe_print(f" [{thread_id}] šŸŽÆ Direct matching {len(system_var_sections)} system variable/config sections...") + from section_matcher import find_direct_matches_for_special_files + direct_matched, failed_matches, skipped_sections = find_direct_matches_for_special_files(system_var_sections, target_hierarchy, target_lines) + + if direct_matched: + target_affected.update(direct_matched) + thread_safe_print(f" [{thread_id}] āœ… Direct matched {len(direct_matched)} system variable/config sections") + + if failed_matches: + thread_safe_print(f" [{thread_id}] āš ļø {len(failed_matches)} system variable/config sections failed direct matching") + for failed in failed_matches: + thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") + + # Process regular sections with AI mapping using filtered target hierarchy + if regular_sections: + thread_safe_print(f" [{thread_id}] šŸ¤– AI mapping {len(regular_sections)} regular sections...") + + # Filter target hierarchy to only include non-system sections for AI mapping + from section_matcher import filter_non_system_sections + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + # Check if filtered target hierarchy exceeds the maximum allowed for AI mapping + MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120 + if len(filtered_target_hierarchy) > MAX_NON_SYSTEM_SECTIONS_FOR_AI: + thread_safe_print(f" [{thread_id}] āŒ Too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI})") + thread_safe_print(f" [{thread_id}] āš ļø Skipping AI mapping for regular sections to avoid complexity") + + # If no system sections were matched either, return error + if not target_affected: + error_message = f"File {file_path} has too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI}) and no system variable sections were matched" + return False, error_message + + # Continue with only system variable matches if available + thread_safe_print(f" [{thread_id}] āœ… Proceeding with {len(target_affected)} system variable/config sections only") + else: + # Proceed with AI mapping using filtered hierarchy + source_list = list(regular_sections.values()) + target_list = list(filtered_target_hierarchy.values()) + + from section_matcher import get_corresponding_sections + ai_response = get_corresponding_sections(source_list, target_list, ai_client, repo_config['source_language'], repo_config['target_language'], max_tokens=20000) + if ai_response: + # Parse AI response and find matching line numbers in the original (unfiltered) hierarchy + from section_matcher import parse_ai_response, find_matching_line_numbers + ai_sections = parse_ai_response(ai_response) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) # Use original hierarchy for line number lookup + + if ai_matched: + target_affected.update(ai_matched) + thread_safe_print(f" [{thread_id}] āœ… AI mapped {len(ai_matched)} regular sections") + else: + thread_safe_print(f" [{thread_id}] āš ļø AI mapping failed for regular sections") + else: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI response for regular sections") + + # Summary of mapping results + thread_safe_print(f" [{thread_id}] šŸ“Š Total mapped: {len(target_affected)} out of {len(actual_sections)} sections") + + if not target_affected: + thread_safe_print(f" 
[{thread_id}] āš ļø Could not map sections") + return False, f"Could not map sections for {file_path}" + + thread_safe_print(f" [{thread_id}] āœ… Mapped {len(target_affected)} sections") + + # Extract target sections (get-target-affected-sections.py logic) + thread_safe_print(f" [{thread_id}] šŸ“ Extracting target sections...") + from pr_analyzer import extract_affected_sections + target_sections = extract_affected_sections(target_affected, target_lines) + + # Extract source old content from the enhanced data structure + thread_safe_print(f" [{thread_id}] šŸ“– Extracting source old content...") + source_old_content_dict = {} + + # Handle different data structures for source_sections + if isinstance(source_sections, dict) and 'sections' in source_sections: + # New format: complete data structure with enhanced matching info + for key, section_info in source_sections.items(): + if isinstance(section_info, dict) and 'source_old_content' in section_info: + source_old_content_dict[key] = section_info['source_old_content'] + else: + # Fallback: if we don't have the enhanced structure, we need to get it differently + thread_safe_print(f" [{thread_id}] āš ļø Source sections missing enhanced structure, using fallback") + # For now, create empty dict to avoid errors - this should be addressed in the calling code + source_old_content_dict = {} + + # Update sections with AI (get-updated-target-sections.py logic) + thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") + updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) + if not updated_sections: + thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") + return False, f"Could not get AI update for {file_path}" + + # Update local document (update-target-doc-v2.py logic) + thread_safe_print(f" [{thread_id}] šŸ’¾ Updating local document...") + success = update_local_document(file_path, updated_sections, target_affected, repo_config['target_local_path']) + + if success: + thread_safe_print(f" [{thread_id}] šŸŽ‰ Successfully updated {file_path}") + return True, f"Successfully updated {file_path}" + else: + thread_safe_print(f" [{thread_id}] āŒ Failed to update {file_path}") + return False, f"Failed to update {file_path}" + + except Exception as e: + thread_safe_print(f" [{thread_id}] āŒ Error processing {file_path}: {e}") + return False, f"Error processing {file_path}: {e}" + +def process_added_sections(added_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Process added sections by translating and inserting them""" + if not added_sections: + thread_safe_print("\nāž• No added sections to process") + return + + thread_safe_print(f"\nāž• Processing added sections from {len(added_sections)} files...") + + # Import needed functions + from section_matcher import map_insertion_points_to_target + from pr_analyzer import get_target_hierarchy_and_content + + for file_path, section_data in added_sections.items(): + thread_safe_print(f"\nāž• Processing added sections in {file_path}") + + source_sections = section_data['sections'] + insertion_points = section_data['insertion_points'] + + # Get target file hierarchy and content + target_hierarchy, target_lines = get_target_hierarchy_and_content( + file_path, github_client, repo_config['target_repo'] + ) + + if not target_hierarchy: + thread_safe_print(f" āŒ Could not get target hierarchy for 
{file_path}") + continue + + # Map insertion points to target language + target_insertion_points = map_insertion_points_to_target( + insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections + ) + + if not target_insertion_points: + thread_safe_print(f" āŒ No insertion points mapped for {file_path}") + continue + + # Use AI to translate/update new sections (similar to modified sections) + # Since we're now using source_old_content, we need to extract it from the added sections + source_old_content_dict = {} + for key, content in source_sections.items(): + # For added sections, source_old_content is typically None or empty + # We use the new content (from the source file) as the content to translate + source_old_content_dict[key] = content if content is not None else "" + + # Get target sections (empty for new sections, but we need the structure) + target_sections = {} # New sections don't have existing target content + + # Use the same AI function to translate the new sections + translated_sections = get_updated_sections_from_ai( + pr_diff, + target_sections, + source_old_content_dict, + ai_client, + repo_config['source_language'], + repo_config['target_language'], + file_path + ) + + if translated_sections: + # Insert translated sections into document + insert_sections_into_document(file_path, translated_sections, target_insertion_points, repo_config['target_local_path']) + thread_safe_print(f" āœ… Successfully inserted {len(translated_sections)} sections in {file_path}") + else: + thread_safe_print(f" āš ļø No sections were translated for {file_path}") + +def process_files_in_batches(source_changes, pr_diff, pr_url, github_client, ai_client, repo_config, operation_type="modified", batch_size=5, max_non_system_sections=120): + """Process files in parallel batches""" + # Handle different data formats + if isinstance(source_changes, dict): + files = [] + for path, data in source_changes.items(): + if isinstance(data, dict): + if 'type' in data and data['type'] == 'toc': + # TOC file with special operations + files.append((path, data)) + elif 'sections' in data: + # New format: extract sections for processing + files.append((path, data['sections'])) + else: + # Old format: direct dict + files.append((path, data)) + else: + # Old format: direct dict + files.append((path, data)) + else: + files = list(source_changes.items()) + + total_files = len(files) + + if total_files == 0: + return [] + + thread_safe_print(f"\nšŸ”„ Processing {total_files} files in batches of {batch_size}") + + results = [] + + # Process files in batches + for i in range(0, total_files, batch_size): + batch = files[i:i + batch_size] + batch_num = (i // batch_size) + 1 + total_batches = (total_files + batch_size - 1) // batch_size + + thread_safe_print(f"\nšŸ“¦ Batch {batch_num}/{total_batches}: Processing {len(batch)} files") + + # Process current batch in parallel + with ThreadPoolExecutor(max_workers=len(batch), thread_name_prefix=f"Batch{batch_num}") as executor: + # Submit all files in current batch + future_to_file = {} + for file_path, source_sections in batch: + future = executor.submit( + process_single_file, + file_path, + source_sections, + pr_diff, + pr_url, + github_client, + ai_client, + repo_config, + max_non_system_sections + ) + future_to_file[future] = file_path + + # Collect results as they complete + from concurrent.futures import as_completed + batch_results = [] + for future in as_completed(future_to_file): + file_path = 
future_to_file[future] + try: + success, message = future.result() + batch_results.append((file_path, success, message)) + except Exception as e: + batch_results.append((file_path, False, f"Exception in thread: {e}")) + + results.extend(batch_results) + + # Brief pause between batches to avoid overwhelming the APIs + if i + batch_size < total_files: + thread_safe_print(f" āøļø Waiting 2 seconds before next batch...") + import time + time.sleep(2) + + return results + +def update_target_document_from_match_data(match_file_path, target_local_path, target_file_name=None): + """ + Update target document using data from match_source_diff_to_target.json + This integrates the logic from test_target_update.py + + Args: + match_file_path: Path to the match_source_diff_to_target.json file + target_local_path: Local path to the target repository + target_file_name: Optional target file name (if not provided, will be extracted from match_file_path) + """ + import json + import os + from pathlib import Path + + # Load match data + if not os.path.exists(match_file_path): + thread_safe_print(f"āŒ {match_file_path} file does not exist") + return False + + with open(match_file_path, 'r', encoding='utf-8') as f: + match_data = json.load(f) + + thread_safe_print(f"āœ… Loaded {len(match_data)} section matching data from {match_file_path}") + thread_safe_print(f" Reading translation results directly from target_new_content field") + + if not match_data: + thread_safe_print("āŒ No matching data found") + return False + + # Sort sections by target_line from large to small (modify from back to front) + sections_with_line = [] + + for key, section_data in match_data.items(): + operation = section_data.get('source_operation', '') + target_new_content = section_data.get('target_new_content') + + # For deleted sections, target_new_content should be null + if operation == 'deleted': + if target_new_content is not None: + thread_safe_print(f" āš ļø Deleted section {key} has non-null target_new_content, should be fixed") + thread_safe_print(f" šŸ—‘ļø Including deleted section: {key}") + elif not target_new_content: + thread_safe_print(f" āš ļø Skipping section without target_new_content: {key}") + continue + + target_line = section_data.get('target_line') + if target_line and target_line != 'unknown': + try: + # Handle special case for bottom sections + if target_line == "-1": + line_num = -1 # Special marker for bottom sections + else: + line_num = int(target_line) + sections_with_line.append((key, section_data, line_num)) + except ValueError: + thread_safe_print(f"āš ļø Skipping invalid target_line: {target_line} for {key}") + + # Separate sections into different processing groups + bottom_modified_sections = [] # Process first: modify existing content at document end + regular_sections = [] # Process second: normal operations from back to front + bottom_added_sections = [] # Process last: append new content to document end + + for key, section_data, line_num in sections_with_line: + target_hierarchy = section_data.get('target_hierarchy', '') + + if target_hierarchy.startswith('bottom-modified-'): + bottom_modified_sections.append((key, section_data, line_num)) + elif target_hierarchy.startswith('bottom-added-'): + bottom_added_sections.append((key, section_data, line_num)) + else: + regular_sections.append((key, section_data, line_num)) + + # Sort each group appropriately + def get_source_line_num(item): + key, section_data, line_num = item + if '_' in key and key.split('_')[1].isdigit(): + return 
int(key.split('_')[1]) + return 0 + + # Bottom modified: sort by source line number (large to small) + bottom_modified_sections.sort(key=lambda x: -get_source_line_num(x)) + + # Regular sections: sort by target_line (large to small), then by source line number + regular_sections.sort(key=lambda x: (-x[2], -get_source_line_num(x))) + + # Bottom added: sort by source line number (small to large) for proper document order + bottom_added_sections.sort(key=lambda x: get_source_line_num(x)) + + # Combine all sections in processing order + all_sections = bottom_modified_sections + regular_sections + bottom_added_sections + + thread_safe_print(f"\nšŸ“Š Processing order: bottom-modified -> regular -> bottom-added") + thread_safe_print(f" šŸ“‹ Bottom modified sections: {len(bottom_modified_sections)}") + thread_safe_print(f" šŸ“‹ Regular sections: {len(regular_sections)}") + thread_safe_print(f" šŸ“‹ Bottom added sections: {len(bottom_added_sections)}") + + if not all_sections: + thread_safe_print("āŒ No valid sections found for update") + return False + + thread_safe_print(f"\nšŸ“Š Detailed processing order:") + for i, (key, section_data, line_num) in enumerate(all_sections, 1): + operation = section_data.get('source_operation', '') + hierarchy = section_data.get('target_hierarchy', '') + insertion_type = section_data.get('insertion_type', '') + + # Extract source line number for display + source_line_num = int(key.split('_')[1]) if '_' in key and key.split('_')[1].isdigit() else 'N/A' + + # Display target_line with special handling for bottom sections + target_display = "END" if line_num == -1 else str(line_num) + + # Determine section group + if hierarchy.startswith('bottom-modified-'): + group = "BotMod" + elif hierarchy.startswith('bottom-added-'): + group = "BotAdd" + else: + group = "Regular" + + if operation == 'deleted': + action = "delete" + elif insertion_type == "before_reference": + action = "insert" + elif line_num == -1: + action = "append" + else: + action = "replace" + + thread_safe_print(f" {i:2}. 
[{group:7}] Target:{target_display:>3} Src:{source_line_num:3} | {key:15} ({operation:8}) | {action:7} | {hierarchy}") + + # Determine target file name + if target_file_name is None: + # Extract target file name from match file path + # e.g., "tikv-configuration-file-match_source_diff_to_target.json" -> "tikv-configuration-file.md" + match_filename = os.path.basename(match_file_path) + if match_filename.endswith('-match_source_diff_to_target.json'): + extracted_name = match_filename[:-len('-match_source_diff_to_target.json')] + '.md' + target_file_name = extracted_name + thread_safe_print(f" šŸ“‚ Extracted target file name from match file: {target_file_name}") + else: + # Fallback: try to determine from source hierarchy + first_entry = next(iter(match_data.values())) + source_hierarchy = first_entry.get('source_original_hierarchy', '') + + if 'TiFlash' in source_hierarchy or 'tiflash' in source_hierarchy.lower(): + target_file_name = "tiflash/tiflash-configuration.md" + else: + # Default to command-line flags for other cases + target_file_name = "command-line-flags-for-tidb-configuration.md" + thread_safe_print(f" šŸ“‚ Determined target file name from hierarchy: {target_file_name}") + else: + thread_safe_print(f" šŸ“‚ Using provided target file name: {target_file_name}") + + target_file_path = os.path.join(target_local_path, target_file_name) + thread_safe_print(f"\nšŸ“„ Target file path: {target_file_path}") + + # Update target document + thread_safe_print(f"\nšŸš€ Starting target document update, will modify {len(all_sections)} sections...") + success = update_target_document_sections(all_sections, target_file_path) + + return success + +def update_target_document_sections(all_sections, target_file_path): + """ + Update target document sections - integrated from test_target_update.py + """ + thread_safe_print(f"\nšŸš€ Starting target document update: {target_file_path}") + + # Read target document + if not os.path.exists(target_file_path): + thread_safe_print(f"āŒ Target file does not exist: {target_file_path}") + return False + + with open(target_file_path, 'r', encoding='utf-8') as f: + target_lines = f.readlines() + + thread_safe_print(f"šŸ“„ Target document total lines: {len(target_lines)}") + + # Process modifications in order (bottom-modified -> regular -> bottom-added) + for i, (key, section_data, target_line_num) in enumerate(all_sections, 1): + operation = section_data.get('source_operation', '') + insertion_type = section_data.get('insertion_type', '') + target_hierarchy = section_data.get('target_hierarchy', '') + target_new_content = section_data.get('target_new_content') + + thread_safe_print(f"\nšŸ“ {i}/{len(all_sections)} Processing {key} (Line {target_line_num})") + thread_safe_print(f" Operation type: {operation}") + thread_safe_print(f" Target section: {target_hierarchy}") + + if operation == 'deleted': + # Delete logic: remove the specified section + if target_line_num == -1: + thread_safe_print(f" āŒ Invalid delete operation for bottom section") + continue + + thread_safe_print(f" šŸ—‘ļø Delete mode: removing section starting at line {target_line_num}") + + # Find section end position + start_line = target_line_num - 1 # Convert to 0-based index + + if start_line >= len(target_lines): + thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") + continue + + # Find section end position + end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) + + thread_safe_print(f" šŸ“ Delete range: line {start_line 
+ 1} to {end_line}") + thread_safe_print(f" šŸ“„ Delete content: {target_lines[start_line].strip()[:50]}...") + + # Delete content + deleted_lines = target_lines[start_line:end_line] + target_lines[start_line:end_line] = [] + + thread_safe_print(f" āœ… Deleted {len(deleted_lines)} lines of content") + + elif target_new_content is None: + thread_safe_print(f" āš ļø Skipping: target_new_content is null") + continue + + elif not target_new_content: + thread_safe_print(f" āš ļø Skipping: target_new_content is empty") + continue + + else: + # Handle content format + thread_safe_print(f" šŸ“„ Content preview: {repr(target_new_content[:80])}...") + + if target_hierarchy.startswith('bottom-'): + # Bottom section special handling + if target_hierarchy.startswith('bottom-modified-'): + # Bottom modified: find and replace existing content at document end + thread_safe_print(f" šŸ”„ Bottom modified section: replacing existing content at document end") + + # Get the old content to search for + source_operation_data = section_data.get('source_operation_data', {}) + old_content = source_operation_data.get('old_content', '').strip() + + if old_content: + # Search backwards from end to find the matching section + found_line = None + for idx in range(len(target_lines) - 1, -1, -1): + line_content = target_lines[idx].strip() + if line_content == old_content: + found_line = idx + thread_safe_print(f" šŸ“ Found target section at line {found_line + 1}: {line_content[:50]}...") + break + + if found_line is not None: + # Find section end + end_line = find_section_end_for_update(target_lines, found_line, target_hierarchy) + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Replace content + target_lines[found_line:end_line] = new_lines + + thread_safe_print(f" āœ… Replaced {end_line - found_line} lines with {len(new_lines)} lines") + else: + thread_safe_print(f" āš ļø Could not find target section, appending to end instead") + # Fallback: append to end + if not target_new_content.endswith('\n'): + target_new_content += '\n' + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + new_lines = target_new_content.splitlines(keepends=True) + target_lines.extend(new_lines) + thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") + else: + thread_safe_print(f" āš ļø No old_content found, appending to end instead") + # Fallback: append to end + if not target_new_content.endswith('\n'): + target_new_content += '\n' + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + new_lines = target_new_content.splitlines(keepends=True) + target_lines.extend(new_lines) + thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") + + elif target_hierarchy.startswith('bottom-added-'): + # Bottom added: append new content to end of document + thread_safe_print(f" šŸ”š Bottom added section: appending new content to end") + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Add spacing before new section if needed + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Append to end of document + target_lines.extend(new_lines) + + thread_safe_print(f" āœ… 
Appended {len(new_lines)} lines to end of document") + else: + # Other bottom sections: append to end + thread_safe_print(f" šŸ”š Other bottom section: appending to end of document") + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Add spacing before new section if needed + if target_lines and target_lines[-1].strip(): + target_new_content = '\n' + target_new_content + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Append to end of document + target_lines.extend(new_lines) + + thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") + + elif target_hierarchy == "frontmatter": + # Frontmatter special handling: directly replace front lines + thread_safe_print(f" šŸ“„ Frontmatter mode: directly replacing document beginning") + + # Find the first top-level heading position + first_header_line = 0 + for i, line in enumerate(target_lines): + if line.strip().startswith('# '): + first_header_line = i + break + + thread_safe_print(f" šŸ“ Frontmatter range: line 1 to {first_header_line}") + + # Split new content by lines, preserving original structure including trailing empty lines + new_lines = target_new_content.splitlines(keepends=True) + + # If the original content ends with \n, it means there should be an empty line after the last content line + # splitlines() doesn't create this empty line, so we need to add it manually + if target_new_content.endswith('\n'): + new_lines.append('\n') + elif target_new_content: + # If content doesn't end with newline, ensure the last line has one + if not new_lines[-1].endswith('\n'): + new_lines[-1] += '\n' + + # Replace frontmatter + target_lines[0:first_header_line] = new_lines + + thread_safe_print(f" āœ… Replaced {first_header_line} lines of frontmatter with {len(new_lines)} lines") + + elif insertion_type == "before_reference": + # Insert logic: insert before specified line + if target_line_num == -1: + thread_safe_print(f" āŒ Invalid insert operation for bottom section") + continue + + thread_safe_print(f" šŸ“ Insert mode: inserting before line {target_line_num}") + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Ensure spacing between sections + if not target_new_content.endswith('\n\n'): + target_new_content += '\n' + + # Split content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Insert at specified position + insert_position = target_line_num - 1 # Convert to 0-based index + if insert_position < 0: + insert_position = 0 + elif insert_position > len(target_lines): + insert_position = len(target_lines) + + # Execute insertion + for j, line in enumerate(new_lines): + target_lines.insert(insert_position + j, line) + + thread_safe_print(f" āœ… Inserted {len(new_lines)} lines of content") + + else: + # Replace logic: find target section and replace + if target_line_num == -1: + thread_safe_print(f" āŒ Invalid replace operation for bottom section") + continue + + thread_safe_print(f" šŸ”„ Replace mode: replacing section starting at line {target_line_num}") + + # Ensure content format is correct + if not target_new_content.endswith('\n'): + target_new_content += '\n' + + # Ensure spacing between sections + if not target_new_content.endswith('\n\n'): + target_new_content += '\n' + + # Find section end position + start_line = target_line_num - 1 # Convert to 0-based index + + if start_line >= len(target_lines): + 
thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") + continue + + # Find section end position + end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) + + thread_safe_print(f" šŸ“ Replace range: line {start_line + 1} to {end_line}") + + # Split new content by lines + new_lines = target_new_content.splitlines(keepends=True) + + # Replace content + target_lines[start_line:end_line] = new_lines + + thread_safe_print(f" āœ… Replaced {end_line - start_line} lines with {len(new_lines)} lines") + + + with open(target_file_path, 'w', encoding='utf-8') as f: + f.writelines(target_lines) + + thread_safe_print(f"\nāœ… Target document update completed!") + thread_safe_print(f"šŸ“„ Updated file: {target_file_path}") + + return True + +def find_section_end_for_update(lines, start_line, target_hierarchy): + """Find section end position - based on test_target_update.py logic""" + current_line = lines[start_line].strip() + + if target_hierarchy == "frontmatter": + # Frontmatter special handling: from --- to second ---, then to first top-level heading + if start_line == 0 and current_line.startswith('---'): + # Find second --- + for i in range(start_line + 1, len(lines)): + if lines[i].strip() == '---': + # Found frontmatter end, but need to include up to next content start + # Look for first non-empty line or first heading + for j in range(i + 1, len(lines)): + line = lines[j].strip() + if line and line.startswith('# '): + thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before first top-level heading)") + return j + elif line and not line.startswith('#'): + # If there's other content, end there + thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before other content)") + return j + # If no other content found, end after second --- + thread_safe_print(f" šŸ“ Frontmatter ends at line {i+1} (after second ---)") + return i + 1 + # If not standard frontmatter format, find first top-level heading + for i in range(start_line + 1, len(lines)): + if lines[i].strip().startswith('# '): + thread_safe_print(f" šŸ“ Frontmatter ends at line {i} (before first top-level heading)") + return i + # If no top-level heading found, process entire file + return len(lines) + + if current_line.startswith('#'): + # Use file_updater.py method to calculate heading level + current_level = len(current_line.split()[0]) if current_line.split() else 0 + thread_safe_print(f" šŸ” Current heading level: {current_level} (heading: {current_line[:50]}...)") + + # Special handling for top-level headings: only process until first second-level heading + if current_level == 1: + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('##'): # Find first second-level heading + thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before first second-level heading)") + return i + # If no second-level heading found, look for next top-level heading + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#') and not line.startswith('##'): + thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before next top-level heading)") + return i + else: + # For other level headings, stop at ANY header to get only direct content + # This prevents including sub-sections in the update range + for i in range(start_line + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + # Stop at ANY header to get only direct content + thread_safe_print(f" šŸ“ Found header at line {i}: 
{line[:30]}... (stopping for direct content only)") + return i + + # If not found, return file end + thread_safe_print(f" šŸ“ No end position found, using file end") + return len(lines) + + # Non-heading line, only replace current line + return start_line + 1 diff --git a/scripts/translate_doc_pr/main_workflow.py b/scripts/translate_doc_pr/main_workflow.py new file mode 100644 index 0000000000000..12260334ec206 --- /dev/null +++ b/scripts/translate_doc_pr/main_workflow.py @@ -0,0 +1,691 @@ +""" +Main Entry Point for GitHub Workflow +Orchestrates the entire auto-sync workflow in GitHub Actions environment +""" + +import sys +import os +import json +import threading +import tiktoken +from github import Github, Auth + +# Conditional import for Gemini +try: + from google import genai + GEMINI_AVAILABLE = True +except ImportError: + GEMINI_AVAILABLE = False + +# Import all modules +from pr_analyzer import analyze_source_changes, get_repo_config, get_target_hierarchy_and_content, parse_pr_url +from file_adder import process_added_files +from file_deleter import process_deleted_files +from file_updater import process_files_in_batches, process_added_sections, process_modified_sections, process_deleted_sections +from toc_processor import process_toc_files +from section_matcher import match_source_diff_to_target + +# Configuration from environment variables +SOURCE_PR_URL = os.getenv("SOURCE_PR_URL") +TARGET_PR_URL = os.getenv("TARGET_PR_URL") +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +AI_PROVIDER = os.getenv("AI_PROVIDER", "deepseek") +TARGET_REPO_PATH = os.getenv("TARGET_REPO_PATH") + +# AI configuration +DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_TOKEN") +DEEPSEEK_BASE_URL = "https://api.deepseek.com" +GEMINI_API_KEY = os.getenv("GEMINI_API_TOKEN") +GEMINI_MODEL_NAME = "gemini-2.0-flash" + +# Processing limit configuration +MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120 +SOURCE_TOKEN_LIMIT = 5000 # Maximum tokens for source new_content before skipping file processing + +# AI configuration +AI_MAX_TOKENS = 20000 # Maximum tokens for AI translation requests + +# Special file configuration +SPECIAL_FILES = ["TOC.md"] +IGNORE_FILES = ["faq/ddl-faq.md","command-line-flags-for-tidb-configuration.md","pd-configuration-file.md"] + +# Repository configuration for workflow +def get_workflow_repo_configs(): + """Get repository configuration based on environment variables""" + if not SOURCE_PR_URL or not TARGET_PR_URL: + raise ValueError("SOURCE_PR_URL and TARGET_PR_URL must be set") + + # Parse source and target repo info + source_parts = SOURCE_PR_URL.split('/') + target_parts = TARGET_PR_URL.split('/') + + source_owner, source_repo = source_parts[-4], source_parts[-3] + target_owner, target_repo = target_parts[-4], target_parts[-3] + + source_repo_key = f"{source_owner}/{source_repo}" + target_repo_key = f"{target_owner}/{target_repo}" + + # Determine language direction based on repo names + if source_repo.endswith('-cn') and not target_repo.endswith('-cn'): + # Chinese to English + source_language = "Chinese" + target_language = "English" + elif not source_repo.endswith('-cn') and target_repo.endswith('-cn'): + # English to Chinese + source_language = "English" + target_language = "Chinese" + else: + # Default fallback + source_language = "English" + target_language = "Chinese" + + return { + source_repo_key: { + "target_repo": target_repo_key, + "target_local_path": TARGET_REPO_PATH, + "source_language": source_language, + "target_language": target_language + } + } + +# Thread-safe printing function +print_lock 
= threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def ensure_temp_output_dir(): + """Ensure the temp_output directory exists""" + # Get the directory of the current script + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + return temp_dir + +def clean_temp_output_dir(): + """Clean the temp_output directory at the start of execution""" + import shutil + # Get the directory of the current script + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + if os.path.exists(temp_dir): + if os.path.isdir(temp_dir): + shutil.rmtree(temp_dir) + print(f"🧹 Cleaned existing temp_output directory") + else: + # Remove file if it exists + os.remove(temp_dir) + print(f"🧹 Removed existing temp_output file") + os.makedirs(temp_dir, exist_ok=True) + print(f"šŸ“ Created temp_output directory: {temp_dir}") + return temp_dir + +def estimate_tokens(text): + """Calculate accurate token count using tiktoken (GPT-4/3.5 encoding)""" + if not text: + return 0 + try: + enc = tiktoken.get_encoding("cl100k_base") # GPT-4/3.5 encoding + tokens = enc.encode(text) + return len(tokens) + except Exception as e: + # Fallback to character approximation if tiktoken fails + thread_safe_print(f" āš ļø Tiktoken encoding failed: {e}, using character approximation") + return len(text) // 4 + +def print_token_estimation(prompt_text, context="AI translation"): + """Print accurate token consumption for a request""" + actual_tokens = estimate_tokens(prompt_text) + char_count = len(prompt_text) + thread_safe_print(f" šŸ’° {context}") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + return actual_tokens + +class UnifiedAIClient: + """Unified interface for different AI providers""" + + def __init__(self, provider="deepseek"): + self.provider = provider + if provider == "deepseek": + from openai import OpenAI + self.client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL) + self.model = "deepseek-chat" + elif provider == "gemini": + if not GEMINI_AVAILABLE: + raise ImportError("google.generativeai package not installed. 
Run: pip install google-generativeai") + if not GEMINI_API_KEY: + raise ValueError("GEMINI_API_TOKEN environment variable must be set") + self.client = genai.Client(api_key=GEMINI_API_KEY) + self.model = GEMINI_MODEL_NAME + else: + raise ValueError(f"Unsupported AI provider: {provider}") + + def chat_completion(self, messages, temperature=0.1, max_tokens=20000): + """Unified chat completion interface""" + if self.provider == "deepseek": + response = self.client.chat.completions.create( + model=self.model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens + ) + return response.choices[0].message.content.strip() + elif self.provider == "gemini": + try: + # Convert OpenAI-style messages to Gemini format + prompt = self._convert_messages_to_prompt(messages) + thread_safe_print(f" šŸ”„ Calling Gemini API...") + + # Use the correct Gemini API call format (based on your reference file) + response = self.client.models.generate_content( + model=self.model, + contents=prompt + ) + + if response and response.text: + thread_safe_print(f" āœ… Gemini response received") + return response.text.strip() + else: + thread_safe_print(f" āš ļø Gemini response was empty or blocked") + return "No response from Gemini" + + except Exception as e: + thread_safe_print(f" āŒ Gemini API error: {str(e)}") + # Fallback: suggest switching to DeepSeek + thread_safe_print(f" šŸ’” Consider switching to DeepSeek in main.py: AI_PROVIDER = 'deepseek'") + raise e + + def _convert_messages_to_prompt(self, messages): + """Convert OpenAI-style messages to a single prompt for Gemini""" + prompt_parts = [] + for message in messages: + role = message.get("role", "user") + content = message.get("content", "") + if role == "user": + prompt_parts.append(content) + elif role == "system": + prompt_parts.append(f"System: {content}") + return "\n\n".join(prompt_parts) + +def check_source_token_limit(source_diff_dict_file, token_limit=SOURCE_TOKEN_LIMIT): + """Check if the total tokens of all new_content in source-diff-dict exceeds the limit""" + try: + with open(source_diff_dict_file, 'r', encoding='utf-8') as f: + source_diff_dict = json.load(f) + + total_new_content = "" + section_count = 0 + + for key, section_data in source_diff_dict.items(): + if isinstance(section_data, dict): + new_content = section_data.get('new_content', '') + if new_content: + total_new_content += new_content + "\n" + section_count += 1 + + if not total_new_content.strip(): + thread_safe_print(f" āš ļø No new_content found in {source_diff_dict_file}") + return True, 0, 0 # Allow processing if no content to check + + total_tokens = estimate_tokens(total_new_content) + char_count = len(total_new_content) + + thread_safe_print(f" šŸ“Š Source token limit check:") + thread_safe_print(f" šŸ“ Total new_content: {char_count:,} characters from {section_count} sections") + thread_safe_print(f" šŸ”¢ Total tokens: {total_tokens:,}") + thread_safe_print(f" 🚧 Token limit: {token_limit:,}") + + if total_tokens > token_limit: + thread_safe_print(f" āŒ Token limit exceeded! 
({total_tokens:,} > {token_limit:,})") + return False, total_tokens, token_limit + else: + thread_safe_print(f" āœ… Within token limit ({total_tokens:,} ≤ {token_limit:,})") + return True, total_tokens, token_limit + + except Exception as e: + thread_safe_print(f" āŒ Error checking token limit for {source_diff_dict_file}: {e}") + return True, 0, 0 # Allow processing on error to avoid blocking + +def get_pr_diff(pr_url, github_client): + """Get the diff content from a GitHub PR (from auto-sync-pr-changes.py)""" + try: + from pr_analyzer import parse_pr_url + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get files and their patches + files = pr.get_files() + diff_content = [] + + for file in files: + if file.filename.endswith('.md') and file.patch: + diff_content.append(f"File: {file.filename}") + diff_content.append(file.patch) + diff_content.append("-" * 80) + + return "\n".join(diff_content) + + except Exception as e: + thread_safe_print(f" āŒ Error getting PR diff: {e}") + return None + +def filter_diff_by_operation_type(pr_diff, operation_type, target_sections=None): + """Filter PR diff to only include changes relevant to specific operation type""" + + if not pr_diff: + return "" + + if operation_type == "modified": + # For modified sections, we want the full diff but focus on changed content + return pr_diff + elif operation_type == "added": + # For added sections, we want to show what was added + filtered_lines = [] + for line in pr_diff.split('\n'): + if line.startswith('+') and not line.startswith('+++'): + filtered_lines.append(line) + elif line.startswith('@@') or line.startswith('File:'): + filtered_lines.append(line) + return '\n'.join(filtered_lines) + elif operation_type == "deleted": + # For deleted sections, we want to show what was removed + filtered_lines = [] + for line in pr_diff.split('\n'): + if line.startswith('-') and not line.startswith('---'): + filtered_lines.append(line) + elif line.startswith('@@') or line.startswith('File:'): + filtered_lines.append(line) + return '\n'.join(filtered_lines) + + return pr_diff + +def filter_diff_for_target_file(pr_diff, target_file, source_diff_dict): + """Extract file-specific diff from the complete PR diff based on source files that map to the target file""" + if not pr_diff or not source_diff_dict: + return pr_diff + + # Extract source files that contribute to this target file + source_files = set() + for key, section_data in source_diff_dict.items(): + if isinstance(section_data, dict): + source_file = section_data.get('source_file', '') + if source_file: + source_files.add(source_file) + + if not source_files: + print(f" āš ļø No source files found in source_diff_dict, using complete PR diff") + return pr_diff + + print(f" šŸ“„ Source files contributing to {target_file}: {list(source_files)}") + + # Filter PR diff to only include changes from these source files + filtered_lines = [] + current_file = None + include_section = False + + for line in pr_diff.split('\n'): + if line.startswith('File: '): + current_file = line.replace('File: ', '').strip() + include_section = current_file in source_files + if include_section: + filtered_lines.append(line) + elif line.startswith('-' * 80): + if include_section: + filtered_lines.append(line) + elif include_section: + filtered_lines.append(line) + + file_specific_diff = '\n'.join(filtered_lines) + print(f" šŸ“Š Filtered diff: {len(file_specific_diff)} chars (from {len(pr_diff)} 
chars)") + + return file_specific_diff if file_specific_diff.strip() else pr_diff + +def extract_file_diff_from_pr(pr_diff, source_file_path): + """Extract diff content for a specific source file from the complete PR diff""" + if not pr_diff: + return "" + + filtered_lines = [] + current_file = None + include_section = False + + for line in pr_diff.split('\n'): + if line.startswith('File: '): + current_file = line.replace('File: ', '').strip() + include_section = (current_file == source_file_path) + if include_section: + filtered_lines.append(line) + elif line.startswith('-' * 80): + if include_section: + filtered_lines.append(line) + include_section = False # End of this file's section + elif include_section: + filtered_lines.append(line) + + return '\n'.join(filtered_lines) + +def determine_file_processing_type(source_file_path, file_sections, special_files=None): + """Determine how to process a file based on operation type and file characteristics""" + + # Check if this is a special file (like TOC.md) + if special_files and os.path.basename(source_file_path) in special_files: + return "special_file_toc" + + # For all other modified files, use regular processing + return "regular_modified" + +def process_regular_modified_file(source_file_path, file_sections, file_diff, pr_url, github_client, ai_client, repo_config, max_sections): + """Process a regular markdown file that has been modified""" + try: + print(f" šŸ“ Processing as regular modified file: {source_file_path}") + + # Extract the actual sections from the file_sections structure + # file_sections contains: {'sections': {...}, 'original_hierarchy': {...}, 'current_hierarchy': {...}} + if isinstance(file_sections, dict) and 'sections' in file_sections: + actual_sections = file_sections['sections'] + else: + # Fallback: assume file_sections is already the sections dict + actual_sections = file_sections + + print(f" šŸ“Š Extracted sections: {len(actual_sections)} sections") + + # CRITICAL: Load the source-diff-dict.json and perform matching + import json + import os + from section_matcher import match_source_diff_to_target + from pr_analyzer import get_target_hierarchy_and_content + + # Load source-diff-dict.json with file prefix + temp_dir = ensure_temp_output_dir() + file_prefix = source_file_path.replace('/', '-').replace('.md', '') + source_diff_dict_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") + if os.path.exists(source_diff_dict_file): + with open(source_diff_dict_file, 'r', encoding='utf-8') as f: + source_diff_dict = json.load(f) + print(f" šŸ“‚ Loaded source diff dict with {len(source_diff_dict)} sections from {source_diff_dict_file}") + + # Check source token limit before proceeding with processing + print(f" šŸ” Checking source token limit...") + within_limit, total_tokens, token_limit = check_source_token_limit(source_diff_dict_file) + if not within_limit: + print(f" 🚫 Skipping file processing: source content exceeds token limit") + print(f" šŸ“Š Total tokens: {total_tokens:,} > Limit: {token_limit:,}") + print(f" ā­ļø File {source_file_path} will not be processed") + return False + + else: + print(f" āŒ {source_diff_dict_file} not found") + return False + + # Get target file hierarchy and content + target_repo = repo_config['target_repo'] + target_hierarchy, target_lines = get_target_hierarchy_and_content(source_file_path, github_client, target_repo) + + if not target_hierarchy or not target_lines: + print(f" āŒ Could not get target file content for {source_file_path}") + return False + + 
print(f" šŸ“– Target file: {len(target_hierarchy)} sections, {len(target_lines)} lines") + + # Perform source diff to target matching + print(f" šŸ”— Matching source diff to target...") + enhanced_sections = match_source_diff_to_target( + source_diff_dict, + target_hierarchy, + target_lines, + ai_client, + repo_config, + max_sections, + AI_MAX_TOKENS + ) + + if not enhanced_sections: + print(f" āŒ No sections matched") + return False + + print(f" āœ… Matched {len(enhanced_sections)} sections") + + # Save the match result for reference + match_file = os.path.join(temp_dir, f"{source_file_path.replace('/', '-').replace('.md', '')}-match_source_diff_to_target.json") + with open(match_file, 'w', encoding='utf-8') as f: + json.dump(enhanced_sections, f, ensure_ascii=False, indent=2) + print(f" šŸ’¾ Saved match result to: {match_file}") + + # Step 2: Get AI translation for the matched sections + print(f" šŸ¤– Getting AI translation for matched sections...") + + # Create file data structure with enhanced matching info + # Wrap enhanced_sections in the expected format for process_single_file + file_data = { + source_file_path: { + 'type': 'enhanced_sections', + 'sections': enhanced_sections + } + } + + # Call the existing process_modified_sections function to get AI translation + results = process_modified_sections(file_data, file_diff, pr_url, github_client, ai_client, repo_config, max_sections) + + # Step 3: Update match_source_diff_to_target.json with AI results + if results and len(results) > 0: + file_path, success, ai_updated_sections = results[0] # Get first result + if success and isinstance(ai_updated_sections, dict): + print(f" šŸ“ Step 3: Updating {match_file} with AI results...") + + # Load current match_source_diff_to_target.json + with open(match_file, 'r', encoding='utf-8') as f: + match_data = json.load(f) + + # Add target_new_content field to each section based on AI results + updated_count = 0 + for key, section_data in match_data.items(): + operation = section_data.get('source_operation', '') + + if operation == 'deleted': + # For deleted sections, set target_new_content to null + section_data['target_new_content'] = None + elif key in ai_updated_sections: + # For modified/added sections with AI translation + section_data['target_new_content'] = ai_updated_sections[key] + updated_count += 1 + else: + # For sections not translated, keep original content + section_data['target_new_content'] = section_data.get('target_content', '') + + # Save updated match_source_diff_to_target.json + with open(match_file, 'w', encoding='utf-8') as f: + json.dump(match_data, f, ensure_ascii=False, indent=2) + + print(f" āœ… Updated {updated_count} sections with AI translations in {match_file}") + + # Step 4: Apply updates to target document using update_target_document_from_match_data + print(f" šŸ“ Step 4: Applying updates to target document...") + from file_updater import update_target_document_from_match_data + + success = update_target_document_from_match_data(match_file, repo_config['target_local_path'], source_file_path) + if success: + print(f" šŸŽ‰ Target document successfully updated!") + return True + else: + print(f" āŒ Failed to update target document") + return False + + else: + print(f" āš ļø AI translation failed or returned invalid results") + return False + else: + print(f" āš ļø No results from process_modified_sections") + return False + + except Exception as e: + print(f" āŒ Error processing regular modified file {source_file_path}: {e}") + return False + + +def 
get_workflow_repo_config(pr_url, repo_configs): + """Get repository configuration for workflow environment""" + from pr_analyzer import parse_pr_url + + owner, repo, pr_number = parse_pr_url(pr_url) + source_repo = f"{owner}/{repo}" + + if source_repo not in repo_configs: + raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}") + + config = repo_configs[source_repo].copy() + config['source_repo'] = source_repo + config['pr_number'] = pr_number + + return config + +def main(): + """Main function - orchestrates the entire workflow for GitHub Actions""" + + # Validate environment variables + if not all([SOURCE_PR_URL, TARGET_PR_URL, GITHUB_TOKEN, TARGET_REPO_PATH]): + print("āŒ Missing required environment variables:") + print(f" SOURCE_PR_URL: {SOURCE_PR_URL}") + print(f" TARGET_PR_URL: {TARGET_PR_URL}") + print(f" GITHUB_TOKEN: {'Set' if GITHUB_TOKEN else 'Not set'}") + print(f" TARGET_REPO_PATH: {TARGET_REPO_PATH}") + return + + print(f"šŸ”§ Auto PR Sync Tool (GitHub Workflow Version)") + print(f"šŸ“ Source PR URL: {SOURCE_PR_URL}") + print(f"šŸ“ Target PR URL: {TARGET_PR_URL}") + print(f"šŸ¤– AI Provider: {AI_PROVIDER}") + print(f"šŸ“ Target Repo Path: {TARGET_REPO_PATH}") + + # Clean and prepare temp_output directory + clean_temp_output_dir() + + # Get repository configuration using workflow config + try: + repo_configs = get_workflow_repo_configs() + repo_config = get_workflow_repo_config(SOURCE_PR_URL, repo_configs) + print(f"šŸ“ Source Repo: {repo_config['source_repo']} ({repo_config['source_language']})") + print(f"šŸ“ Target Repo: {repo_config['target_repo']} ({repo_config['target_language']})") + print(f"šŸ“ Target Path: {repo_config['target_local_path']}") + except ValueError as e: + print(f"āŒ {e}") + return + + # Initialize clients + auth = Auth.Token(GITHUB_TOKEN) + github_client = Github(auth=auth) + + # Initialize unified AI client + try: + ai_client = UnifiedAIClient(provider=AI_PROVIDER) + thread_safe_print(f"šŸ¤– AI Provider: {AI_PROVIDER.upper()} ({ai_client.model})") + except Exception as e: + thread_safe_print(f"āŒ Failed to initialize AI client: {e}") + return + + print(f"\nšŸš€ Starting auto-sync for PR: {SOURCE_PR_URL}") + + # Step 1: Get PR diff + print(f"\nšŸ“‹ Step 1: Getting PR diff...") + pr_diff = get_pr_diff(SOURCE_PR_URL, github_client) + if not pr_diff: + print("āŒ Could not get PR diff") + return + print(f"āœ… Got PR diff: {len(pr_diff)} characters") + + # Step 2: Analyze source changes with operation categorization + print(f"\nšŸ“Š Step 2: Analyzing source changes...") + added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files = analyze_source_changes( + SOURCE_PR_URL, github_client, + special_files=SPECIAL_FILES, + ignore_files=IGNORE_FILES, + repo_configs=repo_configs, + max_non_system_sections=MAX_NON_SYSTEM_SECTIONS_FOR_AI, + pr_diff=pr_diff # Pass the PR diff to avoid re-fetching + ) + + # Step 3: Process different types of files based on operation type + print(f"\nšŸ“‹ Step 3: Processing files based on operation type...") + + # Import necessary functions + from file_updater import process_modified_sections, update_target_document_from_match_data + from toc_processor import process_toc_files + + # Step 3.1: Process deleted files (file-level deletions) + if deleted_files: + print(f"\nšŸ—‘ļø Step 3.1: Processing {len(deleted_files)} deleted files...") + process_deleted_files(deleted_files, github_client, repo_config) + print(f" āœ… Deleted files processed") + 
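# --- Illustrative sketch (annotation, not part of this patch) ---
# Step 3.4 below relies on extract_file_diff_from_pr() to slice the combined PR diff
# (built by get_pr_diff() with "File: <name>" markers and 80-dash separators) into a
# per-file diff before each modified file is processed. A minimal standalone version
# of that slicing, exercised with a tiny synthetic diff and hypothetical file names,
# could look like this:

def sketch_slice_per_file(pr_diff: str, source_file_path: str) -> str:
    """Keep only the diff lines belonging to source_file_path, mirroring the
    logic of extract_file_diff_from_pr() above."""
    kept, include = [], False
    for line in pr_diff.split('\n'):
        if line.startswith('File: '):
            # A "File:" marker opens a new per-file block.
            include = (line[len('File: '):].strip() == source_file_path)
            if include:
                kept.append(line)
        elif line.startswith('-' * 80):
            # The 80-dash separator closes the current file block.
            if include:
                kept.append(line)
            include = False
        elif include:
            kept.append(line)
    return '\n'.join(kept)

# Example with a synthetic two-file diff (file names are hypothetical):
#   demo = "\n".join(["File: overview.md", "@@ -1 +1 @@", "-old", "+new", "-" * 80,
#                     "File: faq.md", "@@ -2 +2 @@", "-foo", "+bar", "-" * 80])
#   sketch_slice_per_file(demo, "faq.md")  # keeps only the faq.md block
# --- end of sketch ---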
+ # Step 3.2: Process added files (file-level additions) + if added_files: + print(f"\nšŸ“„ Step 3.2: Processing {len(added_files)} added files...") + process_added_files(added_files, SOURCE_PR_URL, github_client, ai_client, repo_config) + print(f" āœ… Added files processed") + + # Step 3.3: Process special files (TOC.md and similar) + if toc_files: + print(f"\nšŸ“‹ Step 3.3: Processing {len(toc_files)} special files (TOC)...") + process_toc_files(toc_files, SOURCE_PR_URL, github_client, ai_client, repo_config) + print(f" āœ… Special files processed") + + # Step 3.4: Process modified files (section-level modifications) + if modified_sections: + print(f"\nšŸ“ Step 3.4: Processing {len(modified_sections)} modified files...") + + # Process each modified file separately + for source_file_path, file_sections in modified_sections.items(): + print(f"\nšŸ“„ Processing modified file: {source_file_path}") + + # Extract file-specific diff from the complete PR diff + print(f" šŸ” Extracting file-specific diff for: {source_file_path}") + file_specific_diff = extract_file_diff_from_pr(pr_diff, source_file_path) + + if not file_specific_diff: + print(f" āš ļø No diff found for {source_file_path}, skipping...") + continue + + print(f" šŸ“Š File-specific diff: {len(file_specific_diff)} chars") + + # Determine file processing approach for modified files + file_type = determine_file_processing_type(source_file_path, file_sections, SPECIAL_FILES) + print(f" šŸ” File processing type: {file_type}") + + if file_type == "special_file_toc": + # Special files should have been processed in Step 3.3, skip here + print(f" ā­ļø Special file already processed in Step 3.3, skipping...") + continue + + elif file_type == "regular_modified": + # Regular markdown files with modifications + success = process_regular_modified_file( + source_file_path, + file_sections, + file_specific_diff, + SOURCE_PR_URL, + github_client, + ai_client, + repo_config, + MAX_NON_SYSTEM_SECTIONS_FOR_AI + ) + + if success: + print(f" āœ… Successfully processed {source_file_path}") + else: + print(f" āŒ Failed to process {source_file_path}") + + else: + print(f" āš ļø Unknown file processing type: {file_type} for {source_file_path}, skipping...") + + # Final summary + print(f"šŸ“Š Summary:") + print(f" šŸ“„ Added files: {len(added_files)} processed") + print(f" šŸ—‘ļø Deleted files: {len(deleted_files)} processed") + print(f" šŸ“‹ TOC files: {len(toc_files)} processed") + print(f" šŸ“ Modified files: {len(modified_sections)} processed") + print(f"šŸŽ‰ Workflow completed successfully!") + +if __name__ == "__main__": + main() diff --git a/scripts/translate_doc_pr/pr_analyzer.py b/scripts/translate_doc_pr/pr_analyzer.py new file mode 100644 index 0000000000000..c164da1520163 --- /dev/null +++ b/scripts/translate_doc_pr/pr_analyzer.py @@ -0,0 +1,1447 @@ +#!/usr/bin/env python3 +""" +PR Analyzer Module +Handles PR analysis, diff parsing, content getting, hierarchy building, and section getting +""" + +import json +import os +import re +import threading +from github import Github + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + """Thread-safe print function""" + with print_lock: + print(*args, **kwargs) + + +def parse_pr_url(pr_url): + """Parse PR URL to get repo info""" + parts = pr_url.split('/') + return parts[-4], parts[-3], int(parts[-1]) # owner, repo, pr_number + +def get_repo_config(pr_url, repo_configs): + """Get repository configuration based on source repo""" + owner, repo, 
pr_number = parse_pr_url(pr_url) + source_repo = f"{owner}/{repo}" + + if source_repo not in repo_configs: + raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}") + + config = repo_configs[source_repo].copy() + config['source_repo'] = source_repo + config['pr_number'] = pr_number + + return config + +def get_pr_diff(pr_url, github_client): + """Get the diff content from a GitHub PR""" + try: + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get files and their patches + files = pr.get_files() + diff_content = [] + + for file in files: + if file.filename.endswith('.md') and file.patch: + diff_content.append(f"File: {file.filename}") + diff_content.append(file.patch) + diff_content.append("-" * 80) + + return "\n".join(diff_content) + + except Exception as e: + print(f" āŒ Error getting PR diff: {e}") + return None + +def get_changed_line_ranges(file): + """Get the ranges of lines that were changed in the PR""" + changed_ranges = [] + patch = file.patch + if not patch: + return changed_ranges + + lines = patch.split('\n') + current_line = 0 + + for line in lines: + if line.startswith('@@'): + # Parse the hunk header to get line numbers + match = re.search(r'\+(\d+),?(\d+)?', line) + if match: + current_line = int(match.group(1)) + elif line.startswith('+') and not line.startswith('+++'): + # This is an added line + changed_ranges.append(current_line) + current_line += 1 + elif line.startswith('-') and not line.startswith('---'): + # This is a deleted line, also consider as changed + changed_ranges.append(current_line) + # Don't increment current_line for deleted lines + continue + elif line.startswith(' '): + # Context line + current_line += 1 + + return changed_ranges + +def analyze_diff_operations(file): + """Analyze diff to categorize operations as added, modified, or deleted (improved GitHub-like approach)""" + operations = { + 'added_lines': [], # Lines that were added + 'deleted_lines': [], # Lines that were deleted + 'modified_lines': [] # Lines that were modified (both added and deleted content) + } + + patch = file.patch + if not patch: + return operations + + lines = patch.split('\n') + current_line = 0 + deleted_line = 0 + + # Parse diff and keep track of sequence order for better modification detection + diff_sequence = [] # Track the order of operations in diff + + for i, line in enumerate(lines): + if line.startswith('@@'): + # Parse the hunk header to get line numbers + # Format: @@ -old_start,old_count +new_start,new_count @@ + match = re.search(r'-(\d+),?(\d+)?\s+\+(\d+),?(\d+)?', line) + if match: + deleted_line = int(match.group(1)) + current_line = int(match.group(3)) + elif line.startswith('+') and not line.startswith('+++'): + # This is an added line + added_entry = { + 'line_number': current_line, + 'content': line[1:], # Remove the '+' prefix + 'is_header': line[1:].strip().startswith('#'), + 'diff_index': i # Track position in diff + } + operations['added_lines'].append(added_entry) + diff_sequence.append(('added', added_entry)) + current_line += 1 + elif line.startswith('-') and not line.startswith('---'): + # This is a deleted line + deleted_entry = { + 'line_number': deleted_line, + 'content': line[1:], # Remove the '-' prefix + 'is_header': line[1:].strip().startswith('#'), + 'diff_index': i # Track position in diff + } + operations['deleted_lines'].append(deleted_entry) + diff_sequence.append(('deleted', 
deleted_entry)) + deleted_line += 1 + elif line.startswith(' '): + # Context line (unchanged) + current_line += 1 + deleted_line += 1 + + # GitHub-like modification detection: based on diff sequence proximity + modified_pairs = [] + deleted_headers = [d for d in operations['deleted_lines'] if d['is_header']] + added_headers = [a for a in operations['added_lines'] if a['is_header']] + + used_added_indices = set() + used_deleted_indices = set() + + # Helper function for semantic similarity + def are_headers_similar(old, new): + # Remove markdown markers + old_clean = old.replace('#', '').replace('`', '').strip() + new_clean = new.replace('#', '').replace('`', '').strip() + + # Check if one is a substring/extension of the other + if old_clean in new_clean or new_clean in old_clean: + return True + + # Check for similar patterns (like appending -pu, -new, etc.) + old_base = old_clean.split('-')[0] + new_base = new_clean.split('-')[0] + if old_base and new_base and old_base == new_base: + return True + + return False + + # GitHub-like approach: Look for adjacent or close operations in diff sequence + for i, deleted_header in enumerate(deleted_headers): + if i in used_deleted_indices: + continue + + for j, added_header in enumerate(added_headers): + if j in used_added_indices: + continue + + deleted_content = deleted_header['content'].strip() + added_content = added_header['content'].strip() + + # Check if they are close in the diff sequence (GitHub's approach) + diff_distance = abs(added_header['diff_index'] - deleted_header['diff_index']) + is_close_in_diff = diff_distance <= 5 # Allow small gap for context lines + + # Check semantic similarity + is_similar = are_headers_similar(deleted_content, added_content) + + # GitHub-like logic: prioritize diff proximity + semantic similarity + if is_close_in_diff and is_similar: + modified_pairs.append({ + 'deleted': deleted_header, + 'added': added_header, + 'original_content': deleted_header['content'] + }) + used_added_indices.add(j) + used_deleted_indices.add(i) + break + # Fallback: strong semantic similarity even if not adjacent + elif is_similar and abs(added_header['line_number'] - deleted_header['line_number']) <= 20: + modified_pairs.append({ + 'deleted': deleted_header, + 'added': added_header, + 'original_content': deleted_header['content'] + }) + used_added_indices.add(j) + used_deleted_indices.add(i) + break + + # Remove identified modifications from pure additions/deletions + for pair in modified_pairs: + if pair['deleted'] in operations['deleted_lines']: + operations['deleted_lines'].remove(pair['deleted']) + if pair['added'] in operations['added_lines']: + operations['added_lines'].remove(pair['added']) + # Store both new and original content for modified headers + modified_entry = pair['added'].copy() + modified_entry['original_content'] = pair['original_content'] + operations['modified_lines'].append(modified_entry) + + return operations + +def build_hierarchy_dict(file_content): + """Build hierarchy dictionary from file content, excluding content inside code blocks""" + lines = file_content.split('\n') + level_stack = [] + all_hierarchy_dict = {} + + # Track code block state + in_code_block = False + code_block_delimiter = None # Track the type of code block (``` or ```) + + # Build complete hierarchy for all headers + for line_num, line in enumerate(lines, 1): + original_line = line + line = line.strip() + + # Check for code block delimiters + if line.startswith('```') or line.startswith('~~~'): + if not in_code_block: + # Entering 
a code block + in_code_block = True + code_block_delimiter = line[:3] # Store the delimiter type + continue + elif line.startswith(code_block_delimiter): + # Exiting a code block + in_code_block = False + code_block_delimiter = None + continue + + # Skip processing if we're inside a code block + if in_code_block: + continue + + # Process headers only if not in code block + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + + # Remove items from stack that are at same or deeper level + while level_stack and level_stack[-1][0] >= level: + level_stack.pop() + + # Build hierarchy with special handling for top-level titles + if level == 1: + # Top-level titles are included directly without hierarchy path + hierarchy_line = line + elif level_stack: + # For other levels, build path but skip the top-level title (level 1) + path_parts = [item[1] for item in level_stack if item[0] > 1] # Skip level 1 items + path_parts.append(line) + hierarchy_line = " > ".join(path_parts) + else: + # Fallback for other cases + hierarchy_line = line + + if hierarchy_line: # Only add non-empty hierarchies + all_hierarchy_dict[line_num] = hierarchy_line + + level_stack.append((level, line)) + + return all_hierarchy_dict + +def build_hierarchy_path(lines, line_num, all_headers): + """Build the full hierarchy path for a header at given line""" + if line_num not in all_headers: + return [] + + current_header = all_headers[line_num] + current_level = current_header['level'] + hierarchy_path = [] + + # Find all parent headers + for check_line in sorted(all_headers.keys()): + if check_line >= line_num: + break + + header = all_headers[check_line] + if header['level'] < current_level: + # This is a potential parent + # Remove any headers at same or deeper level + while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']: + hierarchy_path.pop() + hierarchy_path.append(header) + + # Add current header + hierarchy_path.append(current_header) + + return hierarchy_path + +def build_hierarchy_for_modified_section(file_content, target_line_num, original_line, base_hierarchy_dict): + """Build hierarchy path for a modified section using original content""" + lines = file_content.split('\n') + + # Get the level of the original header + original_match = re.match(r'^(#{1,10})\s+(.+)', original_line) + if not original_match: + return None + + original_level = len(original_match.group(1)) + original_title = original_match.group(2).strip() + + # Find parent sections by looking backwards from target line + level_stack = [] + + for line_num in range(1, target_line_num): + if line_num in base_hierarchy_dict: + # This is a header line + line_content = lines[line_num - 1].strip() + if line_content.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line_content) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + + # Remove items from stack that are at same or deeper level + while level_stack and level_stack[-1][0] >= level: + level_stack.pop() + + # Add this header to stack if it's a potential parent + if level < original_level: + level_stack.append((level, line_content)) + + # Build hierarchy path using original content + if level_stack: + path_parts = [item[1] for item in level_stack[1:]] # Skip first level + path_parts.append(original_line) + hierarchy_line = " > ".join(path_parts) + else: + hierarchy_line = original_line if original_level > 1 else "" + + return hierarchy_line if hierarchy_line else 
None + +def find_section_boundaries(lines, hierarchy_dict): + """Find the start and end line for each section based on hierarchy""" + section_boundaries = {} + + # Sort sections by line number + sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) + + for i, (line_num, hierarchy) in enumerate(sorted_sections): + start_line = int(line_num) - 1 # Convert to 0-based index + + # Find end line (start of next section at same or higher level) + end_line = len(lines) # Default to end of document + + if start_line >= len(lines): + continue + + # Get current section level + current_line = lines[start_line].strip() + if not current_line.startswith('#'): + continue + + current_level = len(current_line.split()[0]) # Count # characters + + # Look for next section at same or higher level + for j in range(start_line + 1, len(lines)): + line = lines[j].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + end_line = j + break + + section_boundaries[line_num] = { + 'start': start_line, + 'end': end_line, + 'hierarchy': hierarchy, + 'level': current_level + } + + return section_boundaries + +def extract_section_content(lines, start_line, hierarchy_dict): + """Extract the content of a section starting from start_line (includes sub-sections)""" + if not lines or start_line < 1 or start_line > len(lines): + return "" + + start_index = start_line - 1 # Convert to 0-based index + section_content = [] + + # Find the header at start_line + current_line = lines[start_index].strip() + if not current_line.startswith('#'): + return "" + + # Get the level of current header + current_level = len(current_line.split()[0]) # Count # characters + section_content.append(current_line) + + # Special handling for top-level titles (level 1) + if current_level == 1: + # For top-level titles, only extract content until the first next-level header (##) + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + + if line.startswith('#'): + # Check if this is a header of next level (##, ###, etc.) 
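+                # For example, "## Overview" splits to the token "##", so line_level == 2,
+                # and "### Step 1" gives line_level == 3 (the example headings are illustrative only).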
+ line_level = len(line.split()[0]) if line.split() else 0 + if line_level > current_level: + # Found first subsection, stop here for top-level titles + break + elif line_level <= current_level: + # Found same or higher level header, also stop + break + + section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace + else: + # For non-top-level titles, use the original logic + # Extract content until we hit the next header of same or higher level + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + + if line.startswith('#'): + # Check if this is a header of same or higher level + line_level = len(line.split()[0]) if line.split() else 0 + if line_level <= current_level: + # Found a header of same or higher level, stop here regardless + # Each section should be extracted individually + break + + section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace + + return '\n'.join(section_content) + +def extract_section_direct_content(lines, start_line): + """Extract ONLY the direct content of a section (excluding sub-sections) - for source diff dict""" + if not lines or start_line < 1 or start_line > len(lines): + return "" + + start_index = start_line - 1 # Convert to 0-based index + section_content = [] + + # Find the header at start_line + current_line = lines[start_index].strip() + if not current_line.startswith('#'): + return "" + + # Add the header line + section_content.append(current_line) + + # Only extract until the first header (any level) + # This means we stop at ANY header - whether it's a sub-section OR same/higher level + for i in range(start_index + 1, len(lines)): + line = lines[i].strip() + if line.startswith('#'): + # Stop at ANY header to get only direct content + break + section_content.append(lines[i].rstrip()) + + return '\n'.join(section_content) + +def extract_frontmatter_content(file_lines): + """Extract content from the beginning of file to the first top-level header""" + if not file_lines: + return "" + + frontmatter_lines = [] + for i, line in enumerate(file_lines): + line_stripped = line.strip() + # Stop when we hit the first top-level header + if line_stripped.startswith('# '): + break + frontmatter_lines.append(line.rstrip()) + + return '\n'.join(frontmatter_lines) + + +def extract_affected_sections(hierarchy_dict, file_lines): + """Extract all affected sections based on hierarchy dict""" + affected_sections = {} + + for line_num, hierarchy in hierarchy_dict.items(): + if line_num == "0" and hierarchy == "frontmatter": + # Special handling for frontmatter + frontmatter_content = extract_frontmatter_content(file_lines) + if frontmatter_content: + affected_sections[line_num] = frontmatter_content + else: + line_number = int(line_num) + section_content = extract_section_content(file_lines, line_number, hierarchy_dict) + + if section_content: + affected_sections[line_num] = section_content + + return affected_sections + +def find_containing_section(line_num, all_headers): + """Find which section a line belongs to""" + current_section = None + for header_line_num in sorted(all_headers.keys()): + if header_line_num <= line_num: + current_section = header_line_num + else: + break + return current_section + +def find_affected_sections(lines, changed_lines, all_headers): + """Find which sections are affected by the changes""" + affected_sections = set() + + for changed_line in changed_lines: + # Find the section this changed line belongs to + current_section = None + + # Find the most recent 
header before or at the changed line + for line_num in sorted(all_headers.keys()): + if line_num <= changed_line: + current_section = line_num + else: + break + + if current_section: + # Only add the directly affected section (the one that directly contains the change) + affected_sections.add(current_section) + + return affected_sections + +def find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict=None): + """Find sections affected by different types of operations""" + sections = { + 'added': set(), + 'modified': set(), + 'deleted': set() + } + + # Process added lines + for added_line in operations['added_lines']: + line_num = added_line['line_number'] + if added_line['is_header']: + # This is a new header - only mark the section as added if the header itself is new + sections['added'].add(line_num) + # Note: We don't mark sections as "added" just because they contain new non-header content + # That would be a "modified" section, not an "added" section + + # Process modified lines + for modified_line in operations['modified_lines']: + line_num = modified_line['line_number'] + if modified_line['is_header']: + sections['modified'].add(line_num) + else: + section = find_containing_section(line_num, all_headers) + if section: + sections['modified'].add(section) + + # Process deleted lines - use base hierarchy to find deleted sections + for deleted_line in operations['deleted_lines']: + if deleted_line['is_header']: + # Find this header in the base file hierarchy (before deletion) + deleted_title = clean_title_for_matching(deleted_line['content']) + # Use base hierarchy if available, otherwise fall back to current headers + search_hierarchy = base_hierarchy_dict if base_hierarchy_dict else all_headers + + found_deleted = False + for line_num, hierarchy_line in search_hierarchy.items(): + # Extract title from hierarchy line + if ' > ' in hierarchy_line: + original_title = clean_title_for_matching(hierarchy_line.split(' > ')[-1]) + else: + original_title = clean_title_for_matching(hierarchy_line) + + if deleted_title == original_title: + sections['deleted'].add(line_num) + print(f" šŸ—‘ļø Detected deleted section: {deleted_line['content']} (line {line_num})") + found_deleted = True + break + + if not found_deleted: + # If not found by exact match, try partial matching for renamed sections + print(f" āš ļø Could not find deleted section: {deleted_line['content']}") + + return sections + + +def get_target_hierarchy_and_content(file_path, github_client, target_repo): + """Get target hierarchy and content""" + try: + repository = github_client.get_repo(target_repo) + file_content = repository.get_contents(file_path, ref="master").decoded_content.decode('utf-8') + lines = file_content.split('\n') + + # Build hierarchy using same method + hierarchy = build_hierarchy_dict(file_content) + + return hierarchy, lines + except Exception as e: + print(f" āŒ Error getting target file: {e}") + return {}, [] + +def get_source_sections_content(pr_url, file_path, source_affected, github_client): + """Get the content of source sections for better context""" + try: + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get the source file content + file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8') + lines = file_content.split('\n') + + # Extract source sections + source_sections = {} + + for line_num, hierarchy in source_affected.items(): + if 
line_num == "0" and hierarchy == "frontmatter":
+                # Special handling for frontmatter
+                frontmatter_content = extract_frontmatter_content(lines)
+                if frontmatter_content:
+                    source_sections[line_num] = frontmatter_content
+            else:
+                line_number = int(line_num)
+                section_content = extract_section_content(lines, line_number, source_affected)
+                if section_content:
+                    source_sections[line_num] = section_content
+
+        return source_sections
+    except Exception as e:
+        thread_safe_print(f" āš ļø Could not get source sections: {e}")
+        return {}
+
+def get_source_file_hierarchy(file_path, pr_url, github_client, get_base_version=False):
+    """Get source file hierarchy from PR head or base"""
+    try:
+        owner, repo, pr_number = parse_pr_url(pr_url)
+        repository = github_client.get_repo(f"{owner}/{repo}")
+        pr = repository.get_pull(pr_number)
+
+        if get_base_version:
+            # Get the source file content before PR changes (base version)
+            source_file_content = repository.get_contents(file_path, ref=pr.base.sha).decoded_content.decode('utf-8')
+        else:
+            # Get the source file content after PR changes (head version)
+            source_file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8')
+
+        source_hierarchy = build_hierarchy_dict(source_file_content)
+
+        return source_hierarchy
+
+    except Exception as e:
+        thread_safe_print(f" āŒ Error getting source file hierarchy: {e}")
+        return {}
+
+# Helper function needed for find_sections_by_operation_type
+def clean_title_for_matching(title):
+    """Clean title for matching by removing markdown formatting and span elements"""
+    if not title:
+        return ""
+
+    # Remove span elements like <span class="version-mark">New in v5.0</span>
+    title = re.sub(r'<span[^>]*>.*?</span>', '', title)
+
+    # Remove markdown header prefix (# ## ### etc.)
+    title = re.sub(r'^#{1,6}\s*', '', title.strip())
+
+    # Remove backticks
+    title = title.replace('`', '')
+
+    # Strip whitespace
+    title = title.strip()
+
+    return title
+
+def find_previous_section_for_added(added_sections, hierarchy_dict):
+    """Find the previous section hierarchy for each added section group"""
+    insertion_points = {}
+
+    if not added_sections:
+        return insertion_points
+
+    # Group consecutive added sections
+    added_list = sorted(list(added_sections))
+    groups = []
+    current_group = [added_list[0]]
+
+    for i in range(1, len(added_list)):
+        if added_list[i] - added_list[i-1] <= 10:  # Consider sections within 10 lines as consecutive
+            current_group.append(added_list[i])
+        else:
+            groups.append(current_group)
+            current_group = [added_list[i]]
+    groups.append(current_group)
+
+    # For each group, find the previous section hierarchy
+    for group in groups:
+        first_new_section = min(group)
+
+        # Find the section that comes before this group
+        previous_section_line = None
+        previous_section_hierarchy = None
+
+        for line_num_str in sorted(hierarchy_dict.keys(), key=int):
+            line_num = int(line_num_str)
+            if line_num < first_new_section:
+                previous_section_line = line_num
+                previous_section_hierarchy = hierarchy_dict[line_num_str]
+            else:
+                break
+
+        if previous_section_hierarchy:
+            insertion_points[f"group_{groups.index(group)}"] = {
+                'previous_section_hierarchy': previous_section_hierarchy,
+                'previous_section_line': previous_section_line,
+                'new_sections': group,
+                'insertion_type': 'multiple' if len(group) > 1 else 'single'
+            }
+            print(f" šŸ“ Added section group: {len(group)} sections after '{previous_section_hierarchy}'")
+        else:
+            print(f" āš ļø Could not find previous section for added sections starting at line {first_new_section}")
+
+    return
insertion_points + +def build_source_diff_dict(modified_sections, added_sections, deleted_sections, all_hierarchy_dict, base_hierarchy_dict, operations, file_content, base_file_content): + """Build source diff dictionary with correct structure for matching""" + from section_matcher import clean_title_for_matching + source_diff_dict = {} + + # Helper function to extract section content (only direct content, no sub-sections) + def extract_section_content_for_diff(line_num, hierarchy_dict): + if str(line_num) == "0": + # Handle frontmatter + return extract_frontmatter_content(file_content.split('\n')) + else: + return extract_section_direct_content(file_content.split('\n'), line_num) + + # Helper function to extract old content from base file (only direct content, no sub-sections) + def extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content): + if str(line_num) == "0": + # Handle frontmatter from base file + return extract_frontmatter_content(base_file_content.split('\n')) + else: + return extract_section_direct_content(base_file_content.split('\n'), line_num) + + # Helper function to extract old content by hierarchy (for modified sections that may have moved) + def extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content): + """Extract old content by finding the section with matching hierarchy in base file (only direct content)""" + if original_hierarchy == "frontmatter": + return extract_frontmatter_content(base_file_content.split('\n')) + + # Find the line number in base file that matches the original hierarchy + for base_line_num_str, base_hierarchy in base_hierarchy_dict.items(): + if base_hierarchy == original_hierarchy: + base_line_num = int(base_line_num_str) if base_line_num_str != "0" else 0 + if base_line_num == 0: + return extract_frontmatter_content(base_file_content.split('\n')) + else: + return extract_section_direct_content(base_file_content.split('\n'), base_line_num) + + # If exact match not found, return empty string + print(f" āš ļø Could not find matching hierarchy in base file: {original_hierarchy}") + return "" + + # Helper function to build complete hierarchy for a section using base file info + def build_complete_original_hierarchy(line_num, current_hierarchy, base_hierarchy_dict, operations): + """Build complete hierarchy path for original section""" + line_num_str = str(line_num) + + # Special cases: frontmatter and top-level titles + if line_num_str == "0": + return "frontmatter" + + # Check if this line was modified and has original content + for modified_line in operations.get('modified_lines', []): + if (modified_line.get('is_header') and + modified_line.get('line_number') == line_num and + 'original_content' in modified_line): + original_line = modified_line['original_content'].strip() + + # For top-level titles, return the original content directly + if ' > ' not in current_hierarchy: + return original_line + + # For nested sections, build the complete hierarchy using original content + # Find the hierarchy path using base hierarchy dict and replace the leaf with original + if line_num_str in base_hierarchy_dict: + base_hierarchy = base_hierarchy_dict[line_num_str] + if ' > ' in base_hierarchy: + # Replace the leaf (last part) with original content + hierarchy_parts = base_hierarchy.split(' > ') + hierarchy_parts[-1] = original_line + return ' > '.join(hierarchy_parts) + else: + # Single level, return original content + return original_line + + # Fallback: return original content + return original_line 
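+        # Note: the branch above keeps the header's original (pre-change) text as the leaf of the
+        # hierarchy path, so matching against the target file uses the title that existed before
+        # this PR, e.g. "### Use TiUP" rather than a renamed "### Use TiUP CLI" (the titles here
+        # are hypothetical examples).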
+ + # If not modified, use base hierarchy if available + if line_num_str in base_hierarchy_dict: + return base_hierarchy_dict[line_num_str] + + # If not found in base (new section), use current hierarchy + return current_hierarchy + + # Process modified sections + for line_num_str, hierarchy in modified_sections.items(): + line_num = int(line_num_str) if line_num_str != "0" else 0 + + # Build complete original hierarchy + original_hierarchy = build_complete_original_hierarchy(line_num, hierarchy, base_hierarchy_dict, operations) + + # Extract both old and new content + new_content = extract_section_content_for_diff(line_num, all_hierarchy_dict) + # Use hierarchy-based lookup for old content instead of line number + old_content = extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content) + + # Only include if content actually changed + if new_content != old_content: + # Check if this is a bottom modified section (no next section in base file) + is_bottom_modified = False + if line_num_str in base_hierarchy_dict: + # Get all sections in base file sorted by line number + base_sections = sorted([(int(ln), hier) for ln, hier in base_hierarchy_dict.items() if ln != "0"]) + + # Check if there's any section after this line in base file + has_next_section = any(base_line > line_num for base_line, _ in base_sections) + + if not has_next_section: + is_bottom_modified = True + print(f" āœ… Bottom modified section detected at line {line_num_str}: no next section in base file") + + # Use special marker for bottom modified sections + if is_bottom_modified: + final_original_hierarchy = f"bottom-modified-{line_num}" + else: + final_original_hierarchy = original_hierarchy + + source_diff_dict[f"modified_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": final_original_hierarchy, + "operation": "modified", + "new_content": new_content, + "old_content": old_content + } + print(f" āœ… Real modification detected at line {line_num_str}: content changed") + else: + print(f" 🚫 Filtered out false positive at line {line_num_str}: content unchanged (likely line shift artifact)") + + # Process added sections - find next section from current document hierarchy + for line_num_str, hierarchy in added_sections.items(): + line_num = int(line_num_str) + + print(f" šŸ” Finding next section for added section at line {line_num}: {hierarchy}") + + # Strategy: Find the next section directly from the current document (post-PR) + # Get all current sections sorted by line number + current_sections = sorted([(int(ln), curr_hierarchy) for ln, curr_hierarchy in all_hierarchy_dict.items()]) + print(f" šŸ“‹ Current sections around line {line_num}: {[(ln, h.split(' > ')[-1] if ' > ' in h else h) for ln, h in current_sections if abs(ln - line_num) <= 15]}") + + next_section_original_hierarchy = None + + # Find the next section that comes after the added section in the current document + for curr_line_num, curr_hierarchy in current_sections: + if curr_line_num > line_num: + # Found the next section in current document + # Now find its original hierarchy in base document + curr_line_str = str(curr_line_num) + + # Get the original hierarchy for this next section + # Use the same logic as build_complete_original_hierarchy to get original content + if curr_line_str in base_hierarchy_dict: + # Check if this section was modified + was_modified = False + for modified_line in operations.get('modified_lines', []): + if (modified_line.get('is_header') and + modified_line.get('line_number') 
== curr_line_num and + 'original_content' in modified_line): + # This section was modified, use original content + original_line = modified_line['original_content'].strip() + base_hierarchy = base_hierarchy_dict[curr_line_str] + + if ' > ' in base_hierarchy: + # Replace the leaf with original content + hierarchy_parts = base_hierarchy.split(' > ') + hierarchy_parts[-1] = original_line + next_section_original_hierarchy = ' > '.join(hierarchy_parts) + else: + next_section_original_hierarchy = original_line + + print(f" āœ… Found next section (modified): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") + was_modified = True + break + + if not was_modified: + # Section was not modified, use base hierarchy directly + next_section_original_hierarchy = base_hierarchy_dict[curr_line_str] + print(f" āœ… Found next section (unchanged): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") + + break + else: + # This next section might also be new or modified + # Try to find it by content matching in base hierarchy + found_match = False + for base_line_str, base_hierarchy in base_hierarchy_dict.items(): + # Compare the leaf titles (last part of hierarchy) + curr_leaf = curr_hierarchy.split(' > ')[-1] if ' > ' in curr_hierarchy else curr_hierarchy + base_leaf = base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy + + # Clean titles for comparison + curr_clean = clean_title_for_matching(curr_leaf) + base_clean = clean_title_for_matching(base_leaf) + + if curr_clean == base_clean: + next_section_original_hierarchy = base_hierarchy + print(f" āœ… Found next section (by content): {base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy}") + found_match = True + break + + if found_match: + break + else: + print(f" āš ļø Next section at line {curr_line_num} not found in base, continuing search...") + + # If no next section found, this is being added at the end + if not next_section_original_hierarchy: + print(f" āœ… Bottom section detected: this section is added at the end of document") + # Use special marker for bottom added sections - no matching needed + next_section_original_hierarchy = f"bottom-added-{line_num}" + + source_diff_dict[f"added_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": next_section_original_hierarchy, + "operation": "added", + "new_content": extract_section_content_for_diff(line_num, all_hierarchy_dict), + "old_content": None # Added sections have no old content + } + + # Process deleted sections - use original hierarchy from base file + for line_num_str, hierarchy in deleted_sections.items(): + line_num = int(line_num_str) + # Use complete hierarchy from base file + original_hierarchy = base_hierarchy_dict.get(line_num_str, hierarchy) + + # Extract old content for deleted sections + old_content = extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content) + + source_diff_dict[f"deleted_{line_num_str}"] = { + "new_line_number": line_num, + "original_hierarchy": original_hierarchy, + "operation": "deleted", + "new_content": None, # No new content for deleted sections + "old_content": old_content # Show what was deleted + } + + # Sort the dictionary by new_line_number for better readability + sorted_items = sorted(source_diff_dict.items(), key=lambda x: x[1]['new_line_number']) + 
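+    # dict preserves insertion order in Python 3.7+, so rebuilding it from the sorted items below
+    # yields keys ordered by 'new_line_number' (e.g. "modified_12" before "added_40"), regardless
+    # of the order in which the operations were processed above.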
source_diff_dict = dict(sorted_items) + + return source_diff_dict + +def analyze_source_changes(pr_url, github_client, special_files=None, ignore_files=None, repo_configs=None, max_non_system_sections=120, pr_diff=None): + """Analyze source language changes and categorize them as added, modified, or deleted""" + # Import modules needed in this function + import os + import json + from toc_processor import process_toc_operations + + owner, repo, pr_number = parse_pr_url(pr_url) + repository = github_client.get_repo(f"{owner}/{repo}") + pr = repository.get_pull(pr_number) + + # Get repository configuration for target repo info + repo_config = get_repo_config(pr_url, repo_configs) + + print(f"šŸ“‹ Processing PR #{pr_number}: {pr.title}") + + # Get markdown files + files = pr.get_files() + markdown_files = [f for f in files if f.filename.endswith('.md')] + + print(f"šŸ“„ Found {len(markdown_files)} markdown files") + + # Return dictionaries for different operation types + added_sections = {} # New sections that were added + modified_sections = {} # Existing sections that were modified + deleted_sections = {} # Sections that were deleted + added_files = {} # Completely new files that were added + deleted_files = [] # Completely deleted files + ignored_files = [] # Files that were ignored + toc_files = {} # Special TOC files requiring special processing + + for file in markdown_files: + print(f"\nšŸ” Analyzing {file.filename}") + + # Check if this file should be ignored + if file.filename in ignore_files: + print(f" ā­ļø Skipping ignored file: {file.filename}") + ignored_files.append(file.filename) + continue + + # Check if this is a completely new file or deleted file + if file.status == 'added': + print(f" āž• Detected new file: {file.filename}") + try: + file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8') + added_files[file.filename] = file_content + print(f" āœ… Added complete file for translation") + continue + except Exception as e: + print(f" āŒ Error getting new file content: {e}") + continue + + elif file.status == 'removed': + print(f" šŸ—‘ļø Detected deleted file: {file.filename}") + deleted_files.append(file.filename) + print(f" āœ… Marked file for deletion") + continue + + # For modified files, check if it's a special file like TOC.md + try: + file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8') + except Exception as e: + print(f" āŒ Error getting content: {e}") + continue + + # Check if this is a TOC.md file requiring special processing + if os.path.basename(file.filename) in special_files: + print(f" šŸ“‹ Detected special file: {file.filename}") + + # Get target file content for comparison + try: + target_repository = github_client.get_repo(repo_config['target_repo']) + target_file_content = target_repository.get_contents(file.filename, ref="master").decoded_content.decode('utf-8') + target_lines = target_file_content.split('\n') + except Exception as e: + print(f" āš ļø Could not get target file content: {e}") + continue + + # Analyze diff operations for TOC.md + operations = analyze_diff_operations(file) + source_lines = file_content.split('\n') + + # Process with special TOC logic + toc_results = process_toc_operations(file.filename, operations, source_lines, target_lines, "") # Local path will be determined later + + # Store TOC operations for later processing + if any([toc_results['added'], toc_results['modified'], toc_results['deleted']]): + # Combine all operations 
for processing + all_toc_operations = [] + all_toc_operations.extend(toc_results['added']) + all_toc_operations.extend(toc_results['modified']) + all_toc_operations.extend(toc_results['deleted']) + + # Add to special TOC processing queue (separate from regular sections) + toc_files[file.filename] = { + 'type': 'toc', + 'operations': all_toc_operations + } + + print(f" šŸ“‹ TOC operations queued for processing:") + if toc_results['added']: + print(f" āž• Added: {len(toc_results['added'])} entries") + if toc_results['modified']: + print(f" āœļø Modified: {len(toc_results['modified'])} entries") + if toc_results['deleted']: + print(f" āŒ Deleted: {len(toc_results['deleted'])} entries") + else: + print(f" ā„¹ļø No TOC operations found") + + continue # Skip regular processing for TOC files + + # Analyze diff operations + operations = analyze_diff_operations(file) + print(f" šŸ“ Diff analysis: {len(operations['added_lines'])} added, {len(operations['modified_lines'])} modified, {len(operations['deleted_lines'])} deleted lines") + + lines = file_content.split('\n') + all_headers = {} + + # Track code block state + in_code_block = False + code_block_delimiter = None + + # First pass: collect all headers (excluding those in code blocks) + for line_num, line in enumerate(lines, 1): + original_line = line + line = line.strip() + + # Check for code block delimiters + if line.startswith('```') or line.startswith('~~~'): + if not in_code_block: + # Entering a code block + in_code_block = True + code_block_delimiter = line[:3] + continue + elif line.startswith(code_block_delimiter): + # Exiting a code block + in_code_block = False + code_block_delimiter = None + continue + + # Skip processing if we're inside a code block + if in_code_block: + continue + + # Process headers only if not in code block + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + all_headers[line_num] = { + 'level': level, + 'title': title, + 'line': line + } + + # Build complete hierarchy from HEAD (after changes) + all_hierarchy_dict = build_hierarchy_dict(file_content) + + # For deletion detection, we also need the base file hierarchy + try: + base_file_content = repository.get_contents(file.filename, ref=f"{repository.default_branch}").decoded_content.decode('utf-8') + base_hierarchy_dict = build_hierarchy_dict(base_file_content) + except Exception as e: + print(f" āš ļø Could not get base file content: {e}") + base_hierarchy_dict = all_hierarchy_dict + base_file_content = file_content # Fallback to current content + + # Find sections by operation type with corrected logic + sections_by_type = find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict) + + # Prioritize modified headers over added ones (fix for header changes like --host -> --hosts) + modified_header_lines = set() + for modified_line in operations['modified_lines']: + if modified_line['is_header']: + modified_header_lines.add(modified_line['line_number']) + + # Remove modified header lines from added set + sections_by_type['added'] = sections_by_type['added'] - modified_header_lines + + # Enhanced logic: check for actual content changes within sections + # This helps detect changes in section content (not just headers) + print(f" šŸ” Enhanced detection: checking for actual section content changes...") + + # Get only lines that have actual content changes (exclude headers) + real_content_changes = set() + + # Added lines (new content, 
excluding headers) + for added_line in operations['added_lines']: + if not added_line['is_header']: + real_content_changes.add(added_line['line_number']) + + # Deleted lines (removed content, excluding headers) + for deleted_line in operations['deleted_lines']: + if not deleted_line['is_header']: + real_content_changes.add(deleted_line['line_number']) + + # Modified lines (changed content, excluding headers) + for modified_line in operations['modified_lines']: + if not modified_line['is_header']: + real_content_changes.add(modified_line['line_number']) + + print(f" šŸ“ Real content changes (non-header): {sorted(real_content_changes)}") + + # Find sections that contain actual content changes + content_affected_sections = set() + for changed_line in real_content_changes: + # Find which section this changed line belongs to + containing_section = None + for line_num in sorted(all_headers.keys()): + if line_num <= changed_line: + containing_section = line_num + else: + break + + if containing_section and containing_section not in sections_by_type['added']: + # Additional check: make sure this is not just a line number shift + # Only add if the change is within reasonable distance from the section header + # AND if the changed line is not part of a completely deleted section header + is_deleted_header = False + for deleted_line in operations['deleted_lines']: + if (deleted_line['is_header'] and + abs(changed_line - deleted_line['line_number']) <= 2): + is_deleted_header = True + print(f" āš ļø Skipping change at line {changed_line} (deleted header near line {deleted_line['line_number']})") + break + + # More precise filtering: check if this change is actually meaningful + # Skip changes that are part of deleted content or line shifts due to deletions + should_include = True + + # Skip exact deleted headers + for deleted_line in operations['deleted_lines']: + if (deleted_line['is_header'] and + changed_line == deleted_line['line_number']): + should_include = False + print(f" āš ļø Skipping change at line {changed_line} (exact deleted header)") + break + + # Skip changes that are very close to deleted content AND far from their containing section + # This helps filter out line shift artifacts while keeping real content changes + if should_include: + for deleted_line in operations['deleted_lines']: + # Only skip if both conditions are met: + # 1. Very close to deleted content (within 5 lines) + # 2. 
The change is far from its containing section (likely a shift artifact) + distance_to_deletion = abs(changed_line - deleted_line['line_number']) + distance_to_section = changed_line - containing_section + + if (distance_to_deletion <= 5 and distance_to_section > 100): + should_include = False + print(f" āš ļø Skipping change at line {changed_line} (likely line shift: {distance_to_deletion} lines from deletion, {distance_to_section} from section)") + break + + if should_include and changed_line - containing_section <= 30: + content_affected_sections.add(containing_section) + print(f" šŸ“ Content change at line {changed_line} affects section at line {containing_section}") + elif should_include: + print(f" āš ļø Skipping distant change at line {changed_line} from section {containing_section}") + + # Add content-modified sections to the modified set, but exclude sections that are already marked as added or deleted + for line_num in content_affected_sections: + if (line_num not in sections_by_type['modified'] and + line_num not in sections_by_type['added'] and + line_num not in sections_by_type['deleted']): # āœ… Critical fix: exclude deleted sections + sections_by_type['modified'].add(line_num) + print(f" šŸ“ Added content-modified section at line {line_num}") + elif line_num in sections_by_type['deleted']: + print(f" 🚫 Skipping content-modified section at line {line_num}: already marked as deleted") + + # Prepare sections data for source_diff_dict + file_modified = {} + file_added = {} + file_deleted = {} + + # Build modified sections + for line_num in sections_by_type['modified']: + if line_num in all_hierarchy_dict: + file_modified[str(line_num)] = all_hierarchy_dict[line_num] + + # Build added sections + for line_num in sections_by_type['added']: + if line_num in all_hierarchy_dict: + file_added[str(line_num)] = all_hierarchy_dict[line_num] + + # Build deleted sections + for line_num in sections_by_type['deleted']: + if line_num in base_hierarchy_dict: + file_deleted[str(line_num)] = base_hierarchy_dict[line_num] + + # Check for frontmatter changes (content before first top-level header) + print(f" šŸ” Checking for frontmatter changes...") + frontmatter_changed = False + + # Check if any changes occur before the first top-level header + first_header_line = None + for line_num in sorted(all_headers.keys()): + header_info = all_headers[line_num] + if header_info['level'] == 1: # First top-level header + first_header_line = line_num + break + + print(f" šŸ“Š First header line: {first_header_line}") + print(f" šŸ“Š Real content changes: {sorted(real_content_changes)}") + + if first_header_line: + # Check if any real content changes are before the first header + for line_num in real_content_changes: + #print(f" šŸ” Checking line {line_num} vs first header {first_header_line}") + if line_num < first_header_line: + frontmatter_changed = True + print(f" šŸ“„ Frontmatter change detected: line {line_num} < {first_header_line}") + break + + print(f" šŸ“Š Frontmatter changed: {frontmatter_changed}") + + if frontmatter_changed: + print(f" šŸ“„ Frontmatter changes detected (before line {first_header_line})") + # Add frontmatter as a special section with line number 0 + file_modified["0"] = "frontmatter" + print(f" āœ… Added frontmatter section to modified sections") + + # Build source diff dictionary + source_diff_dict = build_source_diff_dict( + file_modified, file_added, file_deleted, + all_hierarchy_dict, base_hierarchy_dict, + operations, file_content, base_file_content + ) + + # Breakpoint: 
Output source_diff_dict to file for review with file prefix + + # Ensure temp_output directory exists + script_dir = os.path.dirname(os.path.abspath(__file__)) + temp_dir = os.path.join(script_dir, "temp_output") + os.makedirs(temp_dir, exist_ok=True) + + file_prefix = file.filename.replace('/', '-').replace('.md', '') + output_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(source_diff_dict, f, ensure_ascii=False, indent=2) + + print(f" šŸ’¾ Saved source diff dictionary to: {output_file}") + print(f" šŸ“Š Source diff dictionary contains {len(source_diff_dict)} sections:") + for key, diff_info in source_diff_dict.items(): + print(f" {diff_info['operation']}: {key} -> original_hierarchy: {diff_info['original_hierarchy']}") + + # source-diff-dict.json generation is complete, continue to next step in main.py + + # For modified headers, we need to build a mapping using original titles for matching + original_hierarchy_dict = all_hierarchy_dict.copy() + + # Update hierarchy dict to use original content for modified headers when needed for matching + for line_num in sections_by_type['modified']: + if line_num in all_headers: + header_info = all_headers[line_num] + # Check if this header was modified and has original content + for op in operations['modified_lines']: + if (op['is_header'] and + op['line_number'] == line_num and + 'original_content' in op): + # Create hierarchy path using original content for matching + original_line = op['original_content'].strip() + if original_line.startswith('#'): + # Build original hierarchy for matching + original_hierarchy = build_hierarchy_for_modified_section( + file_content, line_num, original_line, all_hierarchy_dict) + if original_hierarchy: + original_hierarchy_dict[line_num] = original_hierarchy + break + + # Process added sections + if sections_by_type['added']: + file_added = {} + # Find insertion points using the simplified logic: + # Record the previous section hierarchy for each added section + insertion_points = find_previous_section_for_added(sections_by_type['added'], all_hierarchy_dict) + + # Get actual content for added sections + for line_num in sections_by_type['added']: + if line_num in all_hierarchy_dict: + file_added[str(line_num)] = all_hierarchy_dict[line_num] + + # Get source sections content (actual content, not just hierarchy) + if file_added: + source_sections_content = get_source_sections_content(pr_url, file.filename, file_added, github_client) + file_added = source_sections_content # Replace hierarchy with actual content + + if file_added: + added_sections[file.filename] = { + 'sections': file_added, + 'insertion_points': insertion_points + } + print(f" āž• Found {len(file_added)} added sections with {len(insertion_points)} insertion points") + + # Process modified sections + if sections_by_type['modified']: + file_modified = {} + for line_num in sections_by_type['modified']: + if line_num in original_hierarchy_dict: + file_modified[str(line_num)] = original_hierarchy_dict[line_num] + + if file_modified: + modified_sections[file.filename] = { + 'sections': file_modified, + 'original_hierarchy': original_hierarchy_dict, + 'current_hierarchy': all_hierarchy_dict + } + print(f" āœļø Found {len(file_modified)} modified sections") + + # Process deleted sections + if sections_by_type['deleted']: + file_deleted = {} + for line_num in sections_by_type['deleted']: + # Use base hierarchy to get the deleted section info + if line_num in 
base_hierarchy_dict:
+                    file_deleted[str(line_num)] = base_hierarchy_dict[line_num]
+
+            if file_deleted:
+                deleted_sections[file.filename] = file_deleted
+                print(f" āŒ Found {len(file_deleted)} deleted sections")
+
+        # Enhanced logic: also check content-level changes using legacy detection
+        # This helps detect changes in section content (not just headers)
+        print(f" šŸ” Enhanced detection: checking content-level changes...")
+        changed_lines = get_changed_line_ranges(file)
+        affected_sections = find_affected_sections(lines, changed_lines, all_headers)
+
+        legacy_modified = {}
+        for line_num in affected_sections:
+            if line_num in all_hierarchy_dict:
+                section_hierarchy = all_hierarchy_dict[line_num]
+                # Only add if not already detected by operation-type analysis
+                already_detected = False
+                if file.filename in modified_sections:
+                    for existing_line, existing_hierarchy in modified_sections[file.filename].get('sections', {}).items():
+                        if existing_hierarchy == section_hierarchy:
+                            already_detected = True
+                            break
+
+                if not already_detected:
+                    legacy_modified[str(line_num)] = section_hierarchy
+
+        if legacy_modified:
+            print(f" āœ… Enhanced detection found {len(legacy_modified)} additional content-modified sections")
+            # Merge with existing modified sections
+            if file.filename in modified_sections:
+                # Merge the sections
+                existing_sections = modified_sections[file.filename].get('sections', {})
+                existing_sections.update(legacy_modified)
+                modified_sections[file.filename]['sections'] = existing_sections
+            else:
+                # Create new entry
+                modified_sections[file.filename] = {
+                    'sections': legacy_modified,
+                    'original_hierarchy': all_hierarchy_dict,
+                    'current_hierarchy': all_hierarchy_dict
+                }
+
+    print(f"\nšŸ“Š Summary:")
+    #print(f" āœļø Modified files: {} files")
+    print(f" šŸ“„ Added files: {len(added_files)} files")
+    print(f" šŸ—‘ļø Deleted files: {len(deleted_files)} files")
+    print(f" šŸ“‹ TOC files: {len(toc_files)} files")
+    if ignored_files:
+        print(f" ā­ļø Ignored files: {len(ignored_files)} files")
+        for ignored_file in ignored_files:
+            print(f" - {ignored_file}")
+
+    return added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files
diff --git a/scripts/translate_doc_pr/requirements.txt b/scripts/translate_doc_pr/requirements.txt
new file mode 100644
index 0000000000000..d8336cf8cebe7
--- /dev/null
+++ b/scripts/translate_doc_pr/requirements.txt
@@ -0,0 +1,4 @@
+PyGithub>=1.55.0
+openai>=1.0.0
+tiktoken>=0.4.0
+google-generativeai>=0.3.0
diff --git a/scripts/translate_doc_pr/section_matcher.py b/scripts/translate_doc_pr/section_matcher.py
new file mode 100644
index 0000000000000..ce4ef61116c89
--- /dev/null
+++ b/scripts/translate_doc_pr/section_matcher.py
@@ -0,0 +1,973 @@
+"""
+Section Matcher Module
+Handles section hierarchy matching including direct matching and AI matching
+"""
+
+import os
+import re
+import json
+import threading
+from github import Github
+from openai import OpenAI
+
+# Thread-safe printing
+print_lock = threading.Lock()
+
+def thread_safe_print(*args, **kwargs):
+    with print_lock:
+        print(*args, **kwargs)
+
+def clean_title_for_matching(title):
+    """Clean title for matching by removing markdown formatting and span elements"""
+    if not title:
+        return ""
+
+    # Remove span elements like <span class="version-mark">New in v5.0</span>
+    title = re.sub(r'<span[^>]*>.*?</span>', '', title)
+
+    # Remove markdown header prefix (# ## ### etc.)
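+    # For example, a heading like "### `tidb_mem_quota_query` <span class="version-mark">New in v5.0</span>"
+    # is reduced to "tidb_mem_quota_query" by the span, prefix, and backtick cleanup in this function
+    # (the heading text here is only an illustrative example).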
+ title = re.sub(r'^#{1,6}\s*', '', title.strip()) + + # Remove backticks + title = title.replace('`', '') + + # Strip whitespace + title = title.strip() + + return title + +def is_system_variable_or_config(title): + """Check if a title represents a system variable or configuration item""" + cleaned_title = clean_title_for_matching(title) + + if not cleaned_title: + return False + + # Check if original title had backticks (indicating code/config item) + original_has_backticks = '`' in title + + # System variables and config items are typically: + # 1. Alphanumeric characters with underscores, hyphens, dots, or percent signs + # 2. No spaces in the middle + # 3. Often contain underscores, hyphens, dots, or percent signs + # 4. May contain uppercase letters (like alert rule names) + # 5. Single words wrapped in backticks (like `capacity`, `engine`) + + # Check if it contains only allowed characters (including % for metrics/alerts) + allowed_chars = re.match(r'^[a-zA-Z0-9_\-\.%]+$', cleaned_title) + + # Check if it contains at least one separator (common in system vars/config/alerts) + has_separator = ('_' in cleaned_title or '-' in cleaned_title or + '.' in cleaned_title or '%' in cleaned_title) + + # Check if it doesn't contain spaces (spaces would indicate it's likely a regular title) + no_spaces = ' ' not in cleaned_title + + # Additional patterns for alert rules and metrics + is_alert_rule = (cleaned_title.startswith('PD_') or + cleaned_title.startswith('TiDB_') or + cleaned_title.startswith('TiKV_') or + cleaned_title.endswith('_alert') or + '%' in cleaned_title) + + # NEW: Check if it's a single word in backticks (config/variable name) + # Examples: `capacity`, `engine`, `enable`, `dirname` etc. + is_single_backticked_word = (original_has_backticks and + allowed_chars and + no_spaces and + len(cleaned_title.split()) == 1) + + return bool(allowed_chars and (has_separator or is_alert_rule or is_single_backticked_word) and no_spaces) + +def find_toplevel_title_matches(source_sections, target_lines): + """Find matches for top-level titles (# Level) by direct pattern matching""" + matched_dict = {} + failed_matches = [] + skipped_sections = [] + + thread_safe_print(f"šŸ” Searching for top-level title matches") + + for source_line_num, source_hierarchy in source_sections.items(): + # Extract the leaf title from hierarchy + source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy + + # Only process top-level titles + if not source_leaf_title.startswith('# '): + skipped_sections.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Not a top-level title' + }) + continue + + thread_safe_print(f" šŸ“ Looking for top-level match: {source_leaf_title}") + + # Find the first top-level title in target document + target_match = None + for line_num, line in enumerate(target_lines, 1): + line = line.strip() + if line.startswith('# '): + target_match = { + 'line_num': line_num, + 'title': line, + 'hierarchy_string': line[2:].strip() # Remove '# ' prefix for hierarchy + } + thread_safe_print(f" āœ“ Found target top-level at line {line_num}: {line}") + break + + if target_match: + matched_dict[str(target_match['line_num'])] = target_match['hierarchy_string'] + thread_safe_print(f" āœ… Top-level match: line {target_match['line_num']}") + else: + thread_safe_print(f" āŒ No top-level title found in target") + failed_matches.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'No top-level title 
found in target' + }) + + thread_safe_print(f"šŸ“Š Top-level matching result: {len(matched_dict)} matches found") + if failed_matches: + thread_safe_print(f"āš ļø {len(failed_matches)} top-level sections failed to match:") + for failed in failed_matches: + thread_safe_print(f" āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}") + + return matched_dict, failed_matches, skipped_sections + + +def find_direct_matches_for_special_files(source_sections, target_hierarchy, target_lines): + """Find direct matches for system variables/config items without using AI""" + matched_dict = {} + failed_matches = [] + skipped_sections = [] + + # Build target headers with hierarchy paths + target_headers = {} + for line_num, line in enumerate(target_lines, 1): + line = line.strip() + if line.startswith('#'): + match = re.match(r'^(#{1,10})\s+(.+)', line) + if match: + level = len(match.group(1)) + title = match.group(2).strip() + target_headers[line_num] = { + 'level': level, + 'title': title, + 'line': line + } + + thread_safe_print(f" šŸ” Searching for direct matches among {len(target_headers)} target headers") + + for source_line_num, source_hierarchy in source_sections.items(): + # Extract the leaf title from hierarchy + source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy + source_clean_title = clean_title_for_matching(source_leaf_title) + + thread_safe_print(f" šŸ“ Looking for match: {source_clean_title}") + + if not is_system_variable_or_config(source_leaf_title): + thread_safe_print(f" āš ļø Not a system variable/config, skipping direct match") + skipped_sections.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Not a system variable or config item' + }) + continue + + # Find potential matches in target + potential_matches = [] + for target_line_num, target_header in target_headers.items(): + target_clean_title = clean_title_for_matching(target_header['title']) + + if source_clean_title == target_clean_title: + # Build hierarchy path for this target header + hierarchy_path = build_hierarchy_path(target_lines, target_line_num, target_headers) + potential_matches.append({ + 'line_num': target_line_num, + 'header': target_header, + 'hierarchy_path': hierarchy_path, + 'hierarchy_string': ' > '.join([f"{'#' * h['level']} {h['title']}" for h in hierarchy_path if h['level'] > 1 or len(hierarchy_path) == 1]) + }) + thread_safe_print(f" āœ“ Found potential match at line {target_line_num}: {target_header['title']}") + + if len(potential_matches) == 1: + # Single match found + match = potential_matches[0] + matched_dict[str(match['line_num'])] = match['hierarchy_string'] + thread_safe_print(f" āœ… Direct match: line {match['line_num']}") + elif len(potential_matches) > 1: + # Multiple matches, need to use parent hierarchy to disambiguate + thread_safe_print(f" šŸ”€ Multiple matches found ({len(potential_matches)}), using parent hierarchy") + + # Extract parent hierarchy from source + source_parts = source_hierarchy.split(' > ') + if len(source_parts) > 1: + source_parent_titles = [clean_title_for_matching(part) for part in source_parts[:-1]] + + best_match = None + best_score = -1 + + for match in potential_matches: + # Compare parent hierarchy + target_parent_titles = [clean_title_for_matching(h['title']) for h in match['hierarchy_path'][:-1]] + + # Calculate similarity score + score = 0 + min_len = min(len(source_parent_titles), len(target_parent_titles)) + + for i in range(min_len): + if i < 
len(source_parent_titles) and i < len(target_parent_titles): + if source_parent_titles[-(i+1)] == target_parent_titles[-(i+1)]: # Compare from end + score += 1 + else: + break + + thread_safe_print(f" šŸ“Š Match at line {match['line_num']} score: {score}") + + if score > best_score: + best_score = score + best_match = match + + if best_match and best_score > 0: + matched_dict[str(best_match['line_num'])] = best_match['hierarchy_string'] + thread_safe_print(f" āœ… Best match: line {best_match['line_num']} (score: {best_score})") + else: + thread_safe_print(f" āŒ No good parent hierarchy match found") + failed_matches.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Multiple matches found but no good parent hierarchy match' + }) + else: + thread_safe_print(f" āš ļø No parent hierarchy in source, cannot disambiguate") + failed_matches.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'Multiple matches found but no parent hierarchy to disambiguate' + }) + else: + thread_safe_print(f" āŒ No matches found for: {source_clean_title}") + # Try fuzzy matching for similar titles (e.g., --host vs --hosts) + fuzzy_matched = False + source_clean_lower = source_clean_title.lower() + for target_header in target_headers: + # Handle both dict and tuple formats + if isinstance(target_header, dict): + target_clean = clean_title_for_matching(target_header['title']) + elif isinstance(target_header, (list, tuple)) and len(target_header) >= 2: + target_clean = clean_title_for_matching(target_header[1]) # title is at index 1 + else: + continue # Skip invalid entries + target_clean_lower = target_clean.lower() + # Check for similar titles (handle plural/singular and minor differences) + # Case 1: One is substring of another (e.g., --host vs --hosts) + # Case 2: Small character difference (1-2 characters) + len_diff = abs(len(source_clean_lower) - len(target_clean_lower)) + if (len_diff <= 2 and + (source_clean_lower in target_clean_lower or + target_clean_lower in source_clean_lower)): + thread_safe_print(f" ā‰ˆ Fuzzy match found: {source_clean_title} ā‰ˆ {target_clean}") + if isinstance(target_header, dict): + matched_dict[str(target_header['line_num'])] = target_header['hierarchy_string'] + thread_safe_print(f" āœ… Fuzzy match: line {target_header['line_num']}") + elif isinstance(target_header, (list, tuple)) and len(target_header) >= 3: + matched_dict[str(target_header[0])] = target_header[2] # line_num at index 0, hierarchy at index 2 + thread_safe_print(f" āœ… Fuzzy match: line {target_header[0]}") + fuzzy_matched = True + break + + if not fuzzy_matched: + failed_matches.append({ + 'line_num': source_line_num, + 'hierarchy': source_hierarchy, + 'reason': 'No matching section found in target' + }) + + thread_safe_print(f" šŸ“Š Direct matching result: {len(matched_dict)} matches found") + + if failed_matches: + thread_safe_print(f" āš ļø {len(failed_matches)} sections failed to match:") + for failed in failed_matches: + thread_safe_print(f" āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}") + + if skipped_sections: + thread_safe_print(f" ā„¹ļø {len(skipped_sections)} sections skipped (not system variables/config):") + for skipped in skipped_sections: + thread_safe_print(f" ā­ļø Line {skipped['line_num']}: {skipped['hierarchy']} - {skipped['reason']}") + + return matched_dict, failed_matches, skipped_sections + +def filter_non_system_sections(target_hierarchy): + """Filter out system variable/config sections 
from target hierarchy for AI mapping""" + filtered_hierarchy = {} + system_sections_count = 0 + + for line_num, hierarchy in target_hierarchy.items(): + # Extract the leaf title from hierarchy + leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy + + if is_system_variable_or_config(leaf_title): + system_sections_count += 1 + else: + filtered_hierarchy[line_num] = hierarchy + + thread_safe_print(f" šŸ”§ Filtered target hierarchy: {len(filtered_hierarchy)} non-system sections (removed {system_sections_count} system sections)") + + return filtered_hierarchy + +def get_corresponding_sections(source_sections, target_sections, ai_client, source_language, target_language, max_tokens=20000): + """Use AI to find corresponding sections between different languages""" + + # Format source sections + source_text = "\n".join(source_sections) + target_text = "\n".join(target_sections) + number_of_sections = len(source_sections) + + prompt = f"""I am aligning the {source_language} and {target_language} documentation for TiDB. I have modified the following {number_of_sections} sections in the {source_language} file: + +{source_text} + +Here is the section structure of the corresponding {target_language} file. Please select the corresponding {number_of_sections} sections in {target_language} from the following list that I should modify. Do not output any other text, return the Markdown code block enclosed in three backticks. + +{target_text}""" + + thread_safe_print(f"\n šŸ“¤ AI Mapping Prompt ({source_language} → {target_language}):") + thread_safe_print(f" " + "="*80) + thread_safe_print(f" {prompt}") + thread_safe_print(f" " + "="*80) + + # Import token estimation function from main + try: + from main import print_token_estimation + print_token_estimation(prompt, f"Section mapping ({source_language} → {target_language})") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + thread_safe_print(f" šŸ’° Section mapping ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + thread_safe_print(f" šŸ’° Section mapping ({source_language} → {target_language})") + thread_safe_print(f" šŸ“ Input: {char_count:,} characters") + thread_safe_print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + ai_response = ai_client.chat_completion( + messages=[ + {"role": "user", "content": prompt} + ], + temperature=0.1, + max_tokens=max_tokens + ) + + thread_safe_print(f"\n šŸ“„ AI Mapping Response:") + thread_safe_print(f" " + "-"*80) + thread_safe_print(f" {ai_response}") + thread_safe_print(f" " + "-"*80) + + return ai_response + except Exception as e: + print(f" āŒ AI mapping error: {e}") + return None + +def parse_ai_response(ai_response): + """Parse AI response to extract section names""" + sections = [] + lines = ai_response.split('\n') + + for line in lines: + line = line.strip() + # Skip markdown code block markers and empty lines + if line and not line.startswith('```'): + # Remove leading "## " if present and clean up + if line.startswith('## '): + sections.append(line) + elif line.startswith('- '): + # 
Handle cases where AI returns a list + sections.append(line[2:].strip()) + + return sections + +def find_matching_line_numbers(ai_sections, target_hierarchy_dict): + """Find line numbers in target hierarchy dict that match AI sections""" + matched_dict = {} + + for ai_section in ai_sections: + # Look for exact matches first + found = False + for line_num, hierarchy in target_hierarchy_dict.items(): + if hierarchy == ai_section: + matched_dict[str(line_num)] = hierarchy + found = True + break + + if not found: + # Look for partial matches (in case of slight differences) + for line_num, hierarchy in target_hierarchy_dict.items(): + # Remove common variations and compare + ai_clean = ai_section.replace('### ', '').replace('## ', '').strip() + hierarchy_clean = hierarchy.replace('### ', '').replace('## ', '').strip() + + if ai_clean in hierarchy_clean or hierarchy_clean in ai_clean: + matched_dict[str(line_num)] = hierarchy + thread_safe_print(f" ā‰ˆ Partial match found at line {line_num}: {hierarchy}") + found = True + break + + if not found: + thread_safe_print(f" āœ— No match found for: {ai_section}") + + return matched_dict + +def build_hierarchy_path(lines, line_num, all_headers): + """Build the full hierarchy path for a header at given line (from auto-sync-pr-changes.py)""" + if line_num not in all_headers: + return [] + + current_header = all_headers[line_num] + current_level = current_header['level'] + hierarchy_path = [] + + # Find all parent headers + for check_line in sorted(all_headers.keys()): + if check_line >= line_num: + break + + header = all_headers[check_line] + if header['level'] < current_level: + # This is a potential parent + # Remove any headers at same or deeper level + while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']: + hierarchy_path.pop() + hierarchy_path.append(header) + + # Add current header + hierarchy_path.append(current_header) + + return hierarchy_path + +def map_insertion_points_to_target(insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): + """Map source insertion points to target language locations""" + target_insertion_points = {} + + thread_safe_print(f" šŸ“ Mapping {len(insertion_points)} insertion points to target...") + + for group_key, point_info in insertion_points.items(): + previous_section_hierarchy = point_info['previous_section_hierarchy'] + thread_safe_print(f" šŸ” Finding target location for: {previous_section_hierarchy}") + + # Extract title for system variable checking + if ' > ' in previous_section_hierarchy: + title = previous_section_hierarchy.split(' > ')[-1] + else: + title = previous_section_hierarchy + + # Check if this is a system variable/config that can be directly matched + cleaned_title = clean_title_for_matching(title) + if is_system_variable_or_config(cleaned_title): + thread_safe_print(f" šŸŽÆ Direct matching for system var/config: {cleaned_title}") + + # Direct matching for system variables + temp_source = {point_info['previous_section_line']: previous_section_hierarchy} + matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( + temp_source, target_hierarchy, target_lines + ) + + if matched_dict: + # Get the first (and should be only) matched target line + target_line = list(matched_dict.keys())[0] + + # Find the end of this section + target_line_num = int(target_line) + insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines) + + 
target_insertion_points[group_key] = { + 'insertion_after_line': insertion_after_line, + 'target_hierarchy': target_hierarchy.get(str(target_line_num), ''), + 'insertion_type': point_info['insertion_type'], + 'new_sections': point_info['new_sections'] + } + thread_safe_print(f" āœ… Direct match found, insertion after line {insertion_after_line}") + continue + + # If not a system variable or direct matching failed, use AI + thread_safe_print(f" šŸ¤– Using AI mapping for: {cleaned_title}") + + # Filter target hierarchy for AI (remove system sections) + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + # Check if filtered hierarchy is too large for AI + # Use provided max_non_system_sections parameter + if len(filtered_target_hierarchy) > max_non_system_sections: + thread_safe_print(f" āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}") + continue + + # Prepare source for AI mapping + temp_source = {str(point_info['previous_section_line']): previous_section_hierarchy} + + # Get AI mapping + ai_response = get_corresponding_sections( + list(temp_source.values()), + list(filtered_target_hierarchy.values()), + ai_client, + repo_config['source_language'], + repo_config['target_language'], + max_tokens=20000 # Use default value since this function doesn't accept max_tokens yet + ) + + if ai_response: + # Parse AI response and find matching line numbers + ai_sections = parse_ai_response(ai_response) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) + + if ai_matched and len(ai_matched) > 0: + # Get the first match (we only have one source section) + target_line = list(ai_matched.keys())[0] + target_line_num = int(target_line) + + # Find the end of this section + insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines) + + target_insertion_points[group_key] = { + 'insertion_after_line': insertion_after_line, + 'target_hierarchy': target_hierarchy.get(target_line, ''), + 'insertion_type': point_info['insertion_type'], + 'new_sections': point_info['new_sections'] + } + thread_safe_print(f" āœ… AI match found, insertion after line {insertion_after_line}") + else: + thread_safe_print(f" āŒ No AI matching sections found for: {previous_section_hierarchy}") + else: + thread_safe_print(f" āŒ No AI response received for: {previous_section_hierarchy}") + + return target_insertion_points + +def extract_hierarchies_from_diff_dict(source_diff_dict): + """Extract original_hierarchy from source_diff_dict for section matching""" + extracted_hierarchies = {} + + for key, diff_info in source_diff_dict.items(): + operation = diff_info.get('operation', '') + original_hierarchy = diff_info.get('original_hierarchy', '') + + # Process all sections: modified, deleted, and added + if operation in ['modified', 'deleted', 'added'] and original_hierarchy: + # Use the key as the identifier for the hierarchy + extracted_hierarchies[key] = original_hierarchy + + thread_safe_print(f"šŸ“„ Extracted {len(extracted_hierarchies)} hierarchies from source diff dict:") + for key, hierarchy in extracted_hierarchies.items(): + thread_safe_print(f" {key}: {hierarchy}") + + return extracted_hierarchies + +def match_source_diff_to_target(source_diff_dict, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections=120, max_tokens=20000): + """ + Match source_diff_dict original_hierarchy to target file sections + Uses direct matching for system variables/config and AI matching for others 
+ + Returns: + dict: Matched sections with enhanced information including: + - target_line: Line number in target file + - target_hierarchy: Target section hierarchy + - insertion_type: For added sections only + - source_original_hierarchy: Original hierarchy from source + - source_operation: Operation type (modified/added/deleted) + - source_old_content: Old content from source diff + - source_new_content: New content from source diff + """ + thread_safe_print(f"šŸ”— Starting source diff to target matching...") + + # Extract hierarchies from source diff dict + source_hierarchies = extract_hierarchies_from_diff_dict(source_diff_dict) + + if not source_hierarchies: + thread_safe_print(f"āš ļø No hierarchies to match") + return {} + + # Process sections in original order to maintain consistency + # Initialize final matching results with ordered dict to preserve order + from collections import OrderedDict + all_matched_sections = OrderedDict() + + # Categorize sections for processing strategy but maintain order + direct_match_sections = OrderedDict() + ai_match_sections = OrderedDict() + added_sections = OrderedDict() + bottom_sections = OrderedDict() # New category for bottom sections + + for key, hierarchy in source_hierarchies.items(): + # Check if this is a bottom section (no matching needed) + if hierarchy.startswith('bottom-'): + bottom_sections[key] = hierarchy + # Check if this is an added section + elif key.startswith('added_'): + added_sections[key] = hierarchy + else: + # Extract the leaf title from hierarchy for checking + leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy + + # Check if this is suitable for direct matching + if (hierarchy == "frontmatter" or + leaf_title.startswith('# ') or # Top-level titles + is_system_variable_or_config(leaf_title)): # System variables/config + direct_match_sections[key] = hierarchy + else: + ai_match_sections[key] = hierarchy + + thread_safe_print(f"šŸ“Š Section categorization:") + thread_safe_print(f" šŸŽÆ Direct matching: {len(direct_match_sections)} sections") + thread_safe_print(f" šŸ¤– AI matching: {len(ai_match_sections)} sections") + thread_safe_print(f" āž• Added sections: {len(added_sections)} sections") + thread_safe_print(f" šŸ”š Bottom sections: {len(bottom_sections)} sections (no matching needed)") + + # Process each section in original order + thread_safe_print(f"\nšŸ”„ Processing sections in original order...") + + for key, hierarchy in source_hierarchies.items(): + thread_safe_print(f" šŸ” Processing {key}: {hierarchy}") + + # Determine processing strategy based on section type and content + if hierarchy.startswith('bottom-'): + # Bottom section - no matching needed, append to end + thread_safe_print(f" šŸ”š Bottom section - append to end of document") + result = { + "target_line": "-1", # Special marker for bottom sections + "target_hierarchy": hierarchy # Keep the bottom-xxx marker + } + elif key.startswith('added_'): + # Added section - find insertion point + thread_safe_print(f" āž• Added section - finding insertion point") + result = process_added_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens) + else: + # Modified or deleted section - find matching section + operation = source_diff_dict[key].get('operation', 'unknown') + thread_safe_print(f" {operation.capitalize()} section - finding target match") + result = process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, 
max_non_system_sections, max_tokens) + + if result: + # Add source language information from source_diff_dict + source_info = source_diff_dict.get(key, {}) + + # Extract target content from target_lines + target_line = result.get('target_line', 'unknown') + target_content = "" + if target_line != 'unknown' and target_line != '0': + try: + target_line_num = int(target_line) + # For ALL operations, only extract direct content (no sub-sections) + # This avoids duplication when both parent and child sections have operations + target_content = extract_section_direct_content(target_line_num, target_lines) + except (ValueError, IndexError): + target_content = "" + elif target_line == '0': + # For frontmatter, extract content from beginning to first header + target_content = extract_frontmatter_content(target_lines) + + enhanced_result = { + **result, # Include existing target matching info + 'target_content': target_content, # Add target section content + 'source_original_hierarchy': source_info.get('original_hierarchy', ''), + 'source_operation': source_info.get('operation', ''), + 'source_old_content': source_info.get('old_content', ''), + 'source_new_content': source_info.get('new_content', '') + } + all_matched_sections[key] = enhanced_result + thread_safe_print(f" āœ… {key}: -> line {target_line}") + else: + thread_safe_print(f" āŒ {key}: matching failed") + + thread_safe_print(f"\nšŸ“Š Final matching results: {len(all_matched_sections)} total matches") + return all_matched_sections + +def process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000): + """Process modified or deleted sections to find target matches""" + # Extract the leaf title from hierarchy for checking + leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy + + # Check if this is suitable for direct matching + if (hierarchy == "frontmatter" or + leaf_title.startswith('# ') or # Top-level titles + is_system_variable_or_config(leaf_title)): # System variables/config + + if hierarchy == "frontmatter": + return {"target_line": "0", "target_hierarchy": "frontmatter"} + + elif leaf_title.startswith('# '): + # Top-level title matching + temp_sections = {key: hierarchy} + matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches( + temp_sections, target_lines + ) + if matched_dict: + target_line = list(matched_dict.keys())[0] + # For top-level titles, add # prefix to the hierarchy + return { + "target_line": target_line, + "target_hierarchy": f"# {matched_dict[target_line]}" + } + + else: + # System variable/config matching + temp_sections = {key: hierarchy} + matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( + temp_sections, target_hierarchy, target_lines + ) + if matched_dict: + target_line = list(matched_dict.keys())[0] + target_hierarchy_str = list(matched_dict.values())[0] + + # Extract the leaf title and add # prefix, remove top-level title from hierarchy + if ' > ' in target_hierarchy_str: + # Remove top-level title and keep only the leaf with ## prefix + leaf_title = target_hierarchy_str.split(' > ')[-1] + formatted_hierarchy = f"## {leaf_title}" + else: + # Single level, add ## prefix + formatted_hierarchy = f"## {target_hierarchy_str}" + + return { + "target_line": target_line, + "target_hierarchy": formatted_hierarchy + } + else: + # AI matching for non-system sections + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + if 
len(filtered_target_hierarchy) <= max_non_system_sections: + temp_sections = {key: hierarchy} + + ai_response = get_corresponding_sections( + list(temp_sections.values()), + list(filtered_target_hierarchy.values()), + ai_client, + repo_config['source_language'], + repo_config['target_language'], + max_tokens + ) + + if ai_response: + ai_sections = parse_ai_response(ai_response) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) + + if ai_matched: + target_line = list(ai_matched.keys())[0] + target_hierarchy_str = list(ai_matched.values())[0] + + # Format AI matched hierarchy with # prefix and remove top-level title + formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) + + return { + "target_line": target_line, + "target_hierarchy": formatted_hierarchy + } + + return None + +def format_target_hierarchy(target_hierarchy_str): + """Format target hierarchy to preserve complete hierarchy structure""" + if target_hierarchy_str.startswith('##') or target_hierarchy_str.startswith('#'): + # Already formatted, return as is + return target_hierarchy_str + elif ' > ' in target_hierarchy_str: + # Keep complete hierarchy structure, just ensure proper formatting + return target_hierarchy_str + else: + # Single level, add ## prefix for compatibility + return f"## {target_hierarchy_str}" + +def process_added_section(key, reference_hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000): + """Process added sections to find insertion points""" + # For added sections, hierarchy points to the next section (where to insert before) + reference_leaf = reference_hierarchy.split(' > ')[-1] if ' > ' in reference_hierarchy else reference_hierarchy + + if (reference_hierarchy == "frontmatter" or + reference_leaf.startswith('# ') or + is_system_variable_or_config(reference_leaf)): + + # Use direct matching for the reference section + temp_reference = {f"ref_{key}": reference_hierarchy} + + if reference_hierarchy == "frontmatter": + return { + "target_line": "0", + "target_hierarchy": "frontmatter", + "insertion_type": "before_reference" + } + + elif reference_leaf.startswith('# '): + matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches( + temp_reference, target_lines + ) + if matched_dict: + target_line = list(matched_dict.keys())[0] + formatted_hierarchy = f"# {matched_dict[target_line]}" + return { + "target_line": target_line, + "target_hierarchy": formatted_hierarchy, + "insertion_type": "before_reference" + } + + else: + # System variable/config + matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( + temp_reference, target_hierarchy, target_lines + ) + if matched_dict: + target_line = list(matched_dict.keys())[0] + target_hierarchy_str = list(matched_dict.values())[0] + formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) + return { + "target_line": target_line, + "target_hierarchy": formatted_hierarchy, + "insertion_type": "before_reference" + } + else: + # Use AI matching for the reference section + filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) + + if len(filtered_target_hierarchy) <= max_non_system_sections: + temp_reference = {f"ref_{key}": reference_hierarchy} + + ai_response = get_corresponding_sections( + list(temp_reference.values()), + list(filtered_target_hierarchy.values()), + ai_client, + repo_config['source_language'], + repo_config['target_language'], + max_tokens + ) + + if ai_response: + ai_sections = 
parse_ai_response(ai_response) + ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) + + if ai_matched: + target_line = list(ai_matched.keys())[0] + target_hierarchy_str = list(ai_matched.values())[0] + formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) + return { + "target_line": target_line, + "target_hierarchy": formatted_hierarchy, + "insertion_type": "before_reference" + } + + return None + +def extract_target_section_content(target_line_num, target_lines): + """Extract target section content from target_lines (includes sub-sections)""" + if target_line_num >= len(target_lines): + return "" + + start_line = target_line_num - 1 # Convert to 0-based index + + # Find the end of the section by looking for the next header + current_line = target_lines[start_line].strip() + if not current_line.startswith('#'): + return current_line + + current_level = len(current_line.split()[0]) # Count # characters + end_line = len(target_lines) # Default to end of file + + # For top-level headers (# level 1), stop at first sublevel (## level 2) + # For other headers, stop at same or higher level + if current_level == 1: + # Top-level header: stop at first ## (level 2) or higher + for i in range(start_line + 1, len(target_lines)): + line = target_lines[i].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) + if line_level >= 2: # Stop at ## or higher level + end_line = i + break + else: + # Sub-level header: stop at same or higher level (traditional behavior) + for i in range(start_line + 1, len(target_lines)): + line = target_lines[i].strip() + if line.startswith('#'): + line_level = len(line.split()[0]) + if line_level <= current_level: + end_line = i + break + + # Extract content from start_line to end_line + section_content = '\n'.join(target_lines[start_line:end_line]) + return section_content.strip() + +def extract_section_direct_content(target_line_num, target_lines): + """Extract ONLY the direct content of a section (excluding sub-sections)""" + if target_line_num >= len(target_lines): + return "" + + start_line = target_line_num - 1 # Convert to 0-based index + + # Find the end of the section by looking for the next header + current_line = target_lines[start_line].strip() + if not current_line.startswith('#'): + return current_line + + current_level = len(current_line.split()[0]) # Count # characters + end_line = len(target_lines) # Default to end of file + + # Only extract until the first header (any level) + # This means we stop at ANY header - whether it's a sub-section OR same/higher level + for i in range(start_line + 1, len(target_lines)): + line = target_lines[i].strip() + if line.startswith('#'): + # Stop at ANY header to get only direct content + end_line = i + break + + # Extract content from start_line to end_line + section_content = '\n'.join(target_lines[start_line:end_line]) + return section_content.strip() + +def extract_frontmatter_content(target_lines): + """Extract frontmatter content from beginning to first header""" + if not target_lines: + return "" + + frontmatter_lines = [] + for i, line in enumerate(target_lines): + line_stripped = line.strip() + # Stop when we hit the first top-level header + if line_stripped.startswith('# '): + break + frontmatter_lines.append(line.rstrip()) + + return '\n'.join(frontmatter_lines) + +def find_section_end_line(section_start_line, target_hierarchy, target_lines): + """Find the end line of a section to determine insertion point (from auto-sync-pr-changes.py)""" + + # Get the current 
section's level + current_section_line = target_lines[section_start_line - 1].strip() + current_level = len(current_section_line.split()[0]) if current_section_line.startswith('#') else 5 + + # Find the next section at the same level or higher (lower number) + next_section_line = None + for line_num_str in sorted(target_hierarchy.keys(), key=int): + line_num = int(line_num_str) + if line_num > section_start_line: + # Check the level of this section + section_line = target_lines[line_num - 1].strip() + if section_line.startswith('#'): + section_level = len(section_line.split()[0]) + if section_level <= current_level: + next_section_line = line_num + break + + if next_section_line: + # Insert before the next same-level or higher-level section + return next_section_line - 1 + else: + # This is the last section at this level, insert at the end of the file + return len(target_lines) diff --git a/scripts/translate_doc_pr/toc_processor.py b/scripts/translate_doc_pr/toc_processor.py new file mode 100644 index 0000000000000..71cce4a17f8bb --- /dev/null +++ b/scripts/translate_doc_pr/toc_processor.py @@ -0,0 +1,434 @@ +""" +TOC Processor Module +Handles special processing logic for TOC.md files +""" + +import os +import re +import json +import threading +from github import Github +from openai import OpenAI + +# Thread-safe printing +print_lock = threading.Lock() + +def thread_safe_print(*args, **kwargs): + with print_lock: + print(*args, **kwargs) + +def extract_toc_link_from_line(line): + """Extract the link part (including parentheses) from a TOC line""" + # Pattern to match [text](link) format + pattern = r'\[([^\]]+)\]\(([^)]+)\)' + match = re.search(pattern, line) + if match: + return f"({match.group(2)})" # Return (link) including parentheses + return None + +def is_toc_translation_needed(line): + """Check if a TOC line needs translation based on content in square brackets""" + # Extract content within square brackets [content] + pattern = r'\[([^\]]+)\]' + match = re.search(pattern, line) + if match: + content = match.group(1) + # Skip translation if content has no Chinese and no spaces + has_chinese = bool(re.search(r'[\u4e00-\u9fff]', content)) + has_spaces = ' ' in content + + # Need translation if has Chinese OR has spaces + # Skip translation only if it's alphanumeric/technical term without spaces + return has_chinese or has_spaces + return True # Default to translate if can't parse + +def find_best_toc_match(target_link, target_lines, source_line_num): + """Find the best matching line in target TOC based on link content and line proximity""" + matches = [] + + for i, line in enumerate(target_lines): + line_link = extract_toc_link_from_line(line.strip()) + if line_link and line_link == target_link: + matches.append({ + 'line_num': i + 1, # Convert to 1-based + 'line': line.strip(), + 'distance': abs((i + 1) - source_line_num) + }) + + if not matches: + return None + + # Sort by distance to source line number, choose the closest one + matches.sort(key=lambda x: x['distance']) + return matches[0] + +def group_consecutive_lines(lines): + """Group consecutive lines together""" + if not lines: + return [] + + # Sort lines by line number + sorted_lines = sorted(lines, key=lambda x: x['line_number']) + + groups = [] + current_group = [sorted_lines[0]] + + for i in range(1, len(sorted_lines)): + current_line = sorted_lines[i] + prev_line = sorted_lines[i-1] + + # Consider lines consecutive if they are within 2 lines of each other + if current_line['line_number'] - prev_line['line_number'] <= 2: 
+ current_group.append(current_line) + else: + groups.append(current_group) + current_group = [current_line] + + groups.append(current_group) + return groups + +def process_toc_operations(file_path, operations, source_lines, target_lines, target_local_path): + """Process TOC.md file operations with special logic""" + thread_safe_print(f"\nšŸ“‹ Processing TOC.md with special logic...") + + results = { + 'added': [], + 'modified': [], + 'deleted': [] + } + + # Process deleted lines first + for deleted_line in operations['deleted_lines']: + if not deleted_line['is_header']: # TOC lines are not headers + deleted_content = deleted_line['content'] + deleted_link = extract_toc_link_from_line(deleted_content) + + if deleted_link: + thread_safe_print(f" šŸ—‘ļø Processing deleted TOC line with link: {deleted_link}") + + # Find matching line in target + match = find_best_toc_match(deleted_link, target_lines, deleted_line['line_number']) + if match: + thread_safe_print(f" āœ… Found target line {match['line_num']}: {match['line']}") + results['deleted'].append({ + 'source_line': deleted_line['line_number'], + 'target_line': match['line_num'], + 'content': deleted_content + }) + else: + thread_safe_print(f" āŒ No matching line found for {deleted_link}") + + # Process added lines + added_groups = group_consecutive_lines(operations['added_lines']) + for group in added_groups: + if group: # Skip empty groups + first_added_line = group[0] + thread_safe_print(f" āž• Processing added TOC group starting at line {first_added_line['line_number']}") + + # Find the previous line in source to determine insertion point + previous_line_num = first_added_line['line_number'] - 1 + if previous_line_num > 0 and previous_line_num <= len(source_lines): + previous_line_content = source_lines[previous_line_num - 1] + previous_link = extract_toc_link_from_line(previous_line_content) + + if previous_link: + thread_safe_print(f" šŸ“ Previous line link: {previous_link}") + + # Find matching previous line in target + match = find_best_toc_match(previous_link, target_lines, previous_line_num) + if match: + thread_safe_print(f" āœ… Found target insertion point after line {match['line_num']}") + + # Process each line in the group + for added_line in group: + added_content = added_line['content'] + if is_toc_translation_needed(added_content): + results['added'].append({ + 'source_line': added_line['line_number'], + 'target_insertion_after': match['line_num'], + 'content': added_content, + 'needs_translation': True + }) + thread_safe_print(f" šŸ“ Added for translation: {added_content.strip()}") + else: + results['added'].append({ + 'source_line': added_line['line_number'], + 'target_insertion_after': match['line_num'], + 'content': added_content, + 'needs_translation': False + }) + thread_safe_print(f" ā­ļø Added without translation: {added_content.strip()}") + else: + thread_safe_print(f" āŒ No target insertion point found for {previous_link}") + else: + thread_safe_print(f" āŒ No link found in previous line: {previous_line_content.strip()}") + + # Process modified lines + modified_groups = group_consecutive_lines(operations['modified_lines']) + for group in modified_groups: + if group: # Skip empty groups + first_modified_line = group[0] + thread_safe_print(f" āœļø Processing modified TOC group starting at line {first_modified_line['line_number']}") + + # Find the previous line in source to determine target location + previous_line_num = first_modified_line['line_number'] - 1 + if previous_line_num > 0 and 
previous_line_num <= len(source_lines): + previous_line_content = source_lines[previous_line_num - 1] + previous_link = extract_toc_link_from_line(previous_line_content) + + if previous_link: + thread_safe_print(f" šŸ“ Previous line link: {previous_link}") + + # Find matching previous line in target + match = find_best_toc_match(previous_link, target_lines, previous_line_num) + if match: + # Process each line in the group + for modified_line in group: + modified_content = modified_line['content'] + if is_toc_translation_needed(modified_content): + results['modified'].append({ + 'source_line': modified_line['line_number'], + 'target_line_context': match['line_num'], + 'content': modified_content, + 'needs_translation': True + }) + thread_safe_print(f" šŸ“ Modified for translation: {modified_content.strip()}") + else: + results['modified'].append({ + 'source_line': modified_line['line_number'], + 'target_line_context': match['line_num'], + 'content': modified_content, + 'needs_translation': False + }) + thread_safe_print(f" ā­ļø Modified without translation: {modified_content.strip()}") + else: + thread_safe_print(f" āŒ No target context found for {previous_link}") + else: + thread_safe_print(f" āŒ No link found in previous line: {previous_line_content.strip()}") + + return results + +def find_toc_modification_line(mod_op, target_lines): + """Find the actual line number to modify in target TOC based on context""" + # This function helps find the exact line to modify in target TOC + # based on the modification operation context + + target_line_context = mod_op.get('target_line_context', 0) + + # Look for the line after the context line that should be modified + # This is a simplified approach - in practice, you might need more sophisticated logic + + if target_line_context > 0 and target_line_context < len(target_lines): + # Check if the next line is the one to modify + return target_line_context + 1 + + return target_line_context + +def translate_toc_lines(toc_operations, ai_client, repo_config): + """Translate multiple TOC lines at once""" + lines_to_translate = [] + + # Collect all lines that need translation + for op in toc_operations: + if op.get('needs_translation', False): + lines_to_translate.append({ + 'operation_type': 'added' if 'target_insertion_after' in op else 'modified', + 'content': op['content'], + 'source_line': op['source_line'] + }) + + if not lines_to_translate: + thread_safe_print(f" ā­ļø No TOC lines need translation") + return {} + + thread_safe_print(f" šŸ¤– Translating {len(lines_to_translate)} TOC lines...") + + # Prepare content for AI translation + content_dict = {} + for i, line_info in enumerate(lines_to_translate): + content_dict[f"line_{i}"] = line_info['content'] + + source_lang = repo_config['source_language'] + target_lang = repo_config['target_language'] + + prompt = f"""You are a professional translator. Please translate the following TOC (Table of Contents) lines from {source_lang} to {target_lang}. + +IMPORTANT INSTRUCTIONS: +1. Preserve ALL formatting, indentation, spaces, and dashes exactly as they appear +2. Only translate the text content within square brackets [text] +3. Keep all markdown links, parentheses, and special characters unchanged +4. Maintain the exact same indentation and spacing structure + +Input lines to translate: +{json.dumps(content_dict, indent=2, ensure_ascii=False)} + +Please return the translated lines in the same JSON format, preserving all formatting and only translating the text within square brackets. 
+ +Return format: +{{ + "line_0": "translated line with preserved formatting", + "line_1": "translated line with preserved formatting" +}}""" + + #print(prompt) #DEBUG + # Add token estimation + try: + from main import print_token_estimation + print_token_estimation(prompt, "TOC translation") + except ImportError: + # Fallback if import fails - use tiktoken + try: + import tiktoken + enc = tiktoken.get_encoding("cl100k_base") + tokens = enc.encode(prompt) + actual_tokens = len(tokens) + char_count = len(prompt) + print(f" šŸ’° TOC translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") + except Exception: + # Final fallback to character approximation + estimated_tokens = len(prompt) // 4 + char_count = len(prompt) + print(f" šŸ’° TOC translation") + print(f" šŸ“ Input: {char_count:,} characters") + print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") + + try: + ai_response = ai_client.chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.1 + ) + #print(ai_response) #DEBUG + thread_safe_print(f" šŸ“ AI translation response received") + + # Parse AI response + try: + json_start = ai_response.find('{') + json_end = ai_response.rfind('}') + 1 + + if json_start != -1 and json_end > json_start: + json_str = ai_response[json_start:json_end] + translated_lines = json.loads(json_str) + + # Map back to original operations + translation_mapping = {} + for i, line_info in enumerate(lines_to_translate): + key = f"line_{i}" + if key in translated_lines: + translation_mapping[line_info['source_line']] = translated_lines[key] + + thread_safe_print(f" āœ… Successfully translated {len(translation_mapping)} TOC lines") + return translation_mapping + + except json.JSONDecodeError as e: + thread_safe_print(f" āŒ Failed to parse AI translation response: {e}") + return {} + + except Exception as e: + thread_safe_print(f" āŒ AI translation failed: {e}") + return {} + +def process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config): + """Process a single TOC.md file with special logic""" + thread_safe_print(f"\nšŸ“‹ Processing TOC file: {file_path}") + + try: + target_local_path = repo_config['target_local_path'] + target_file_path = os.path.join(target_local_path, file_path) + + # Read current target file + with open(target_file_path, 'r', encoding='utf-8') as f: + target_content = f.read() + + target_lines = target_content.split('\n') + operations = toc_data['operations'] + + # Separate operations by type + deleted_ops = [op for op in operations if 'target_line' in op] + added_ops = [op for op in operations if 'target_insertion_after' in op] + modified_ops = [op for op in operations if 'target_line_context' in op] + + thread_safe_print(f" šŸ“Š TOC operations: {len(deleted_ops)} deleted, {len(added_ops)} added, {len(modified_ops)} modified") + + # Process deletions first (work backwards to maintain line numbers) + if deleted_ops: + thread_safe_print(f" šŸ—‘ļø Processing {len(deleted_ops)} deletions...") + deleted_ops.sort(key=lambda x: x['target_line'], reverse=True) + + for del_op in deleted_ops: + target_line_num = del_op['target_line'] - 1 # Convert to 0-based + if 0 <= target_line_num < len(target_lines): + thread_safe_print(f" āŒ Deleting line {del_op['target_line']}: {target_lines[target_line_num].strip()}") + del target_lines[target_line_num] + + # Process modifications + if modified_ops: + thread_safe_print(f" āœļø 
Processing {len(modified_ops)} modifications...") + + # Get translations for operations that need them + translations = translate_toc_lines(modified_ops, ai_client, repo_config) + + for mod_op in modified_ops: + target_line_num = find_toc_modification_line(mod_op, target_lines) - 1 # Convert to 0-based + + if 0 <= target_line_num < len(target_lines): + if mod_op.get('needs_translation', False) and mod_op['source_line'] in translations: + new_content = translations[mod_op['source_line']] + thread_safe_print(f" āœļø Modifying line {target_line_num + 1} with translation") + else: + new_content = mod_op['content'] + thread_safe_print(f" āœļø Modifying line {target_line_num + 1} without translation") + + target_lines[target_line_num] = new_content + + # Process additions last + if added_ops: + thread_safe_print(f" āž• Processing {len(added_ops)} additions...") + + # Get translations for operations that need them + translations = translate_toc_lines(added_ops, ai_client, repo_config) + + # Group additions by insertion point and process in reverse order + added_ops.sort(key=lambda x: x['target_insertion_after'], reverse=True) + + for add_op in added_ops: + insertion_after = add_op['target_insertion_after'] + + if add_op.get('needs_translation', False) and add_op['source_line'] in translations: + new_content = translations[add_op['source_line']] + thread_safe_print(f" āž• Inserting after line {insertion_after} with translation") + else: + new_content = add_op['content'] + thread_safe_print(f" āž• Inserting after line {insertion_after} without translation") + + # Insert the new line + if insertion_after < len(target_lines): + target_lines.insert(insertion_after, new_content) + else: + target_lines.append(new_content) + + # Write updated content back to file + updated_content = '\n'.join(target_lines) + with open(target_file_path, 'w', encoding='utf-8') as f: + f.write(updated_content) + + thread_safe_print(f" āœ… TOC file updated: {file_path}") + + except Exception as e: + thread_safe_print(f" āŒ Error processing TOC file {file_path}: {e}") + +def process_toc_files(toc_files, pr_url, github_client, ai_client, repo_config): + """Process all TOC files""" + if not toc_files: + return + + thread_safe_print(f"\nšŸ“‹ Processing {len(toc_files)} TOC files...") + + for file_path, toc_data in toc_files.items(): + if toc_data['type'] == 'toc': + process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config) + else: + thread_safe_print(f" āš ļø Unknown TOC data type: {toc_data['type']} for {file_path}") + + thread_safe_print(f" āœ… All TOC files processed") From 4fce98fafd49946a331efc8de10321416debf0aa Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Thu, 25 Sep 2025 17:58:58 +0800 Subject: [PATCH 08/18] Discard changes to system-variables.md --- system-variables.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/system-variables.md b/system-variables.md index f430c634edda5..d6bd2b52f1e0a 100644 --- a/system-variables.md +++ b/system-variables.md @@ -1727,7 +1727,6 @@ mysql> SELECT job_info FROM mysql.analyze_jobs ORDER BY end_time DESC LIMIT 1; - If `tidb_ddl_enable_fast_reorg` is set to `OFF`, `ADD INDEX` is executed as a transaction. If there are many update operations such as `UPDATE` and `REPLACE` in the target columns during the `ADD INDEX` execution, a larger batch size indicates a larger probability of transaction conflicts. In this case, it is recommended that you set the batch size to a smaller value. The minimum value is 32. 
- If the transaction conflict does not exist, or if `tidb_ddl_enable_fast_reorg` is set to `ON`, you can set the batch size to a large value. This makes data backfilling faster but also increases the write pressure on TiKV. For a proper batch size, you also need to refer to the value of `tidb_ddl_reorg_worker_cnt`. See [Interaction Test on Online Workloads and `ADD INDEX` Operations](https://docs.pingcap.com/tidb/dev/online-workloads-and-add-index-operations) for reference. - Starting from v8.3.0, this parameter is supported at the SESSION level. Modifying the parameter at the GLOBAL level will not impact currently running DDL statements. It will only apply to DDLs submitted in new sessions. - - Starting from v8.5.0, you can modify this parameter for a running DDL job by executing `ADMIN ALTER DDL JOBS BATCH_SIZE = ;`. Note that this operation is not supported for `ADD INDEX` DDL when [`tidb_enable_dist_task`](/system-variables.md#tidb_enable_dist_task-new-in-v710) is enabled. For details, see [`ADMIN ALTER DDL JOBS`](/sql-statements/sql-statement-admin-alter-ddl.md). ### tidb_ddl_reorg_priority @@ -1779,7 +1778,6 @@ Assume that you have a cluster with 4 TiDB nodes and multiple TiKV nodes. In thi - Unit: Threads - This variable is used to set the concurrency of the DDL operation in the `re-organize` phase. - Starting from v8.3.0, this parameter is supported at the SESSION level. Modifying the parameter at the GLOBAL level will not impact currently running DDL statements. It will only apply to DDLs submitted in new sessions. -- Starting from v8.5.0, you can modify this parameter for a running DDL job by executing `ADMIN ALTER DDL JOBS THREAD = ;`. Note that this operation is not supported for `ADD INDEX` DDL when [`tidb_enable_dist_task`](/system-variables.md#tidb_enable_dist_task-new-in-v710) is enabled. For details, see [`ADMIN ALTER DDL JOBS`](/sql-statements/sql-statement-admin-alter-ddl.md). ### `tidb_enable_fast_create_table` New in v8.0.0 From e68d1c2301c91c0481c18f76e212b9b4bb341cc4 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Thu, 25 Sep 2025 17:59:07 +0800 Subject: [PATCH 09/18] Discard changes to ticdc/ticdc-faq.md --- ticdc/ticdc-faq.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ticdc/ticdc-faq.md b/ticdc/ticdc-faq.md index df072430c0941..389ce8f58a4bd 100644 --- a/ticdc/ticdc-faq.md +++ b/ticdc/ticdc-faq.md @@ -407,7 +407,7 @@ If the downstream is a TiDB cluster or MySQL instance, it is recommended that yo ## Replication of a single table can only be run on a single TiCDC node. Will it be possible to use multiple TiCDC nodes to replicate data of multiple tables? -Starting from v7.1.0, TiCDC supports the MQ sink to replicate data change logs at the granularity of TiKV Regions, which achieves scalable processing capability and allows TiCDC to replicate a single table with a large number of Regions. To enable this feature, you can configure the following parameter in the [TiCDC changefeed configuration file](/ticdc/ticdc-changefeed-config.md): +Starting from v7.1.0, TiCDC supports the MQ sink to replicate data change logs at the granularity of TiKV Regions, which achieves scalable processing capability and allows TiCDC to replicate a single table with a large number of Regions. 
To enable this feature, you can configure the following parameter in the [TiCDC configuration file](/ticdc/ticdc-changefeed-config.md): ```toml [scheduler] From 88d98644e93e386d9a1f14abf9be1981f1e67482 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Thu, 25 Sep 2025 17:59:16 +0800 Subject: [PATCH 10/18] Discard changes to tikv-configuration-file.md --- tikv-configuration-file.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tikv-configuration-file.md b/tikv-configuration-file.md index 53c0422d29c17..fafbcee45657d 100644 --- a/tikv-configuration-file.md +++ b/tikv-configuration-file.md @@ -2709,8 +2709,8 @@ TiKV MVCC in-memory engine (IME) configuration items related to the storage laye > + After the in-memory engine is enabled, `block-cache.capacity` automatically decreases by 10%. > + If you manually configure `capacity`, `block-cache.capacity` does not automatically decrease. In this case, you need to manually adjust its value to avoid OOM. -+ Controls the maximum memory size that the [TiKV MVCC in-memory engine](/tikv-in-memory-engine.md) can use. The memory capacity determines the number of Regions that can be cached. When the capacity is full, the in-memory engine loads new Regions and evicts cached Regions based on the redundancy of Region MVCC. -+ Default value: `min(10% of the total system memory, 5 GiB)` ++ Controls the maximum memory size that the in-memory engine can use. The maximum value is 5 GiB. You can manually configure it to use more memory. ++ Default value: 10% of the system memory. ### `gc-run-interval` New in v8.5.0 From e7ed7d1c42e0b477b15fe6192aadce825fe4651f Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Fri, 26 Sep 2025 09:57:31 +0800 Subject: [PATCH 11/18] Update main_workflow.py --- scripts/translate_doc_pr/main_workflow.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/scripts/translate_doc_pr/main_workflow.py b/scripts/translate_doc_pr/main_workflow.py index 12260334ec206..c76952ad01994 100644 --- a/scripts/translate_doc_pr/main_workflow.py +++ b/scripts/translate_doc_pr/main_workflow.py @@ -12,7 +12,7 @@ # Conditional import for Gemini try: - from google import genai + import google.generativeai as genai GEMINI_AVAILABLE = True except ImportError: GEMINI_AVAILABLE = False @@ -154,10 +154,10 @@ def __init__(self, provider="deepseek"): self.model = "deepseek-chat" elif provider == "gemini": if not GEMINI_AVAILABLE: - raise ImportError("google.generativeai package not installed. Run: pip install google-generativeai") + raise ImportError("google-generativeai package not installed. 
Run: pip install google-generativeai") if not GEMINI_API_KEY: raise ValueError("GEMINI_API_TOKEN environment variable must be set") - self.client = genai.Client(api_key=GEMINI_API_KEY) + genai.configure(api_key=GEMINI_API_KEY) self.model = GEMINI_MODEL_NAME else: raise ValueError(f"Unsupported AI provider: {provider}") @@ -178,11 +178,9 @@ def chat_completion(self, messages, temperature=0.1, max_tokens=20000): prompt = self._convert_messages_to_prompt(messages) thread_safe_print(f" šŸ”„ Calling Gemini API...") - # Use the correct Gemini API call format (based on your reference file) - response = self.client.models.generate_content( - model=self.model, - contents=prompt - ) + # Use the correct Gemini API call format + model = genai.GenerativeModel(self.model) + response = model.generate_content(prompt) if response and response.text: thread_safe_print(f" āœ… Gemini response received") From 11efdefa1a352d8e27d8efa9d6c0acf70313cd4c Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Fri, 26 Sep 2025 09:57:51 +0800 Subject: [PATCH 12/18] Update sync-docs-cn-to-en.yml --- .github/workflows/sync-docs-cn-to-en.yml | 30 ++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/.github/workflows/sync-docs-cn-to-en.yml b/.github/workflows/sync-docs-cn-to-en.yml index 5fe0aa9e3913b..9a6865106d047 100644 --- a/.github/workflows/sync-docs-cn-to-en.yml +++ b/.github/workflows/sync-docs-cn-to-en.yml @@ -91,6 +91,7 @@ jobs: git config user.email "github-actions[bot]@users.noreply.github.com" - name: Run sync script + id: sync_script env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} DEEPSEEK_API_TOKEN: ${{ secrets.DEEPSEEK_API_TOKEN }} @@ -101,9 +102,17 @@ jobs: TARGET_REPO_PATH: ${{ github.workspace }}/target_repo run: | cd scripts/translate_doc_pr - python main_workflow.py + if python main_workflow.py; then + echo "sync_success=true" >> $GITHUB_OUTPUT + echo "āœ… Sync script completed successfully" + else + echo "sync_success=false" >> $GITHUB_OUTPUT + echo "āŒ Sync script failed" + exit 1 + fi - name: Commit and push changes + if: steps.sync_script.outputs.sync_success == 'true' run: | cd target_repo git add . 
@@ -122,13 +131,26 @@ jobs: echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}" fi - - name: Add comment to target PR + - name: Add success comment to target PR + if: steps.sync_script.outputs.sync_success == 'true' + run: | + # Add a comment to the target PR about the sync success + curl -X POST \ + -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ + -H "Accept: application/vnd.github.v3+json" \ + "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \ + -d "{ + \"body\": \"šŸ¤– **Auto-sync completed successfully**\\n\\nšŸ“„ **Source PR**: ${{ github.event.inputs.source_pr_url }}\\nšŸŽÆ **Target PR**: ${{ github.event.inputs.target_pr_url }}\\nāœ… English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\" + }" + + - name: Add failure comment to target PR + if: steps.sync_script.outputs.sync_success == 'false' run: | - # Add a comment to the target PR about the sync + # Add a comment to the target PR about the sync failure curl -X POST \ -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" \ -H "Accept: application/vnd.github.v3+json" \ "https://api.github.com/repos/${{ steps.extract_info.outputs.target_owner }}/${{ steps.extract_info.outputs.target_repo }}/issues/${{ steps.extract_info.outputs.target_pr }}/comments" \ -d "{ - \"body\": \"šŸ¤– **Auto-sync completed**\\n\\nšŸ“„ **Source PR**: ${{ github.event.inputs.source_pr_url }}\\nšŸŽÆ **Target PR**: ${{ github.event.inputs.target_pr_url }}\\nāœ… English documentation has been updated based on Chinese documentation changes.\\n\\n_This comment was generated automatically by the sync workflow._\" + \"body\": \"šŸ¤– **Auto-sync failed**\\n\\nšŸ“„ **Source PR**: ${{ github.event.inputs.source_pr_url }}\\nšŸŽÆ **Target PR**: ${{ github.event.inputs.target_pr_url }}\\nāŒ The sync process encountered an error. 
Please check the workflow logs for details.\\n\\n_This comment was generated automatically by the sync workflow._\" }" From ebe65dda85502fba4a4cd9a84a3e62c2d0e9a297 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 21 Oct 2025 18:44:38 +0800 Subject: [PATCH 13/18] Delete scripts/translate_doc_pr directory --- scripts/translate_doc_pr/__init__.py | 22 - scripts/translate_doc_pr/file_adder.py | 193 --- scripts/translate_doc_pr/file_deleter.py | 45 - scripts/translate_doc_pr/file_updater.py | 1692 ------------------- scripts/translate_doc_pr/main_workflow.py | 689 -------- scripts/translate_doc_pr/pr_analyzer.py | 1447 ---------------- scripts/translate_doc_pr/requirements.txt | 4 - scripts/translate_doc_pr/section_matcher.py | 973 ----------- scripts/translate_doc_pr/toc_processor.py | 434 ----- 9 files changed, 5499 deletions(-) delete mode 100644 scripts/translate_doc_pr/__init__.py delete mode 100644 scripts/translate_doc_pr/file_adder.py delete mode 100644 scripts/translate_doc_pr/file_deleter.py delete mode 100644 scripts/translate_doc_pr/file_updater.py delete mode 100644 scripts/translate_doc_pr/main_workflow.py delete mode 100644 scripts/translate_doc_pr/pr_analyzer.py delete mode 100644 scripts/translate_doc_pr/requirements.txt delete mode 100644 scripts/translate_doc_pr/section_matcher.py delete mode 100644 scripts/translate_doc_pr/toc_processor.py diff --git a/scripts/translate_doc_pr/__init__.py b/scripts/translate_doc_pr/__init__.py deleted file mode 100644 index b272696e2e394..0000000000000 --- a/scripts/translate_doc_pr/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -#!/usr/bin/env python3 -""" -Auto-Sync PR Changes - Refactored Modular Version - -This package contains the refactored version of the auto-sync-pr-changes script, -split into logical modules for better maintainability and testing. 
- -Modules: -- pr_analyzer: PR analysis, diff parsing, content getting, hierarchy building -- section_matcher: Section matching (direct matching + AI matching) -- file_adder: New file processing and translation -- file_deleter: Deleted file processing -- file_updater: Updated file processing and translation -- toc_processor: TOC file special processing -- main: Main orchestration function -""" - -# Import main functionality for easy access -from main import main - -# Make main function available at package level -__all__ = ["main"] diff --git a/scripts/translate_doc_pr/file_adder.py b/scripts/translate_doc_pr/file_adder.py deleted file mode 100644 index 57e93b2fb1c63..0000000000000 --- a/scripts/translate_doc_pr/file_adder.py +++ /dev/null @@ -1,193 +0,0 @@ -""" -File Adder Module -Handles processing and translation of newly added files -""" - -import os -import re -import json -import threading -from github import Github -from openai import OpenAI - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def create_section_batches(file_content, max_lines_per_batch=200): - """Create batches of file content for translation, respecting section boundaries""" - lines = file_content.split('\n') - - # Find all section headers - section_starts = [] - for i, line in enumerate(lines): - line = line.strip() - if line.startswith('#'): - match = re.match(r'^(#{1,10})\s+(.+)', line) - if match: - section_starts.append(i + 1) # 1-based line numbers - - # If no sections found, just batch by line count - if not section_starts: - batches = [] - for i in range(0, len(lines), max_lines_per_batch): - batch_lines = lines[i:i + max_lines_per_batch] - batches.append('\n'.join(batch_lines)) - return batches - - # Create batches respecting section boundaries - batches = [] - current_batch_start = 0 - - for i, section_start in enumerate(section_starts): - section_start_idx = section_start - 1 # Convert to 0-based - - # Check if adding this section would exceed the line limit - if (section_start_idx - current_batch_start) > max_lines_per_batch: - # Close current batch at the previous section boundary - if current_batch_start < section_start_idx: - batch_lines = lines[current_batch_start:section_start_idx] - batches.append('\n'.join(batch_lines)) - current_batch_start = section_start_idx - - # If this is the last section, or the next section would create a batch too large - if i == len(section_starts) - 1: - # Add remaining content as final batch - batch_lines = lines[current_batch_start:] - batches.append('\n'.join(batch_lines)) - else: - next_section_start = section_starts[i + 1] - 1 # 0-based - if (next_section_start - current_batch_start) > max_lines_per_batch: - # Close current batch at current section boundary - batch_lines = lines[current_batch_start:section_start_idx] - if batch_lines: # Only add non-empty batches - batches.append('\n'.join(batch_lines)) - current_batch_start = section_start_idx - - # Clean up any empty batches - batches = [batch for batch in batches if batch.strip()] - - return batches - -def translate_file_batch(batch_content, ai_client, source_language="English", target_language="Chinese"): - """Translate a single batch of file content using AI""" - if not batch_content.strip(): - return batch_content - - thread_safe_print(f" šŸ¤– Translating batch ({len(batch_content.split())} words)...") - - prompt = f"""You are a professional technical writer. 
Please translate the following {source_language} content to {target_language}. - -IMPORTANT INSTRUCTIONS: -1. Preserve ALL Markdown formatting (headers, links, code blocks, tables, etc.) -2. Do NOT translate: - - Code examples, SQL queries, configuration values - - Technical terms like "TiDB", "TiKV", "PD", API names, etc. - - File paths, URLs, and command line examples - - Variable names and system configuration parameters -3. Translate only the descriptive text and explanations -4. Maintain the exact structure and indentation -5. Keep all special characters and formatting intact - -Content to translate: -{batch_content} - -Please provide the translated content maintaining all formatting and structure.""" - - # Add token estimation - try: - from main import print_token_estimation - print_token_estimation(prompt, "File addition translation") - except ImportError: - # Fallback if import fails - use tiktoken - try: - import tiktoken - enc = tiktoken.get_encoding("cl100k_base") - tokens = enc.encode(prompt) - actual_tokens = len(tokens) - char_count = len(prompt) - print(f" šŸ’° File addition translation") - print(f" šŸ“ Input: {char_count:,} characters") - print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") - except Exception: - # Final fallback to character approximation - estimated_tokens = len(prompt) // 4 - char_count = len(prompt) - print(f" šŸ’° File addition translation") - print(f" šŸ“ Input: {char_count:,} characters") - print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") - - try: - translated_content = ai_client.chat_completion( - messages=[{"role": "user", "content": prompt}], - temperature=0.1 - ) - thread_safe_print(f" āœ… Batch translation completed") - return translated_content - - except Exception as e: - thread_safe_print(f" āŒ Batch translation failed: {e}") - return batch_content # Return original content if translation fails - -def process_added_files(added_files, pr_url, github_client, ai_client, repo_config): - """Process newly added files by translating and creating them in target repository""" - if not added_files: - thread_safe_print("\nšŸ“„ No new files to process") - return - - thread_safe_print(f"\nšŸ“„ Processing {len(added_files)} newly added files...") - - target_local_path = repo_config['target_local_path'] - source_language = repo_config['source_language'] - target_language = repo_config['target_language'] - - for file_path, file_content in added_files.items(): - thread_safe_print(f"\nšŸ“ Processing new file: {file_path}") - - # Create target file path - target_file_path = os.path.join(target_local_path, file_path) - target_dir = os.path.dirname(target_file_path) - - # Create directory if it doesn't exist - if not os.path.exists(target_dir): - os.makedirs(target_dir, exist_ok=True) - thread_safe_print(f" šŸ“ Created directory: {target_dir}") - - # Check if file already exists - if os.path.exists(target_file_path): - thread_safe_print(f" āš ļø Target file already exists: {target_file_path}") - continue - - # Create section batches for translation - batches = create_section_batches(file_content, max_lines_per_batch=200) - thread_safe_print(f" šŸ“¦ Created {len(batches)} batches for translation") - - # Translate each batch - translated_batches = [] - for i, batch in enumerate(batches): - thread_safe_print(f" šŸ”„ Processing batch {i+1}/{len(batches)}") - translated_batch = translate_file_batch( - batch, - ai_client, - source_language, - target_language - ) - 
translated_batches.append(translated_batch) - - # Combine translated batches - translated_content = '\n'.join(translated_batches) - - # Write translated content to target file - try: - with open(target_file_path, 'w', encoding='utf-8') as f: - f.write(translated_content) - - thread_safe_print(f" āœ… Created translated file: {target_file_path}") - - except Exception as e: - thread_safe_print(f" āŒ Error creating file {target_file_path}: {e}") - - thread_safe_print(f"\nāœ… Completed processing all new files") diff --git a/scripts/translate_doc_pr/file_deleter.py b/scripts/translate_doc_pr/file_deleter.py deleted file mode 100644 index c2064fe568cf3..0000000000000 --- a/scripts/translate_doc_pr/file_deleter.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -File Deleter Module -Handles processing of deleted files and deleted sections -""" - -import os -import threading -from github import Github - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def process_deleted_files(deleted_files, github_client, repo_config): - """Process deleted files by removing them from target repository""" - if not deleted_files: - thread_safe_print("\nšŸ—‘ļø No files to delete") - return - - thread_safe_print(f"\nšŸ—‘ļø Processing {len(deleted_files)} deleted files...") - - target_local_path = repo_config['target_local_path'] - - for file_path in deleted_files: - thread_safe_print(f"\nšŸ—‘ļø Processing deleted file: {file_path}") - - # Create target file path - target_file_path = os.path.join(target_local_path, file_path) - - # Check if file exists in target - if os.path.exists(target_file_path): - try: - os.remove(target_file_path) - thread_safe_print(f" āœ… Deleted file: {target_file_path}") - except Exception as e: - thread_safe_print(f" āŒ Error deleting file {target_file_path}: {e}") - else: - thread_safe_print(f" āš ļø Target file not found: {target_file_path}") - - thread_safe_print(f"\nāœ… Completed processing deleted files") - -# Section deletion logic moved to file_updater.py diff --git a/scripts/translate_doc_pr/file_updater.py b/scripts/translate_doc_pr/file_updater.py deleted file mode 100644 index 82addd7cc6881..0000000000000 --- a/scripts/translate_doc_pr/file_updater.py +++ /dev/null @@ -1,1692 +0,0 @@ -""" -File Updater Module -Handles processing and translation of updated files and sections -""" - -import os -import re -import json -import threading -from concurrent.futures import ThreadPoolExecutor -from github import Github -from openai import OpenAI - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, source_language, target_language, target_file_name=None): - """Use AI to update target sections based on source old content, PR diff, and target sections""" - if not source_old_content_dict or not target_sections: - return {} - - # Filter out deleted sections and prepare source sections from old content - source_sections = {} - for key, old_content in source_old_content_dict.items(): - # Skip deleted sections - if 'deleted' in key: - continue - - # Handle null values by using empty string - content = old_content if old_content is not None else "" - source_sections[key] = content - - # Keep the original order from match_source_diff_to_target.json (no sorting needed) - formatted_source_sections = json.dumps(source_sections, 
ensure_ascii=False, indent=2) - formatted_target_sections = json.dumps(target_sections, ensure_ascii=False, indent=2) - - thread_safe_print(f" šŸ“Š Source sections: {len(source_sections)} sections") - thread_safe_print(f" šŸ“Š Target sections: {len(target_sections)} sections") - - # Calculate total content size - total_source_chars = sum(len(str(content)) for content in source_sections.values()) - total_target_chars = sum(len(str(content)) for content in target_sections.values()) - thread_safe_print(f" šŸ“ Content size: Source={total_source_chars:,} chars, Target={total_target_chars:,} chars") - - thread_safe_print(f" šŸ¤– Getting AI translation for {len(source_sections)} sections...") - - diff_content = source_sections - - prompt = f"""You are a professional technical writer in the Database domain. I will provide you with: - -1. Source sections in {source_language}: -{formatted_source_sections} - -2. GitHub PR changes (Diff): -{pr_diff} - -3. Current target sections in {target_language}: -{formatted_target_sections} - -Task: Update the target sections in {target_language} according to the diff in {source_language}. - -Instructions: -1. Carefully analyze the PR diff to understand what changes were made (additions, deletions, modifications) -2. Find the corresponding positions in the {target_language} sections and make the same changes. Do not change any content that is not modified in the diff, especially the format. -3. Keep the JSON structure unchanged, only modify the section content -4. Ensure the updated {target_language} content is logically consistent with the {source_language} changes -5. Maintain proper technical writing style and terminology in {target_language}. If a sentence in the diff is unchanged in content but only reordered in {source_language}, reuse its existing translation in {target_language}. 
- -Please return the complete updated JSON in the same format as target sections, without any additional explanatory text.""" - - # Save prompt to file for reference with target file prefix - target_file_prefix = "unknown" - if target_file_name: - # Use provided target file name - target_file_prefix = target_file_name.replace('/', '_').replace('.md', '') - elif target_sections: - # Try to extract filename from the first section key or content - first_key = next(iter(target_sections.keys()), "") - if "_" in first_key: - # If key contains underscore, it might have target file info - parts = first_key.split("_") - if len(parts) > 1: - target_file_prefix = parts[0] - - # Ensure temp_output directory exists - script_dir = os.path.dirname(os.path.abspath(__file__)) - temp_dir = os.path.join(script_dir, "temp_output") - os.makedirs(temp_dir, exist_ok=True) - - prompt_file = os.path.join(temp_dir, f"{target_file_prefix}_prompt-for-ai-translation.txt") - with open(prompt_file, 'w', encoding='utf-8') as f: - f.write(prompt) - - thread_safe_print(f"\nšŸ’¾ Prompt saved to {prompt_file}") - thread_safe_print(f"šŸ“ Prompt length: {len(prompt)} characters") - thread_safe_print(f"šŸ“Š Source sections: {len(source_sections)}") - thread_safe_print(f"šŸ“Š Target sections: {len(target_sections)}") - thread_safe_print(f"šŸ¤– Sending prompt to AI...") - - thread_safe_print(f"\n šŸ“¤ AI Update Prompt ({source_language} → {target_language}):") - thread_safe_print(f" " + "="*80) - thread_safe_print(f" Source Sections: {formatted_source_sections[:500]}...") - thread_safe_print(f" PR Diff (first 500 chars): {pr_diff[:500]}...") - thread_safe_print(f" Target Sections: {formatted_target_sections[:500]}...") - thread_safe_print(f" " + "="*80) - - try: - from main import print_token_estimation - print_token_estimation(prompt, f"Document translation ({source_language} → {target_language})") - except ImportError: - # Fallback if import fails - use tiktoken - try: - import tiktoken - enc = tiktoken.get_encoding("cl100k_base") - tokens = enc.encode(prompt) - actual_tokens = len(tokens) - char_count = len(prompt) - thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") - thread_safe_print(f" šŸ“ Input: {char_count:,} characters") - thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") - except Exception: - # Final fallback to character approximation - estimated_tokens = len(prompt) // 4 - char_count = len(prompt) - thread_safe_print(f" šŸ’° Document translation ({source_language} → {target_language})") - thread_safe_print(f" šŸ“ Input: {char_count:,} characters") - thread_safe_print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") - - try: - ai_response = ai_client.chat_completion( - messages=[{"role": "user", "content": prompt}], - temperature=0.1 - ) - thread_safe_print(f" šŸ“ AI translation response received") - thread_safe_print(f" šŸ“‹ AI response (first 500 chars): {ai_response[:500]}...") - - result = parse_updated_sections(ai_response) - thread_safe_print(f" šŸ“Š Parsed {len(result)} sections from AI response") - - # Save AI results to file with target file prefix - ai_results_file = os.path.join(temp_dir, f"{target_file_prefix}_updated_sections_from_ai.json") - with open(ai_results_file, 'w', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=2) - - thread_safe_print(f" šŸ’¾ AI results saved to {ai_results_file}") - return result - - except Exception as e: - thread_safe_print(f" 
āŒ AI translation failed: {e}") - return {} - -def parse_updated_sections(ai_response): - """Parse AI response and extract JSON (from get-updated-target-sections.py)""" - # Ensure temp_output directory exists for debug files - script_dir = os.path.dirname(os.path.abspath(__file__)) - temp_dir = os.path.join(script_dir, "temp_output") - os.makedirs(temp_dir, exist_ok=True) - - try: - print(f"\n šŸ”§ Parsing AI response...") - print(f" Raw response length: {len(ai_response)} characters") - - # Try to extract JSON from AI response - cleaned_response = ai_response.strip() - - # Remove markdown code blocks if present - if cleaned_response.startswith('```json'): - cleaned_response = cleaned_response[7:] - print(f" šŸ“ Removed '```json' prefix") - elif cleaned_response.startswith('```'): - cleaned_response = cleaned_response[3:] - print(f" šŸ“ Removed '```' prefix") - - if cleaned_response.endswith('```'): - cleaned_response = cleaned_response[:-3] - print(f" šŸ“ Removed '```' suffix") - - cleaned_response = cleaned_response.strip() - - print(f" šŸ“ Cleaned response length: {len(cleaned_response)} characters") - print(f" šŸ“ First 200 chars: {cleaned_response[:200]}...") - print(f" šŸ“ Last 200 chars: ...{cleaned_response[-200:]}") - - # Try to find JSON content between curly braces - start_idx = cleaned_response.find('{') - end_idx = cleaned_response.rfind('}') - - if start_idx != -1 and end_idx != -1 and end_idx > start_idx: - json_content = cleaned_response[start_idx:end_idx+1] - print(f" šŸ“ Extracted JSON content length: {len(json_content)} characters") - - try: - # Parse JSON - updated_sections = json.loads(json_content) - print(f" āœ… Successfully parsed JSON with {len(updated_sections)} sections") - return updated_sections - except json.JSONDecodeError as e: - print(f" āš ļø JSON seems incomplete, trying to fix...") - - # Try to fix incomplete JSON by finding the last complete entry - lines = json_content.split('\n') - fixed_lines = [] - in_value = False - quote_count = 0 - - for line in lines: - if '"' in line: - quote_count += line.count('"') - - fixed_lines.append(line) - - # If we have an even number of quotes, we might have a complete entry - if quote_count % 2 == 0 and (line.strip().endswith(',') or line.strip().endswith('"')): - # Try to parse up to this point - potential_json = '\n'.join(fixed_lines) - if not potential_json.rstrip().endswith('}'): - # Remove trailing comma and add closing brace - if potential_json.rstrip().endswith(','): - potential_json = potential_json.rstrip()[:-1] + '\n}' - else: - potential_json += '\n}' - - try: - partial_sections = json.loads(potential_json) - print(f" šŸ”§ Fixed JSON with {len(partial_sections)} sections") - return partial_sections - except: - continue - - # If all else fails, return the original error - raise e - else: - print(f" āŒ Could not find valid JSON structure in response") - return None - - except json.JSONDecodeError as e: - print(f" āŒ Error parsing AI response as JSON: {e}") - print(f" šŸ“ Error at position: {e.pos if hasattr(e, 'pos') else 'unknown'}") - - # Save debug info - debug_file = os.path.join(temp_dir, f"ai_response_debug_{os.getpid()}.txt") - with open(debug_file, 'w', encoding='utf-8') as f: - f.write("Original AI Response:\n") - f.write("="*80 + "\n") - f.write(ai_response) - f.write("\n" + "="*80 + "\n") - f.write("Cleaned Response:\n") - f.write("-"*80 + "\n") - f.write(cleaned_response if 'cleaned_response' in locals() else "Not available") - - print(f" šŸ“ Debug info saved to: {debug_file}") - 
return None - except Exception as e: - print(f" āŒ Unexpected error parsing AI response: {e}") - return None - - -def replace_frontmatter_content(lines, new_content): - """Replace content from beginning of file to first top-level header""" - # Find the first top-level header - first_header_idx = None - for i, line in enumerate(lines): - if line.strip().startswith('# '): - first_header_idx = i - break - - if first_header_idx is None: - # No top-level header found, replace entire content - return new_content.split('\n') - - # Replace content from start to before first header - new_lines = new_content.split('\n') - return new_lines + lines[first_header_idx:] - - -def replace_toplevel_section_content(lines, target_line_num, new_content): - """Replace content from top-level header to first next-level header""" - start_idx = target_line_num - 1 # Convert to 0-based index - - # Find the end of top-level section (before first ## header) - end_idx = len(lines) - for i in range(start_idx + 1, len(lines)): - line = lines[i].strip() - if line.startswith('##'): # Found first next-level header - end_idx = i - break - - # Replace the top-level section content (from start_idx to end_idx) - new_lines = new_content.split('\n') - return lines[:start_idx] + new_lines + lines[end_idx:] - - -def update_local_document(file_path, updated_sections, hierarchy_dict, target_local_path): - """Update local document using hierarchy-based section identification (from update-target-doc-v2.py)""" - local_path = os.path.join(target_local_path, file_path) - - if not os.path.exists(local_path): - print(f" āŒ Local file not found: {local_path}") - return False - - try: - # Read document content - with open(local_path, 'r', encoding='utf-8') as f: - document_content = f.read() - - lines = document_content.split('\n') - - replacements_made = [] - - # Use a unified approach: build a complete replacement plan first, then execute it - # This avoids line number shifts during the replacement process - - # Find section boundaries for ALL sections - section_boundaries = find_section_boundaries(lines, hierarchy_dict) - - # Create a comprehensive replacement plan - replacement_plan = [] - - for line_num, new_content in updated_sections.items(): - if line_num == "0": - # Special handling for frontmatter - first_header_idx = None - for i, line in enumerate(lines): - if line.strip().startswith('# '): - first_header_idx = i - break - - replacement_plan.append({ - 'type': 'frontmatter', - 'start': 0, - 'end': first_header_idx if first_header_idx else len(lines), - 'new_content': new_content, - 'line_num': line_num - }) - - elif line_num in hierarchy_dict: - hierarchy = hierarchy_dict[line_num] - if ' > ' not in hierarchy: # Top-level section - # Special handling for top-level sections - start_idx = int(line_num) - 1 - end_idx = len(lines) - for i in range(start_idx + 1, len(lines)): - line = lines[i].strip() - if line.startswith('##'): - end_idx = i - break - - replacement_plan.append({ - 'type': 'toplevel', - 'start': start_idx, - 'end': end_idx, - 'new_content': new_content, - 'line_num': line_num - }) - else: - # Regular section - if line_num in section_boundaries: - boundary = section_boundaries[line_num] - replacement_plan.append({ - 'type': 'regular', - 'start': boundary['start'], - 'end': boundary['end'], - 'new_content': new_content, - 'line_num': line_num, - 'hierarchy': boundary['hierarchy'] - }) - else: - print(f" āš ļø Section at line {line_num} not found in hierarchy") - - # Sort replacement plan: process from bottom to top of 
the document to avoid line shifts - # Sort by start line in reverse order (highest line number first) - replacement_plan.sort(key=lambda x: -x['start']) - - # Execute replacements in the planned order (from bottom to top) - print(f" šŸ“‹ Executing {len(replacement_plan)} replacements from bottom to top:") - for i, replacement in enumerate(replacement_plan): - print(f" {i+1}. {replacement['type']} (line {replacement.get('line_num', '0')}, start: {replacement['start']})") - - for replacement in replacement_plan: - start = replacement['start'] - end = replacement['end'] - new_content = replacement['new_content'] - new_lines = new_content.split('\n') - - # Replace the content - lines = lines[:start] + new_lines + lines[end:] - - # Record the replacement - original_line_count = end - start - line_diff = len(new_lines) - original_line_count - - replacements_made.append({ - 'type': replacement['type'], - 'line_num': replacement.get('line_num', 'N/A'), - 'hierarchy': replacement.get('hierarchy', 'N/A'), - 'start': start, - 'end': end, - 'original_lines': original_line_count, - 'new_lines': len(new_lines), - 'line_diff': line_diff - }) - - print(f" āœ… Updated {replacement['type']} section: {replacement.get('line_num', 'frontmatter')}") - - # Save updated document - with open(local_path, 'w', encoding='utf-8') as f: - f.write('\n'.join(lines)) - - print(f" āœ… Updated {len(replacements_made)} sections") - for replacement in replacements_made: - print(f" šŸ“ Line {replacement['line_num']}: {replacement['hierarchy']}") - - return True - - except Exception as e: - thread_safe_print(f" āŒ Error updating file: {e}") - return False - -def find_section_boundaries(lines, hierarchy_dict): - """Find the start and end line for each section based on hierarchy (from update-target-doc-v2.py)""" - section_boundaries = {} - - # Sort sections by line number - sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) - - for i, (line_num, hierarchy) in enumerate(sorted_sections): - start_line = int(line_num) - 1 # Convert to 0-based index - - # Find end line (start of next section at same or higher level) - end_line = len(lines) # Default to end of document - - if start_line >= len(lines): - continue - - # Get current section level - current_line = lines[start_line].strip() - if not current_line.startswith('#'): - continue - - current_level = len(current_line.split()[0]) # Count # characters - - # Look for next section at same or higher level - for j in range(start_line + 1, len(lines)): - line = lines[j].strip() - if line.startswith('#'): - line_level = len(line.split()[0]) if line.split() else 0 - if line_level <= current_level: - end_line = j - break - - section_boundaries[line_num] = { - 'start': start_line, - 'end': end_line, - 'hierarchy': hierarchy, - 'level': current_level - } - - return section_boundaries - -def insert_sections_into_document(file_path, translated_sections, target_insertion_points, target_local_path): - """Insert translated sections into the target document at specified points""" - - if not translated_sections or not target_insertion_points: - thread_safe_print(f" āš ļø No sections or insertion points provided") - return False - - local_path = os.path.join(target_local_path, file_path) - - if not os.path.exists(local_path): - thread_safe_print(f" āŒ Local file not found: {local_path}") - return False - - try: - # Read document content - with open(local_path, 'r', encoding='utf-8') as f: - document_content = f.read() - - lines = document_content.split('\n') - 
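# A minimal illustrative sketch (not taken from this module) of why the replacement
# plan above is applied from the bottom of the document upwards: replacing the
# highest start index first means no earlier offset ever shifts. The plan format
# (start, end, new_lines) with 0-based, end-exclusive ranges is assumed here.
def apply_replacements(lines, plan):
    for start, end, new_lines in sorted(plan, key=lambda r: r[0], reverse=True):
        lines[start:end] = new_lines
    return lines

# Example: both edits keep their original offsets because the later one runs first.
doc = ["# Title", "old A", "old B", "## Next", "old C"]
apply_replacements(doc, [(1, 3, ["new A"]), (4, 5, ["new C", "new C (cont.)"])])
# doc == ["# Title", "new A", "## Next", "new C", "new C (cont.)"]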
thread_safe_print(f" šŸ“„ Document has {len(lines)} lines") - - # Sort insertion points by line number in descending order to avoid position shifts - sorted_insertions = sorted( - target_insertion_points.items(), - key=lambda x: x[1]['insertion_after_line'], - reverse=True - ) - - insertions_made = [] - - for group_id, point_data in sorted_insertions: - insertion_after_line = point_data['insertion_after_line'] - new_sections = point_data['new_sections'] - insertion_type = point_data['insertion_type'] - - thread_safe_print(f" šŸ“Œ Inserting {len(new_sections)} sections after line {insertion_after_line}") - - # Convert 1-based line number to 0-based index for insertion point - # insertion_after_line is 1-based, so insertion_index should be insertion_after_line - 1 - insertion_index = insertion_after_line - 1 - - # Prepare new content to insert - new_content_lines = [] - - # Add an empty line before the new sections if not already present - if insertion_index < len(lines) and lines[insertion_index].strip(): - new_content_lines.append("") - - # Add each translated section - for section_line_num in new_sections: - # Find the corresponding translated content - section_hierarchy = None - section_content = None - - # Search for the section in translated_sections by line number or hierarchy - for hierarchy, content in translated_sections.items(): - # Try to match by hierarchy or find the content - if str(section_line_num) in hierarchy or content: # This is a simplified matching - section_hierarchy = hierarchy - section_content = content - break - - if section_content: - # Split content into lines and add to insertion - content_lines = section_content.split('\n') - new_content_lines.extend(content_lines) - - # Add spacing between sections - if section_line_num != new_sections[-1]: # Not the last section - new_content_lines.append("") - - thread_safe_print(f" āœ… Added section: {section_hierarchy}") - else: - thread_safe_print(f" āš ļø Could not find translated content for section at line {section_line_num}") - - # Add an empty line after the new sections if not already present - # Check if the new content already ends with an empty line - if new_content_lines and not new_content_lines[-1].strip(): - # Content already ends with empty line, don't add another - pass - elif insertion_index + 1 < len(lines) and lines[insertion_index + 1].strip(): - # Next line has content and our content doesn't end with empty line, add one - new_content_lines.append("") - - # Insert the new content (insert after insertion_index line, before the next line) - # If insertion_after_line is 251, we want to insert at position 252 (0-based index 251) - lines = lines[:insertion_index + 1] + new_content_lines + lines[insertion_index + 1:] - - insertions_made.append({ - 'group_id': group_id, - 'insertion_after_line': insertion_after_line, - 'sections_count': len(new_sections), - 'lines_added': len(new_content_lines), - 'insertion_type': insertion_type - }) - - # Save updated document - with open(local_path, 'w', encoding='utf-8') as f: - f.write('\n'.join(lines)) - - thread_safe_print(f" āœ… Successfully inserted {len(insertions_made)} section groups") - for insertion in insertions_made: - thread_safe_print(f" šŸ“ {insertion['group_id']}: {insertion['sections_count']} sections, {insertion['lines_added']} lines after line {insertion['insertion_after_line']}") - - return True - - except Exception as e: - thread_safe_print(f" āŒ Error inserting sections: {e}") - return False - -def process_modified_sections(modified_sections, 
pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Process modified sections with full data structure support""" - results = [] - - for file_path, file_data in modified_sections.items(): - thread_safe_print(f"\nšŸ“„ Processing {file_path}") - - try: - # Call process_single_file with the complete data structure - success, message = process_single_file( - file_path, - file_data, # Pass the complete data structure (includes 'sections', 'original_hierarchy', etc.) - pr_diff, - pr_url, - github_client, - ai_client, - repo_config, - max_non_system_sections - ) - - if success: - thread_safe_print(f" āœ… Successfully processed {file_path}") - results.append((file_path, True, message)) - else: - thread_safe_print(f" āŒ Failed to process {file_path}: {message}") - results.append((file_path, False, message)) - - except Exception as e: - thread_safe_print(f" āŒ Error processing {file_path}: {e}") - results.append((file_path, False, f"Error processing {file_path}: {e}")) - - return results - -def process_deleted_sections(deleted_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Process deleted sections with full data structure support""" - results = [] - - for file_path, source_sections in deleted_sections.items(): - thread_safe_print(f"\nšŸ—‘ļø Processing deleted sections in {file_path}") - - try: - # Call process_single_file_deletion with the complete data structure - success, message = process_single_file_deletion( - file_path, - source_sections, - pr_url, - github_client, - ai_client, - repo_config, - max_non_system_sections - ) - - if success: - thread_safe_print(f" āœ… Successfully processed deletions in {file_path}") - results.append((file_path, True, message)) - else: - thread_safe_print(f" āŒ Failed to process deletions in {file_path}: {message}") - results.append((file_path, False, message)) - - except Exception as e: - thread_safe_print(f" āŒ Error processing deletions in {file_path}: {e}") - results.append((file_path, False, f"Error processing deletions in {file_path}: {e}")) - - return results - -def process_single_file_deletion(file_path, source_sections, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Process deletion of sections in a single file""" - - # Import needed functions - from pr_analyzer import get_target_hierarchy_and_content - from section_matcher import ( - find_direct_matches_for_special_files, - filter_non_system_sections, - get_corresponding_sections, - is_system_variable_or_config, - clean_title_for_matching, - parse_ai_response, - find_matching_line_numbers - ) - - # Get target file hierarchy and content - target_hierarchy, target_lines = get_target_hierarchy_and_content( - file_path, github_client, repo_config['target_repo'] - ) - - if not target_hierarchy: - return False, f"Could not get target hierarchy for {file_path}" - - # Separate system variables from regular sections for hybrid mapping - system_sections = {} - regular_sections = {} - - for line_num, hierarchy in source_sections.items(): - # Extract title for checking - if ' > ' in hierarchy: - title = hierarchy.split(' > ')[-1] - else: - title = hierarchy - - cleaned_title = clean_title_for_matching(title) - if is_system_variable_or_config(cleaned_title): - system_sections[line_num] = hierarchy - else: - regular_sections[line_num] = hierarchy - - sections_to_delete = [] - - # Process system variables with direct matching - if system_sections: - thread_safe_print(f" šŸŽÆ Direct matching for 
{len(system_sections)} system sections...") - matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( - system_sections, target_hierarchy, target_lines - ) - - for target_line_num, hierarchy_string in matched_dict.items(): - sections_to_delete.append(int(target_line_num)) - thread_safe_print(f" āœ… Marked system section for deletion: line {target_line_num}") - - if failed_matches: - thread_safe_print(f" āŒ Failed to match {len(failed_matches)} system sections") - for failed_line in failed_matches: - thread_safe_print(f" - Line {failed_line}: {system_sections[failed_line]}") - - # Process regular sections with AI matching - if regular_sections: - thread_safe_print(f" šŸ¤– AI matching for {len(regular_sections)} regular sections...") - - # Filter target hierarchy for AI - filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) - - # Check if filtered hierarchy is reasonable for AI - if len(filtered_target_hierarchy) > max_non_system_sections: - thread_safe_print(f" āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}") - else: - # Get AI mapping (convert dict values to lists as expected by the function) - source_list = list(regular_sections.values()) - target_list = list(filtered_target_hierarchy.values()) - - ai_mapping = get_corresponding_sections( - source_list, - target_list, - ai_client, - repo_config['source_language'], - repo_config['target_language'], - max_tokens=20000 # Use default value for now, can be made configurable later - ) - - if ai_mapping: - # Parse AI response and find matching line numbers - ai_sections = parse_ai_response(ai_mapping) - ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) - - for source_line, target_line in ai_matched.items(): - try: - sections_to_delete.append(int(target_line)) - thread_safe_print(f" āœ… Marked regular section for deletion: line {target_line}") - except ValueError as e: - thread_safe_print(f" āŒ Error converting target_line to int: {target_line}, error: {e}") - # If target_line is not a number, try to find it in target_hierarchy - for line_num, hierarchy in target_hierarchy.items(): - if target_line in hierarchy or hierarchy in target_line: - sections_to_delete.append(int(line_num)) - thread_safe_print(f" āœ… Found matching section at line {line_num}: {hierarchy}") - break - - # Delete the sections from local document - if sections_to_delete: - success = delete_sections_from_document(file_path, sections_to_delete, repo_config['target_local_path']) - if success: - return True, f"Successfully deleted {len(sections_to_delete)} sections from {file_path}" - else: - return False, f"Failed to delete sections from {file_path}" - else: - return False, f"No sections to delete in {file_path}" - -def delete_sections_from_document(file_path, sections_to_delete, target_local_path): - """Delete specified sections from the local document""" - target_file_path = os.path.join(target_local_path, file_path) - - if not os.path.exists(target_file_path): - thread_safe_print(f" āŒ Target file not found: {target_file_path}") - return False - - try: - # Read current file content - with open(target_file_path, 'r', encoding='utf-8') as f: - content = f.read() - - lines = content.split('\n') - - # Import needed function - from pr_analyzer import build_hierarchy_dict - - # Build hierarchy to understand section boundaries - target_hierarchy = build_hierarchy_dict(content) - - # Sort sections to delete in reverse order to maintain line numbers - 
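# A minimal illustrative sketch (not taken from this module) of the boundary rule
# that both the deletion loop below and find_section_boundaries above rely on:
# a section ends at the next heading of the same or higher level. The helper name
# section_end is chosen here for illustration only.
def section_end(lines, start):
    """Return the index one past the section whose heading sits at lines[start]."""
    heading = lines[start].strip()
    level = len(heading) - len(heading.lstrip("#"))
    for i in range(start + 1, len(lines)):
        nxt = lines[i].strip()
        if nxt.startswith("#") and len(nxt) - len(nxt.lstrip("#")) <= level:
            return i
    return len(lines)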
sections_to_delete.sort(reverse=True) - - thread_safe_print(f" šŸ—‘ļø Deleting {len(sections_to_delete)} sections from {file_path}") - - for section_line in sections_to_delete: - section_start = section_line - 1 # Convert to 0-based index - - if section_start < 0 or section_start >= len(lines): - thread_safe_print(f" āŒ Invalid section line: {section_line}") - continue - - # Find section end - section_end = len(lines) - 1 # Default to end of file - - # Look for next header at same or higher level - current_line = lines[section_start].strip() - if current_line.startswith('#'): - current_level = len(current_line.split('#')[1:]) # Count # characters - - for i in range(section_start + 1, len(lines)): - line = lines[i].strip() - if line.startswith('#'): - line_level = len(line.split('#')[1:]) - if line_level <= current_level: - section_end = i - 1 - break - - # Delete section (from section_start to section_end inclusive) - thread_safe_print(f" šŸ—‘ļø Deleting lines {section_start + 1} to {section_end + 1}") - del lines[section_start:section_end + 1] - - # Write updated content back to file - updated_content = '\n'.join(lines) - with open(target_file_path, 'w', encoding='utf-8') as f: - f.write(updated_content) - - thread_safe_print(f" āœ… Updated file: {target_file_path}") - return True - - except Exception as e: - thread_safe_print(f" āŒ Error deleting sections from {target_file_path}: {e}") - return False - -def process_single_file(file_path, source_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Process a single file - thread-safe function for parallel processing""" - thread_id = threading.current_thread().name - thread_safe_print(f"\nšŸ“„ [{thread_id}] Processing {file_path}") - - try: - # Check if this is a TOC file with special operations - if isinstance(source_sections, dict) and 'type' in source_sections and source_sections['type'] == 'toc': - from toc_processor import process_toc_file - return process_toc_file(file_path, source_sections, pr_url, github_client, ai_client, repo_config) - - # Check if this is enhanced sections - if isinstance(source_sections, dict) and 'sections' in source_sections: - if source_sections.get('type') == 'enhanced_sections': - # Skip all the matching logic and directly extract data - thread_safe_print(f" [{thread_id}] šŸš€ Using enhanced sections data, skipping matching logic") - enhanced_sections = source_sections['sections'] - - # Extract target sections and source old content from enhanced sections - # Maintain the exact order from match_source_diff_to_target.json - from collections import OrderedDict - target_sections = OrderedDict() - source_old_content_dict = OrderedDict() - - # Process in the exact order they appear in enhanced_sections (which comes from match_source_diff_to_target.json) - for key, section_info in enhanced_sections.items(): - if isinstance(section_info, dict): - operation = section_info.get('source_operation', '') - - # Skip deleted sections - they shouldn't be in the enhanced_sections anyway - if operation == 'deleted': - continue - - # For source sections: use old_content for modified, new_content for added - if operation == 'added': - source_content = section_info.get('source_new_content', '') - else: # modified - source_content = section_info.get('source_old_content', '') - - # For target sections: use target_content for modified, empty string for added - if operation == 'added': - target_content = "" # Added sections have no existing target content - else: # modified - 
target_content = section_info.get('target_content', '') - - # Add to both dictionaries using the same key from match_source_diff_to_target.json - if source_content is not None: - source_old_content_dict[key] = source_content - target_sections[key] = target_content - - thread_safe_print(f" [{thread_id}] šŸ“Š Extracted: {len(target_sections)} target sections, {len(source_old_content_dict)} source old content entries") - - # Update sections with AI (get-updated-target-sections.py logic) - thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") - updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) - if not updated_sections: - thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") - return False, f"Could not get AI update for {file_path}" - - # Return the AI results for further processing - thread_safe_print(f" [{thread_id}] āœ… Successfully got AI translation results for {file_path}") - return True, updated_sections # Return the actual AI results - - else: - # New format: complete data structure - actual_sections = source_sections['sections'] - - # Regular file processing continues here for old format - # Get target hierarchy and content (get-target-affected-hierarchy.py logic) - from pr_analyzer import get_target_hierarchy_and_content - target_hierarchy, target_lines = get_target_hierarchy_and_content(file_path, github_client, repo_config['target_repo']) - if not target_hierarchy: - thread_safe_print(f" [{thread_id}] āš ļø Could not get target content") - return False, f"Could not get target content for {file_path}" - else: - # Old format: direct dict - actual_sections = source_sections - - # Only do mapping if we don't have enhanced sections - if 'enhanced_sections' not in locals() or not enhanced_sections: - # Separate different types of sections - from section_matcher import is_system_variable_or_config - system_var_sections = {} - toplevel_sections = {} - frontmatter_sections = {} - regular_sections = {} - - for line_num, hierarchy in actual_sections.items(): - if line_num == "0" and hierarchy == "frontmatter": - # Special handling for frontmatter - frontmatter_sections[line_num] = hierarchy - else: - # Extract the leaf title from hierarchy - leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy - - if is_system_variable_or_config(leaf_title): - system_var_sections[line_num] = hierarchy - elif leaf_title.startswith('# '): - # Top-level titles need special handling - toplevel_sections[line_num] = hierarchy - else: - regular_sections[line_num] = hierarchy - - thread_safe_print(f" [{thread_id}] šŸ“Š Found {len(system_var_sections)} system variable/config, {len(toplevel_sections)} top-level, {len(frontmatter_sections)} frontmatter, and {len(regular_sections)} regular sections") - - target_affected = {} - - # Process frontmatter sections with special handling - if frontmatter_sections: - thread_safe_print(f" [{thread_id}] šŸ“„ Processing frontmatter section...") - # For frontmatter, we simply map it to line 0 in target - for line_num, hierarchy in frontmatter_sections.items(): - target_affected[line_num] = hierarchy - thread_safe_print(f" [{thread_id}] āœ… Mapped {len(frontmatter_sections)} frontmatter section") - - # Process top-level titles with special matching - if toplevel_sections: - thread_safe_print(f" [{thread_id}] šŸ” Top-level title matching for {len(toplevel_sections)} sections...") - from 
section_matcher import find_toplevel_title_matches - toplevel_matched, toplevel_failed, toplevel_skipped = find_toplevel_title_matches(toplevel_sections, target_lines) - - if toplevel_matched: - target_affected.update(toplevel_matched) - thread_safe_print(f" [{thread_id}] āœ… Top-level matched {len(toplevel_matched)} sections") - - if toplevel_failed: - thread_safe_print(f" [{thread_id}] āš ļø {len(toplevel_failed)} top-level sections failed matching") - for failed in toplevel_failed: - thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") - - # Process system variables/config sections with direct matching - if system_var_sections: - thread_safe_print(f" [{thread_id}] šŸŽÆ Direct matching {len(system_var_sections)} system variable/config sections...") - from section_matcher import find_direct_matches_for_special_files - direct_matched, failed_matches, skipped_sections = find_direct_matches_for_special_files(system_var_sections, target_hierarchy, target_lines) - - if direct_matched: - target_affected.update(direct_matched) - thread_safe_print(f" [{thread_id}] āœ… Direct matched {len(direct_matched)} system variable/config sections") - - if failed_matches: - thread_safe_print(f" [{thread_id}] āš ļø {len(failed_matches)} system variable/config sections failed direct matching") - for failed in failed_matches: - thread_safe_print(f" āŒ {failed['hierarchy']}: {failed['reason']}") - - # Process regular sections with AI mapping using filtered target hierarchy - if regular_sections: - thread_safe_print(f" [{thread_id}] šŸ¤– AI mapping {len(regular_sections)} regular sections...") - - # Filter target hierarchy to only include non-system sections for AI mapping - from section_matcher import filter_non_system_sections - filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) - - # Check if filtered target hierarchy exceeds the maximum allowed for AI mapping - MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120 - if len(filtered_target_hierarchy) > MAX_NON_SYSTEM_SECTIONS_FOR_AI: - thread_safe_print(f" [{thread_id}] āŒ Too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI})") - thread_safe_print(f" [{thread_id}] āš ļø Skipping AI mapping for regular sections to avoid complexity") - - # If no system sections were matched either, return error - if not target_affected: - error_message = f"File {file_path} has too many non-system sections ({len(filtered_target_hierarchy)} > {MAX_NON_SYSTEM_SECTIONS_FOR_AI}) and no system variable sections were matched" - return False, error_message - - # Continue with only system variable matches if available - thread_safe_print(f" [{thread_id}] āœ… Proceeding with {len(target_affected)} system variable/config sections only") - else: - # Proceed with AI mapping using filtered hierarchy - source_list = list(regular_sections.values()) - target_list = list(filtered_target_hierarchy.values()) - - from section_matcher import get_corresponding_sections - ai_response = get_corresponding_sections(source_list, target_list, ai_client, repo_config['source_language'], repo_config['target_language'], max_tokens=20000) - if ai_response: - # Parse AI response and find matching line numbers in the original (unfiltered) hierarchy - from section_matcher import parse_ai_response, find_matching_line_numbers - ai_sections = parse_ai_response(ai_response) - ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) # Use original hierarchy for line number lookup - - if ai_matched: - target_affected.update(ai_matched) - 
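# A minimal illustrative sketch (not taken from this module): find_matching_line_numbers
# is imported from section_matcher and not shown in this diff; conceptually it resolves
# the heading paths chosen by the model back to target line numbers, roughly as below.
# Here `hierarchy` maps line numbers to heading paths and `ai_pairs` maps source line
# numbers to chosen target paths; both names are assumptions made for this sketch.
def paths_to_line_numbers(ai_pairs, hierarchy):
    by_path = {path: line for line, path in hierarchy.items()}
    return {src: by_path[path] for src, path in ai_pairs.items() if path in by_path}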
thread_safe_print(f" [{thread_id}] āœ… AI mapped {len(ai_matched)} regular sections") - else: - thread_safe_print(f" [{thread_id}] āš ļø AI mapping failed for regular sections") - else: - thread_safe_print(f" [{thread_id}] āš ļø Could not get AI response for regular sections") - - # Summary of mapping results - thread_safe_print(f" [{thread_id}] šŸ“Š Total mapped: {len(target_affected)} out of {len(actual_sections)} sections") - - if not target_affected: - thread_safe_print(f" [{thread_id}] āš ļø Could not map sections") - return False, f"Could not map sections for {file_path}" - - thread_safe_print(f" [{thread_id}] āœ… Mapped {len(target_affected)} sections") - - # Extract target sections (get-target-affected-sections.py logic) - thread_safe_print(f" [{thread_id}] šŸ“ Extracting target sections...") - from pr_analyzer import extract_affected_sections - target_sections = extract_affected_sections(target_affected, target_lines) - - # Extract source old content from the enhanced data structure - thread_safe_print(f" [{thread_id}] šŸ“– Extracting source old content...") - source_old_content_dict = {} - - # Handle different data structures for source_sections - if isinstance(source_sections, dict) and 'sections' in source_sections: - # New format: complete data structure with enhanced matching info - for key, section_info in source_sections.items(): - if isinstance(section_info, dict) and 'source_old_content' in section_info: - source_old_content_dict[key] = section_info['source_old_content'] - else: - # Fallback: if we don't have the enhanced structure, we need to get it differently - thread_safe_print(f" [{thread_id}] āš ļø Source sections missing enhanced structure, using fallback") - # For now, create empty dict to avoid errors - this should be addressed in the calling code - source_old_content_dict = {} - - # Update sections with AI (get-updated-target-sections.py logic) - thread_safe_print(f" [{thread_id}] šŸ¤– Getting updated sections from AI...") - updated_sections = get_updated_sections_from_ai(pr_diff, target_sections, source_old_content_dict, ai_client, repo_config['source_language'], repo_config['target_language'], file_path) - if not updated_sections: - thread_safe_print(f" [{thread_id}] āš ļø Could not get AI update") - return False, f"Could not get AI update for {file_path}" - - # Update local document (update-target-doc-v2.py logic) - thread_safe_print(f" [{thread_id}] šŸ’¾ Updating local document...") - success = update_local_document(file_path, updated_sections, target_affected, repo_config['target_local_path']) - - if success: - thread_safe_print(f" [{thread_id}] šŸŽ‰ Successfully updated {file_path}") - return True, f"Successfully updated {file_path}" - else: - thread_safe_print(f" [{thread_id}] āŒ Failed to update {file_path}") - return False, f"Failed to update {file_path}" - - except Exception as e: - thread_safe_print(f" [{thread_id}] āŒ Error processing {file_path}: {e}") - return False, f"Error processing {file_path}: {e}" - -def process_added_sections(added_sections, pr_diff, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Process added sections by translating and inserting them""" - if not added_sections: - thread_safe_print("\nāž• No added sections to process") - return - - thread_safe_print(f"\nāž• Processing added sections from {len(added_sections)} files...") - - # Import needed functions - from section_matcher import map_insertion_points_to_target - from pr_analyzer import get_target_hierarchy_and_content - - for 
file_path, section_data in added_sections.items(): - thread_safe_print(f"\nāž• Processing added sections in {file_path}") - - source_sections = section_data['sections'] - insertion_points = section_data['insertion_points'] - - # Get target file hierarchy and content - target_hierarchy, target_lines = get_target_hierarchy_and_content( - file_path, github_client, repo_config['target_repo'] - ) - - if not target_hierarchy: - thread_safe_print(f" āŒ Could not get target hierarchy for {file_path}") - continue - - # Map insertion points to target language - target_insertion_points = map_insertion_points_to_target( - insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections - ) - - if not target_insertion_points: - thread_safe_print(f" āŒ No insertion points mapped for {file_path}") - continue - - # Use AI to translate/update new sections (similar to modified sections) - # Since we're now using source_old_content, we need to extract it from the added sections - source_old_content_dict = {} - for key, content in source_sections.items(): - # For added sections, source_old_content is typically None or empty - # We use the new content (from the source file) as the content to translate - source_old_content_dict[key] = content if content is not None else "" - - # Get target sections (empty for new sections, but we need the structure) - target_sections = {} # New sections don't have existing target content - - # Use the same AI function to translate the new sections - translated_sections = get_updated_sections_from_ai( - pr_diff, - target_sections, - source_old_content_dict, - ai_client, - repo_config['source_language'], - repo_config['target_language'], - file_path - ) - - if translated_sections: - # Insert translated sections into document - insert_sections_into_document(file_path, translated_sections, target_insertion_points, repo_config['target_local_path']) - thread_safe_print(f" āœ… Successfully inserted {len(translated_sections)} sections in {file_path}") - else: - thread_safe_print(f" āš ļø No sections were translated for {file_path}") - -def process_files_in_batches(source_changes, pr_diff, pr_url, github_client, ai_client, repo_config, operation_type="modified", batch_size=5, max_non_system_sections=120): - """Process files in parallel batches""" - # Handle different data formats - if isinstance(source_changes, dict): - files = [] - for path, data in source_changes.items(): - if isinstance(data, dict): - if 'type' in data and data['type'] == 'toc': - # TOC file with special operations - files.append((path, data)) - elif 'sections' in data: - # New format: extract sections for processing - files.append((path, data['sections'])) - else: - # Old format: direct dict - files.append((path, data)) - else: - # Old format: direct dict - files.append((path, data)) - else: - files = list(source_changes.items()) - - total_files = len(files) - - if total_files == 0: - return [] - - thread_safe_print(f"\nšŸ”„ Processing {total_files} files in batches of {batch_size}") - - results = [] - - # Process files in batches - for i in range(0, total_files, batch_size): - batch = files[i:i + batch_size] - batch_num = (i // batch_size) + 1 - total_batches = (total_files + batch_size - 1) // batch_size - - thread_safe_print(f"\nšŸ“¦ Batch {batch_num}/{total_batches}: Processing {len(batch)} files") - - # Process current batch in parallel - with ThreadPoolExecutor(max_workers=len(batch), thread_name_prefix=f"Batch{batch_num}") as executor: - # 
Submit all files in current batch - future_to_file = {} - for file_path, source_sections in batch: - future = executor.submit( - process_single_file, - file_path, - source_sections, - pr_diff, - pr_url, - github_client, - ai_client, - repo_config, - max_non_system_sections - ) - future_to_file[future] = file_path - - # Collect results as they complete - from concurrent.futures import as_completed - batch_results = [] - for future in as_completed(future_to_file): - file_path = future_to_file[future] - try: - success, message = future.result() - batch_results.append((file_path, success, message)) - except Exception as e: - batch_results.append((file_path, False, f"Exception in thread: {e}")) - - results.extend(batch_results) - - # Brief pause between batches to avoid overwhelming the APIs - if i + batch_size < total_files: - thread_safe_print(f" āøļø Waiting 2 seconds before next batch...") - import time - time.sleep(2) - - return results - -def update_target_document_from_match_data(match_file_path, target_local_path, target_file_name=None): - """ - Update target document using data from match_source_diff_to_target.json - This integrates the logic from test_target_update.py - - Args: - match_file_path: Path to the match_source_diff_to_target.json file - target_local_path: Local path to the target repository - target_file_name: Optional target file name (if not provided, will be extracted from match_file_path) - """ - import json - import os - from pathlib import Path - - # Load match data - if not os.path.exists(match_file_path): - thread_safe_print(f"āŒ {match_file_path} file does not exist") - return False - - with open(match_file_path, 'r', encoding='utf-8') as f: - match_data = json.load(f) - - thread_safe_print(f"āœ… Loaded {len(match_data)} section matching data from {match_file_path}") - thread_safe_print(f" Reading translation results directly from target_new_content field") - - if not match_data: - thread_safe_print("āŒ No matching data found") - return False - - # Sort sections by target_line from large to small (modify from back to front) - sections_with_line = [] - - for key, section_data in match_data.items(): - operation = section_data.get('source_operation', '') - target_new_content = section_data.get('target_new_content') - - # For deleted sections, target_new_content should be null - if operation == 'deleted': - if target_new_content is not None: - thread_safe_print(f" āš ļø Deleted section {key} has non-null target_new_content, should be fixed") - thread_safe_print(f" šŸ—‘ļø Including deleted section: {key}") - elif not target_new_content: - thread_safe_print(f" āš ļø Skipping section without target_new_content: {key}") - continue - - target_line = section_data.get('target_line') - if target_line and target_line != 'unknown': - try: - # Handle special case for bottom sections - if target_line == "-1": - line_num = -1 # Special marker for bottom sections - else: - line_num = int(target_line) - sections_with_line.append((key, section_data, line_num)) - except ValueError: - thread_safe_print(f"āš ļø Skipping invalid target_line: {target_line} for {key}") - - # Separate sections into different processing groups - bottom_modified_sections = [] # Process first: modify existing content at document end - regular_sections = [] # Process second: normal operations from back to front - bottom_added_sections = [] # Process last: append new content to document end - - for key, section_data, line_num in sections_with_line: - target_hierarchy = section_data.get('target_hierarchy', 
'') - - if target_hierarchy.startswith('bottom-modified-'): - bottom_modified_sections.append((key, section_data, line_num)) - elif target_hierarchy.startswith('bottom-added-'): - bottom_added_sections.append((key, section_data, line_num)) - else: - regular_sections.append((key, section_data, line_num)) - - # Sort each group appropriately - def get_source_line_num(item): - key, section_data, line_num = item - if '_' in key and key.split('_')[1].isdigit(): - return int(key.split('_')[1]) - return 0 - - # Bottom modified: sort by source line number (large to small) - bottom_modified_sections.sort(key=lambda x: -get_source_line_num(x)) - - # Regular sections: sort by target_line (large to small), then by source line number - regular_sections.sort(key=lambda x: (-x[2], -get_source_line_num(x))) - - # Bottom added: sort by source line number (small to large) for proper document order - bottom_added_sections.sort(key=lambda x: get_source_line_num(x)) - - # Combine all sections in processing order - all_sections = bottom_modified_sections + regular_sections + bottom_added_sections - - thread_safe_print(f"\nšŸ“Š Processing order: bottom-modified -> regular -> bottom-added") - thread_safe_print(f" šŸ“‹ Bottom modified sections: {len(bottom_modified_sections)}") - thread_safe_print(f" šŸ“‹ Regular sections: {len(regular_sections)}") - thread_safe_print(f" šŸ“‹ Bottom added sections: {len(bottom_added_sections)}") - - if not all_sections: - thread_safe_print("āŒ No valid sections found for update") - return False - - thread_safe_print(f"\nšŸ“Š Detailed processing order:") - for i, (key, section_data, line_num) in enumerate(all_sections, 1): - operation = section_data.get('source_operation', '') - hierarchy = section_data.get('target_hierarchy', '') - insertion_type = section_data.get('insertion_type', '') - - # Extract source line number for display - source_line_num = int(key.split('_')[1]) if '_' in key and key.split('_')[1].isdigit() else 'N/A' - - # Display target_line with special handling for bottom sections - target_display = "END" if line_num == -1 else str(line_num) - - # Determine section group - if hierarchy.startswith('bottom-modified-'): - group = "BotMod" - elif hierarchy.startswith('bottom-added-'): - group = "BotAdd" - else: - group = "Regular" - - if operation == 'deleted': - action = "delete" - elif insertion_type == "before_reference": - action = "insert" - elif line_num == -1: - action = "append" - else: - action = "replace" - - thread_safe_print(f" {i:2}. 
[{group:7}] Target:{target_display:>3} Src:{source_line_num:3} | {key:15} ({operation:8}) | {action:7} | {hierarchy}") - - # Determine target file name - if target_file_name is None: - # Extract target file name from match file path - # e.g., "tikv-configuration-file-match_source_diff_to_target.json" -> "tikv-configuration-file.md" - match_filename = os.path.basename(match_file_path) - if match_filename.endswith('-match_source_diff_to_target.json'): - extracted_name = match_filename[:-len('-match_source_diff_to_target.json')] + '.md' - target_file_name = extracted_name - thread_safe_print(f" šŸ“‚ Extracted target file name from match file: {target_file_name}") - else: - # Fallback: try to determine from source hierarchy - first_entry = next(iter(match_data.values())) - source_hierarchy = first_entry.get('source_original_hierarchy', '') - - if 'TiFlash' in source_hierarchy or 'tiflash' in source_hierarchy.lower(): - target_file_name = "tiflash/tiflash-configuration.md" - else: - # Default to command-line flags for other cases - target_file_name = "command-line-flags-for-tidb-configuration.md" - thread_safe_print(f" šŸ“‚ Determined target file name from hierarchy: {target_file_name}") - else: - thread_safe_print(f" šŸ“‚ Using provided target file name: {target_file_name}") - - target_file_path = os.path.join(target_local_path, target_file_name) - thread_safe_print(f"\nšŸ“„ Target file path: {target_file_path}") - - # Update target document - thread_safe_print(f"\nšŸš€ Starting target document update, will modify {len(all_sections)} sections...") - success = update_target_document_sections(all_sections, target_file_path) - - return success - -def update_target_document_sections(all_sections, target_file_path): - """ - Update target document sections - integrated from test_target_update.py - """ - thread_safe_print(f"\nšŸš€ Starting target document update: {target_file_path}") - - # Read target document - if not os.path.exists(target_file_path): - thread_safe_print(f"āŒ Target file does not exist: {target_file_path}") - return False - - with open(target_file_path, 'r', encoding='utf-8') as f: - target_lines = f.readlines() - - thread_safe_print(f"šŸ“„ Target document total lines: {len(target_lines)}") - - # Process modifications in order (bottom-modified -> regular -> bottom-added) - for i, (key, section_data, target_line_num) in enumerate(all_sections, 1): - operation = section_data.get('source_operation', '') - insertion_type = section_data.get('insertion_type', '') - target_hierarchy = section_data.get('target_hierarchy', '') - target_new_content = section_data.get('target_new_content') - - thread_safe_print(f"\nšŸ“ {i}/{len(all_sections)} Processing {key} (Line {target_line_num})") - thread_safe_print(f" Operation type: {operation}") - thread_safe_print(f" Target section: {target_hierarchy}") - - if operation == 'deleted': - # Delete logic: remove the specified section - if target_line_num == -1: - thread_safe_print(f" āŒ Invalid delete operation for bottom section") - continue - - thread_safe_print(f" šŸ—‘ļø Delete mode: removing section starting at line {target_line_num}") - - # Find section end position - start_line = target_line_num - 1 # Convert to 0-based index - - if start_line >= len(target_lines): - thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") - continue - - # Find section end position - end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) - - thread_safe_print(f" šŸ“ Delete range: line {start_line 
+ 1} to {end_line}") - thread_safe_print(f" šŸ“„ Delete content: {target_lines[start_line].strip()[:50]}...") - - # Delete content - deleted_lines = target_lines[start_line:end_line] - target_lines[start_line:end_line] = [] - - thread_safe_print(f" āœ… Deleted {len(deleted_lines)} lines of content") - - elif target_new_content is None: - thread_safe_print(f" āš ļø Skipping: target_new_content is null") - continue - - elif not target_new_content: - thread_safe_print(f" āš ļø Skipping: target_new_content is empty") - continue - - else: - # Handle content format - thread_safe_print(f" šŸ“„ Content preview: {repr(target_new_content[:80])}...") - - if target_hierarchy.startswith('bottom-'): - # Bottom section special handling - if target_hierarchy.startswith('bottom-modified-'): - # Bottom modified: find and replace existing content at document end - thread_safe_print(f" šŸ”„ Bottom modified section: replacing existing content at document end") - - # Get the old content to search for - source_operation_data = section_data.get('source_operation_data', {}) - old_content = source_operation_data.get('old_content', '').strip() - - if old_content: - # Search backwards from end to find the matching section - found_line = None - for idx in range(len(target_lines) - 1, -1, -1): - line_content = target_lines[idx].strip() - if line_content == old_content: - found_line = idx - thread_safe_print(f" šŸ“ Found target section at line {found_line + 1}: {line_content[:50]}...") - break - - if found_line is not None: - # Find section end - end_line = find_section_end_for_update(target_lines, found_line, target_hierarchy) - - # Ensure content format is correct - if not target_new_content.endswith('\n'): - target_new_content += '\n' - - # Split content by lines - new_lines = target_new_content.splitlines(keepends=True) - - # Replace content - target_lines[found_line:end_line] = new_lines - - thread_safe_print(f" āœ… Replaced {end_line - found_line} lines with {len(new_lines)} lines") - else: - thread_safe_print(f" āš ļø Could not find target section, appending to end instead") - # Fallback: append to end - if not target_new_content.endswith('\n'): - target_new_content += '\n' - if target_lines and target_lines[-1].strip(): - target_new_content = '\n' + target_new_content - new_lines = target_new_content.splitlines(keepends=True) - target_lines.extend(new_lines) - thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") - else: - thread_safe_print(f" āš ļø No old_content found, appending to end instead") - # Fallback: append to end - if not target_new_content.endswith('\n'): - target_new_content += '\n' - if target_lines and target_lines[-1].strip(): - target_new_content = '\n' + target_new_content - new_lines = target_new_content.splitlines(keepends=True) - target_lines.extend(new_lines) - thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") - - elif target_hierarchy.startswith('bottom-added-'): - # Bottom added: append new content to end of document - thread_safe_print(f" šŸ”š Bottom added section: appending new content to end") - - # Ensure content format is correct - if not target_new_content.endswith('\n'): - target_new_content += '\n' - - # Add spacing before new section if needed - if target_lines and target_lines[-1].strip(): - target_new_content = '\n' + target_new_content - - # Split content by lines - new_lines = target_new_content.splitlines(keepends=True) - - # Append to end of document - target_lines.extend(new_lines) - - thread_safe_print(f" āœ… 
Appended {len(new_lines)} lines to end of document") - else: - # Other bottom sections: append to end - thread_safe_print(f" šŸ”š Other bottom section: appending to end of document") - - # Ensure content format is correct - if not target_new_content.endswith('\n'): - target_new_content += '\n' - - # Add spacing before new section if needed - if target_lines and target_lines[-1].strip(): - target_new_content = '\n' + target_new_content - - # Split content by lines - new_lines = target_new_content.splitlines(keepends=True) - - # Append to end of document - target_lines.extend(new_lines) - - thread_safe_print(f" āœ… Appended {len(new_lines)} lines to end of document") - - elif target_hierarchy == "frontmatter": - # Frontmatter special handling: directly replace front lines - thread_safe_print(f" šŸ“„ Frontmatter mode: directly replacing document beginning") - - # Find the first top-level heading position - first_header_line = 0 - for i, line in enumerate(target_lines): - if line.strip().startswith('# '): - first_header_line = i - break - - thread_safe_print(f" šŸ“ Frontmatter range: line 1 to {first_header_line}") - - # Split new content by lines, preserving original structure including trailing empty lines - new_lines = target_new_content.splitlines(keepends=True) - - # If the original content ends with \n, it means there should be an empty line after the last content line - # splitlines() doesn't create this empty line, so we need to add it manually - if target_new_content.endswith('\n'): - new_lines.append('\n') - elif target_new_content: - # If content doesn't end with newline, ensure the last line has one - if not new_lines[-1].endswith('\n'): - new_lines[-1] += '\n' - - # Replace frontmatter - target_lines[0:first_header_line] = new_lines - - thread_safe_print(f" āœ… Replaced {first_header_line} lines of frontmatter with {len(new_lines)} lines") - - elif insertion_type == "before_reference": - # Insert logic: insert before specified line - if target_line_num == -1: - thread_safe_print(f" āŒ Invalid insert operation for bottom section") - continue - - thread_safe_print(f" šŸ“ Insert mode: inserting before line {target_line_num}") - - # Ensure content format is correct - if not target_new_content.endswith('\n'): - target_new_content += '\n' - - # Ensure spacing between sections - if not target_new_content.endswith('\n\n'): - target_new_content += '\n' - - # Split content by lines - new_lines = target_new_content.splitlines(keepends=True) - - # Insert at specified position - insert_position = target_line_num - 1 # Convert to 0-based index - if insert_position < 0: - insert_position = 0 - elif insert_position > len(target_lines): - insert_position = len(target_lines) - - # Execute insertion - for j, line in enumerate(new_lines): - target_lines.insert(insert_position + j, line) - - thread_safe_print(f" āœ… Inserted {len(new_lines)} lines of content") - - else: - # Replace logic: find target section and replace - if target_line_num == -1: - thread_safe_print(f" āŒ Invalid replace operation for bottom section") - continue - - thread_safe_print(f" šŸ”„ Replace mode: replacing section starting at line {target_line_num}") - - # Ensure content format is correct - if not target_new_content.endswith('\n'): - target_new_content += '\n' - - # Ensure spacing between sections - if not target_new_content.endswith('\n\n'): - target_new_content += '\n' - - # Find section end position - start_line = target_line_num - 1 # Convert to 0-based index - - if start_line >= len(target_lines): - 
thread_safe_print(f" āŒ Line number out of range: {target_line_num} > {len(target_lines)}") - continue - - # Find section end position - end_line = find_section_end_for_update(target_lines, start_line, target_hierarchy) - - thread_safe_print(f" šŸ“ Replace range: line {start_line + 1} to {end_line}") - - # Split new content by lines - new_lines = target_new_content.splitlines(keepends=True) - - # Replace content - target_lines[start_line:end_line] = new_lines - - thread_safe_print(f" āœ… Replaced {end_line - start_line} lines with {len(new_lines)} lines") - - - with open(target_file_path, 'w', encoding='utf-8') as f: - f.writelines(target_lines) - - thread_safe_print(f"\nāœ… Target document update completed!") - thread_safe_print(f"šŸ“„ Updated file: {target_file_path}") - - return True - -def find_section_end_for_update(lines, start_line, target_hierarchy): - """Find section end position - based on test_target_update.py logic""" - current_line = lines[start_line].strip() - - if target_hierarchy == "frontmatter": - # Frontmatter special handling: from --- to second ---, then to first top-level heading - if start_line == 0 and current_line.startswith('---'): - # Find second --- - for i in range(start_line + 1, len(lines)): - if lines[i].strip() == '---': - # Found frontmatter end, but need to include up to next content start - # Look for first non-empty line or first heading - for j in range(i + 1, len(lines)): - line = lines[j].strip() - if line and line.startswith('# '): - thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before first top-level heading)") - return j - elif line and not line.startswith('#'): - # If there's other content, end there - thread_safe_print(f" šŸ“ Frontmatter ends at line {j} (before other content)") - return j - # If no other content found, end after second --- - thread_safe_print(f" šŸ“ Frontmatter ends at line {i+1} (after second ---)") - return i + 1 - # If not standard frontmatter format, find first top-level heading - for i in range(start_line + 1, len(lines)): - if lines[i].strip().startswith('# '): - thread_safe_print(f" šŸ“ Frontmatter ends at line {i} (before first top-level heading)") - return i - # If no top-level heading found, process entire file - return len(lines) - - if current_line.startswith('#'): - # Use file_updater.py method to calculate heading level - current_level = len(current_line.split()[0]) if current_line.split() else 0 - thread_safe_print(f" šŸ” Current heading level: {current_level} (heading: {current_line[:50]}...)") - - # Special handling for top-level headings: only process until first second-level heading - if current_level == 1: - for i in range(start_line + 1, len(lines)): - line = lines[i].strip() - if line.startswith('##'): # Find first second-level heading - thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before first second-level heading)") - return i - # If no second-level heading found, look for next top-level heading - for i in range(start_line + 1, len(lines)): - line = lines[i].strip() - if line.startswith('#') and not line.startswith('##'): - thread_safe_print(f" šŸ“ Top-level heading ends at line {i} (before next top-level heading)") - return i - else: - # For other level headings, stop at ANY header to get only direct content - # This prevents including sub-sections in the update range - for i in range(start_line + 1, len(lines)): - line = lines[i].strip() - if line.startswith('#'): - # Stop at ANY header to get only direct content - thread_safe_print(f" šŸ“ Found header at line {i}: 
{line[:30]}... (stopping for direct content only)")
-                    return i
-
-        # If not found, return file end
-        thread_safe_print(f"   šŸ“ No end position found, using file end")
-        return len(lines)
-
-    # Non-heading line, only replace current line
-    return start_line + 1
diff --git a/scripts/translate_doc_pr/main_workflow.py b/scripts/translate_doc_pr/main_workflow.py
deleted file mode 100644
index c76952ad01994..0000000000000
--- a/scripts/translate_doc_pr/main_workflow.py
+++ /dev/null
@@ -1,689 +0,0 @@
-"""
-Main Entry Point for GitHub Workflow
-Orchestrates the entire auto-sync workflow in GitHub Actions environment
-"""
-
-import sys
-import os
-import json
-import threading
-import tiktoken
-from github import Github, Auth
-
-# Conditional import for Gemini
-try:
-    import google.generativeai as genai
-    GEMINI_AVAILABLE = True
-except ImportError:
-    GEMINI_AVAILABLE = False
-
-# Import all modules
-from pr_analyzer import analyze_source_changes, get_repo_config, get_target_hierarchy_and_content, parse_pr_url
-from file_adder import process_added_files
-from file_deleter import process_deleted_files
-from file_updater import process_files_in_batches, process_added_sections, process_modified_sections, process_deleted_sections
-from toc_processor import process_toc_files
-from section_matcher import match_source_diff_to_target
-
-# Configuration from environment variables
-SOURCE_PR_URL = os.getenv("SOURCE_PR_URL")
-TARGET_PR_URL = os.getenv("TARGET_PR_URL")
-GITHUB_TOKEN = os.getenv("GITHUB_TOKEN")
-AI_PROVIDER = os.getenv("AI_PROVIDER", "deepseek")
-TARGET_REPO_PATH = os.getenv("TARGET_REPO_PATH")
-
-# AI configuration
-DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_TOKEN")
-DEEPSEEK_BASE_URL = "https://api.deepseek.com"
-GEMINI_API_KEY = os.getenv("GEMINI_API_TOKEN")
-GEMINI_MODEL_NAME = "gemini-2.0-flash"
-
-# Processing limit configuration
-MAX_NON_SYSTEM_SECTIONS_FOR_AI = 120
-SOURCE_TOKEN_LIMIT = 5000  # Maximum tokens for source new_content before skipping file processing
-
-# AI configuration
-AI_MAX_TOKENS = 20000  # Maximum tokens for AI translation requests
-
-# Special file configuration
-SPECIAL_FILES = ["TOC.md"]
-IGNORE_FILES = ["faq/ddl-faq.md","command-line-flags-for-tidb-configuration.md","pd-configuration-file.md"]
-
-# Repository configuration for workflow
-def get_workflow_repo_configs():
-    """Get repository configuration based on environment variables"""
-    if not SOURCE_PR_URL or not TARGET_PR_URL:
-        raise ValueError("SOURCE_PR_URL and TARGET_PR_URL must be set")
-
-    # Parse source and target repo info
-    source_parts = SOURCE_PR_URL.split('/')
-    target_parts = TARGET_PR_URL.split('/')
-
-    source_owner, source_repo = source_parts[-4], source_parts[-3]
-    target_owner, target_repo = target_parts[-4], target_parts[-3]
-
-    source_repo_key = f"{source_owner}/{source_repo}"
-    target_repo_key = f"{target_owner}/{target_repo}"
-
-    # Determine language direction based on repo names
-    if source_repo.endswith('-cn') and not target_repo.endswith('-cn'):
-        # Chinese to English
-        source_language = "Chinese"
-        target_language = "English"
-    elif not source_repo.endswith('-cn') and target_repo.endswith('-cn'):
-        # English to Chinese
-        source_language = "English"
-        target_language = "Chinese"
-    else:
-        # Default fallback
-        source_language = "English"
-        target_language = "Chinese"
-
-    return {
-        source_repo_key: {
-            "target_repo": target_repo_key,
-            "target_local_path": TARGET_REPO_PATH,
-            "source_language": source_language,
-            "target_language": target_language
-        }
-    }
-
-# Thread-safe printing
function -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def ensure_temp_output_dir(): - """Ensure the temp_output directory exists""" - # Get the directory of the current script - script_dir = os.path.dirname(os.path.abspath(__file__)) - temp_dir = os.path.join(script_dir, "temp_output") - os.makedirs(temp_dir, exist_ok=True) - return temp_dir - -def clean_temp_output_dir(): - """Clean the temp_output directory at the start of execution""" - import shutil - # Get the directory of the current script - script_dir = os.path.dirname(os.path.abspath(__file__)) - temp_dir = os.path.join(script_dir, "temp_output") - if os.path.exists(temp_dir): - if os.path.isdir(temp_dir): - shutil.rmtree(temp_dir) - print(f"🧹 Cleaned existing temp_output directory") - else: - # Remove file if it exists - os.remove(temp_dir) - print(f"🧹 Removed existing temp_output file") - os.makedirs(temp_dir, exist_ok=True) - print(f"šŸ“ Created temp_output directory: {temp_dir}") - return temp_dir - -def estimate_tokens(text): - """Calculate accurate token count using tiktoken (GPT-4/3.5 encoding)""" - if not text: - return 0 - try: - enc = tiktoken.get_encoding("cl100k_base") # GPT-4/3.5 encoding - tokens = enc.encode(text) - return len(tokens) - except Exception as e: - # Fallback to character approximation if tiktoken fails - thread_safe_print(f" āš ļø Tiktoken encoding failed: {e}, using character approximation") - return len(text) // 4 - -def print_token_estimation(prompt_text, context="AI translation"): - """Print accurate token consumption for a request""" - actual_tokens = estimate_tokens(prompt_text) - char_count = len(prompt_text) - thread_safe_print(f" šŸ’° {context}") - thread_safe_print(f" šŸ“ Input: {char_count:,} characters") - thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") - return actual_tokens - -class UnifiedAIClient: - """Unified interface for different AI providers""" - - def __init__(self, provider="deepseek"): - self.provider = provider - if provider == "deepseek": - from openai import OpenAI - self.client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL) - self.model = "deepseek-chat" - elif provider == "gemini": - if not GEMINI_AVAILABLE: - raise ImportError("google-generativeai package not installed. 
Run: pip install google-generativeai") - if not GEMINI_API_KEY: - raise ValueError("GEMINI_API_TOKEN environment variable must be set") - genai.configure(api_key=GEMINI_API_KEY) - self.model = GEMINI_MODEL_NAME - else: - raise ValueError(f"Unsupported AI provider: {provider}") - - def chat_completion(self, messages, temperature=0.1, max_tokens=20000): - """Unified chat completion interface""" - if self.provider == "deepseek": - response = self.client.chat.completions.create( - model=self.model, - messages=messages, - temperature=temperature, - max_tokens=max_tokens - ) - return response.choices[0].message.content.strip() - elif self.provider == "gemini": - try: - # Convert OpenAI-style messages to Gemini format - prompt = self._convert_messages_to_prompt(messages) - thread_safe_print(f" šŸ”„ Calling Gemini API...") - - # Use the correct Gemini API call format - model = genai.GenerativeModel(self.model) - response = model.generate_content(prompt) - - if response and response.text: - thread_safe_print(f" āœ… Gemini response received") - return response.text.strip() - else: - thread_safe_print(f" āš ļø Gemini response was empty or blocked") - return "No response from Gemini" - - except Exception as e: - thread_safe_print(f" āŒ Gemini API error: {str(e)}") - # Fallback: suggest switching to DeepSeek - thread_safe_print(f" šŸ’” Consider switching to DeepSeek in main.py: AI_PROVIDER = 'deepseek'") - raise e - - def _convert_messages_to_prompt(self, messages): - """Convert OpenAI-style messages to a single prompt for Gemini""" - prompt_parts = [] - for message in messages: - role = message.get("role", "user") - content = message.get("content", "") - if role == "user": - prompt_parts.append(content) - elif role == "system": - prompt_parts.append(f"System: {content}") - return "\n\n".join(prompt_parts) - -def check_source_token_limit(source_diff_dict_file, token_limit=SOURCE_TOKEN_LIMIT): - """Check if the total tokens of all new_content in source-diff-dict exceeds the limit""" - try: - with open(source_diff_dict_file, 'r', encoding='utf-8') as f: - source_diff_dict = json.load(f) - - total_new_content = "" - section_count = 0 - - for key, section_data in source_diff_dict.items(): - if isinstance(section_data, dict): - new_content = section_data.get('new_content', '') - if new_content: - total_new_content += new_content + "\n" - section_count += 1 - - if not total_new_content.strip(): - thread_safe_print(f" āš ļø No new_content found in {source_diff_dict_file}") - return True, 0, 0 # Allow processing if no content to check - - total_tokens = estimate_tokens(total_new_content) - char_count = len(total_new_content) - - thread_safe_print(f" šŸ“Š Source token limit check:") - thread_safe_print(f" šŸ“ Total new_content: {char_count:,} characters from {section_count} sections") - thread_safe_print(f" šŸ”¢ Total tokens: {total_tokens:,}") - thread_safe_print(f" 🚧 Token limit: {token_limit:,}") - - if total_tokens > token_limit: - thread_safe_print(f" āŒ Token limit exceeded! 
({total_tokens:,} > {token_limit:,})") - return False, total_tokens, token_limit - else: - thread_safe_print(f" āœ… Within token limit ({total_tokens:,} ≤ {token_limit:,})") - return True, total_tokens, token_limit - - except Exception as e: - thread_safe_print(f" āŒ Error checking token limit for {source_diff_dict_file}: {e}") - return True, 0, 0 # Allow processing on error to avoid blocking - -def get_pr_diff(pr_url, github_client): - """Get the diff content from a GitHub PR (from auto-sync-pr-changes.py)""" - try: - from pr_analyzer import parse_pr_url - owner, repo, pr_number = parse_pr_url(pr_url) - repository = github_client.get_repo(f"{owner}/{repo}") - pr = repository.get_pull(pr_number) - - # Get files and their patches - files = pr.get_files() - diff_content = [] - - for file in files: - if file.filename.endswith('.md') and file.patch: - diff_content.append(f"File: {file.filename}") - diff_content.append(file.patch) - diff_content.append("-" * 80) - - return "\n".join(diff_content) - - except Exception as e: - thread_safe_print(f" āŒ Error getting PR diff: {e}") - return None - -def filter_diff_by_operation_type(pr_diff, operation_type, target_sections=None): - """Filter PR diff to only include changes relevant to specific operation type""" - - if not pr_diff: - return "" - - if operation_type == "modified": - # For modified sections, we want the full diff but focus on changed content - return pr_diff - elif operation_type == "added": - # For added sections, we want to show what was added - filtered_lines = [] - for line in pr_diff.split('\n'): - if line.startswith('+') and not line.startswith('+++'): - filtered_lines.append(line) - elif line.startswith('@@') or line.startswith('File:'): - filtered_lines.append(line) - return '\n'.join(filtered_lines) - elif operation_type == "deleted": - # For deleted sections, we want to show what was removed - filtered_lines = [] - for line in pr_diff.split('\n'): - if line.startswith('-') and not line.startswith('---'): - filtered_lines.append(line) - elif line.startswith('@@') or line.startswith('File:'): - filtered_lines.append(line) - return '\n'.join(filtered_lines) - - return pr_diff - -def filter_diff_for_target_file(pr_diff, target_file, source_diff_dict): - """Extract file-specific diff from the complete PR diff based on source files that map to the target file""" - if not pr_diff or not source_diff_dict: - return pr_diff - - # Extract source files that contribute to this target file - source_files = set() - for key, section_data in source_diff_dict.items(): - if isinstance(section_data, dict): - source_file = section_data.get('source_file', '') - if source_file: - source_files.add(source_file) - - if not source_files: - print(f" āš ļø No source files found in source_diff_dict, using complete PR diff") - return pr_diff - - print(f" šŸ“„ Source files contributing to {target_file}: {list(source_files)}") - - # Filter PR diff to only include changes from these source files - filtered_lines = [] - current_file = None - include_section = False - - for line in pr_diff.split('\n'): - if line.startswith('File: '): - current_file = line.replace('File: ', '').strip() - include_section = current_file in source_files - if include_section: - filtered_lines.append(line) - elif line.startswith('-' * 80): - if include_section: - filtered_lines.append(line) - elif include_section: - filtered_lines.append(line) - - file_specific_diff = '\n'.join(filtered_lines) - print(f" šŸ“Š Filtered diff: {len(file_specific_diff)} chars (from {len(pr_diff)} 
chars)") - - return file_specific_diff if file_specific_diff.strip() else pr_diff - -def extract_file_diff_from_pr(pr_diff, source_file_path): - """Extract diff content for a specific source file from the complete PR diff""" - if not pr_diff: - return "" - - filtered_lines = [] - current_file = None - include_section = False - - for line in pr_diff.split('\n'): - if line.startswith('File: '): - current_file = line.replace('File: ', '').strip() - include_section = (current_file == source_file_path) - if include_section: - filtered_lines.append(line) - elif line.startswith('-' * 80): - if include_section: - filtered_lines.append(line) - include_section = False # End of this file's section - elif include_section: - filtered_lines.append(line) - - return '\n'.join(filtered_lines) - -def determine_file_processing_type(source_file_path, file_sections, special_files=None): - """Determine how to process a file based on operation type and file characteristics""" - - # Check if this is a special file (like TOC.md) - if special_files and os.path.basename(source_file_path) in special_files: - return "special_file_toc" - - # For all other modified files, use regular processing - return "regular_modified" - -def process_regular_modified_file(source_file_path, file_sections, file_diff, pr_url, github_client, ai_client, repo_config, max_sections): - """Process a regular markdown file that has been modified""" - try: - print(f" šŸ“ Processing as regular modified file: {source_file_path}") - - # Extract the actual sections from the file_sections structure - # file_sections contains: {'sections': {...}, 'original_hierarchy': {...}, 'current_hierarchy': {...}} - if isinstance(file_sections, dict) and 'sections' in file_sections: - actual_sections = file_sections['sections'] - else: - # Fallback: assume file_sections is already the sections dict - actual_sections = file_sections - - print(f" šŸ“Š Extracted sections: {len(actual_sections)} sections") - - # CRITICAL: Load the source-diff-dict.json and perform matching - import json - import os - from section_matcher import match_source_diff_to_target - from pr_analyzer import get_target_hierarchy_and_content - - # Load source-diff-dict.json with file prefix - temp_dir = ensure_temp_output_dir() - file_prefix = source_file_path.replace('/', '-').replace('.md', '') - source_diff_dict_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") - if os.path.exists(source_diff_dict_file): - with open(source_diff_dict_file, 'r', encoding='utf-8') as f: - source_diff_dict = json.load(f) - print(f" šŸ“‚ Loaded source diff dict with {len(source_diff_dict)} sections from {source_diff_dict_file}") - - # Check source token limit before proceeding with processing - print(f" šŸ” Checking source token limit...") - within_limit, total_tokens, token_limit = check_source_token_limit(source_diff_dict_file) - if not within_limit: - print(f" 🚫 Skipping file processing: source content exceeds token limit") - print(f" šŸ“Š Total tokens: {total_tokens:,} > Limit: {token_limit:,}") - print(f" ā­ļø File {source_file_path} will not be processed") - return False - - else: - print(f" āŒ {source_diff_dict_file} not found") - return False - - # Get target file hierarchy and content - target_repo = repo_config['target_repo'] - target_hierarchy, target_lines = get_target_hierarchy_and_content(source_file_path, github_client, target_repo) - - if not target_hierarchy or not target_lines: - print(f" āŒ Could not get target file content for {source_file_path}") - return False - - 
print(f" šŸ“– Target file: {len(target_hierarchy)} sections, {len(target_lines)} lines") - - # Perform source diff to target matching - print(f" šŸ”— Matching source diff to target...") - enhanced_sections = match_source_diff_to_target( - source_diff_dict, - target_hierarchy, - target_lines, - ai_client, - repo_config, - max_sections, - AI_MAX_TOKENS - ) - - if not enhanced_sections: - print(f" āŒ No sections matched") - return False - - print(f" āœ… Matched {len(enhanced_sections)} sections") - - # Save the match result for reference - match_file = os.path.join(temp_dir, f"{source_file_path.replace('/', '-').replace('.md', '')}-match_source_diff_to_target.json") - with open(match_file, 'w', encoding='utf-8') as f: - json.dump(enhanced_sections, f, ensure_ascii=False, indent=2) - print(f" šŸ’¾ Saved match result to: {match_file}") - - # Step 2: Get AI translation for the matched sections - print(f" šŸ¤– Getting AI translation for matched sections...") - - # Create file data structure with enhanced matching info - # Wrap enhanced_sections in the expected format for process_single_file - file_data = { - source_file_path: { - 'type': 'enhanced_sections', - 'sections': enhanced_sections - } - } - - # Call the existing process_modified_sections function to get AI translation - results = process_modified_sections(file_data, file_diff, pr_url, github_client, ai_client, repo_config, max_sections) - - # Step 3: Update match_source_diff_to_target.json with AI results - if results and len(results) > 0: - file_path, success, ai_updated_sections = results[0] # Get first result - if success and isinstance(ai_updated_sections, dict): - print(f" šŸ“ Step 3: Updating {match_file} with AI results...") - - # Load current match_source_diff_to_target.json - with open(match_file, 'r', encoding='utf-8') as f: - match_data = json.load(f) - - # Add target_new_content field to each section based on AI results - updated_count = 0 - for key, section_data in match_data.items(): - operation = section_data.get('source_operation', '') - - if operation == 'deleted': - # For deleted sections, set target_new_content to null - section_data['target_new_content'] = None - elif key in ai_updated_sections: - # For modified/added sections with AI translation - section_data['target_new_content'] = ai_updated_sections[key] - updated_count += 1 - else: - # For sections not translated, keep original content - section_data['target_new_content'] = section_data.get('target_content', '') - - # Save updated match_source_diff_to_target.json - with open(match_file, 'w', encoding='utf-8') as f: - json.dump(match_data, f, ensure_ascii=False, indent=2) - - print(f" āœ… Updated {updated_count} sections with AI translations in {match_file}") - - # Step 4: Apply updates to target document using update_target_document_from_match_data - print(f" šŸ“ Step 4: Applying updates to target document...") - from file_updater import update_target_document_from_match_data - - success = update_target_document_from_match_data(match_file, repo_config['target_local_path'], source_file_path) - if success: - print(f" šŸŽ‰ Target document successfully updated!") - return True - else: - print(f" āŒ Failed to update target document") - return False - - else: - print(f" āš ļø AI translation failed or returned invalid results") - return False - else: - print(f" āš ļø No results from process_modified_sections") - return False - - except Exception as e: - print(f" āŒ Error processing regular modified file {source_file_path}: {e}") - return False - - -def 
get_workflow_repo_config(pr_url, repo_configs):
-    """Get repository configuration for workflow environment"""
-    from pr_analyzer import parse_pr_url
-
-    owner, repo, pr_number = parse_pr_url(pr_url)
-    source_repo = f"{owner}/{repo}"
-
-    if source_repo not in repo_configs:
-        raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}")
-
-    config = repo_configs[source_repo].copy()
-    config['source_repo'] = source_repo
-    config['pr_number'] = pr_number
-
-    return config
-
-def main():
-    """Main function - orchestrates the entire workflow for GitHub Actions"""
-
-    # Validate environment variables
-    if not all([SOURCE_PR_URL, TARGET_PR_URL, GITHUB_TOKEN, TARGET_REPO_PATH]):
-        print("āŒ Missing required environment variables:")
-        print(f"   SOURCE_PR_URL: {SOURCE_PR_URL}")
-        print(f"   TARGET_PR_URL: {TARGET_PR_URL}")
-        print(f"   GITHUB_TOKEN: {'Set' if GITHUB_TOKEN else 'Not set'}")
-        print(f"   TARGET_REPO_PATH: {TARGET_REPO_PATH}")
-        return
-
-    print(f"šŸ”§ Auto PR Sync Tool (GitHub Workflow Version)")
-    print(f"šŸ“ Source PR URL: {SOURCE_PR_URL}")
-    print(f"šŸ“ Target PR URL: {TARGET_PR_URL}")
-    print(f"šŸ¤– AI Provider: {AI_PROVIDER}")
-    print(f"šŸ“ Target Repo Path: {TARGET_REPO_PATH}")
-
-    # Clean and prepare temp_output directory
-    clean_temp_output_dir()
-
-    # Get repository configuration using workflow config
-    try:
-        repo_configs = get_workflow_repo_configs()
-        repo_config = get_workflow_repo_config(SOURCE_PR_URL, repo_configs)
-        print(f"šŸ“ Source Repo: {repo_config['source_repo']} ({repo_config['source_language']})")
-        print(f"šŸ“ Target Repo: {repo_config['target_repo']} ({repo_config['target_language']})")
-        print(f"šŸ“ Target Path: {repo_config['target_local_path']}")
-    except ValueError as e:
-        print(f"āŒ {e}")
-        return
-
-    # Initialize clients
-    auth = Auth.Token(GITHUB_TOKEN)
-    github_client = Github(auth=auth)
-
-    # Initialize unified AI client
-    try:
-        ai_client = UnifiedAIClient(provider=AI_PROVIDER)
-        thread_safe_print(f"šŸ¤– AI Provider: {AI_PROVIDER.upper()} ({ai_client.model})")
-    except Exception as e:
-        thread_safe_print(f"āŒ Failed to initialize AI client: {e}")
-        return
-
-    print(f"\nšŸš€ Starting auto-sync for PR: {SOURCE_PR_URL}")
-
-    # Step 1: Get PR diff
-    print(f"\nšŸ“‹ Step 1: Getting PR diff...")
-    pr_diff = get_pr_diff(SOURCE_PR_URL, github_client)
-    if not pr_diff:
-        print("āŒ Could not get PR diff")
-        return
-    print(f"āœ… Got PR diff: {len(pr_diff)} characters")
-
-    # Step 2: Analyze source changes with operation categorization
-    print(f"\nšŸ“Š Step 2: Analyzing source changes...")
-    added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files = analyze_source_changes(
-        SOURCE_PR_URL, github_client,
-        special_files=SPECIAL_FILES,
-        ignore_files=IGNORE_FILES,
-        repo_configs=repo_configs,
-        max_non_system_sections=MAX_NON_SYSTEM_SECTIONS_FOR_AI,
-        pr_diff=pr_diff  # Pass the PR diff to avoid re-fetching
-    )
-
-    # Step 3: Process different types of files based on operation type
-    print(f"\nšŸ“‹ Step 3: Processing files based on operation type...")
-
-    # Import necessary functions
-    from file_updater import process_modified_sections, update_target_document_from_match_data
-    from toc_processor import process_toc_files
-
-    # Step 3.1: Process deleted files (file-level deletions)
-    if deleted_files:
-        print(f"\nšŸ—‘ļø Step 3.1: Processing {len(deleted_files)} deleted files...")
-        process_deleted_files(deleted_files, github_client, repo_config)
-        print(f"   āœ… Deleted files processed")
-
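The per-file loop in Step 3.4 below relies on the layout that get_pr_diff builds in Step 1: each Markdown file's patch is emitted under a "File: <name>" marker and closed by an 80-dash separator, and extract_file_diff_from_pr later slices that text back apart per file. The following is a minimal, self-contained sketch of that slicing idea, included only for illustration; the name slice_per_file_diff and its signature are assumptions, not part of the deleted module.

def slice_per_file_diff(concatenated_diff: str, file_name: str) -> str:
    """Return only the patch lines recorded for file_name.

    Assumes the "File: <name>" marker / 80-dash separator layout that
    get_pr_diff produces; an illustrative sketch, not the original helper.
    """
    collected = []
    keep = False
    for line in concatenated_diff.split("\n"):
        if line.startswith("File: "):
            # A new file block starts; keep it only if it is the requested file
            keep = (line[len("File: "):].strip() == file_name)
            if keep:
                collected.append(line)
        elif line.startswith("-" * 80):
            # The separator closes the current file's block
            if keep:
                collected.append(line)
            keep = False
        elif keep:
            collected.append(line)
    return "\n".join(collected)

# Example: slice_per_file_diff(pr_diff, "faq/ddl-faq.md") would return only
# the hunks Step 1 recorded for that file, or an empty string if it is absent.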
- # Step 3.2: Process added files (file-level additions) - if added_files: - print(f"\nšŸ“„ Step 3.2: Processing {len(added_files)} added files...") - process_added_files(added_files, SOURCE_PR_URL, github_client, ai_client, repo_config) - print(f" āœ… Added files processed") - - # Step 3.3: Process special files (TOC.md and similar) - if toc_files: - print(f"\nšŸ“‹ Step 3.3: Processing {len(toc_files)} special files (TOC)...") - process_toc_files(toc_files, SOURCE_PR_URL, github_client, ai_client, repo_config) - print(f" āœ… Special files processed") - - # Step 3.4: Process modified files (section-level modifications) - if modified_sections: - print(f"\nšŸ“ Step 3.4: Processing {len(modified_sections)} modified files...") - - # Process each modified file separately - for source_file_path, file_sections in modified_sections.items(): - print(f"\nšŸ“„ Processing modified file: {source_file_path}") - - # Extract file-specific diff from the complete PR diff - print(f" šŸ” Extracting file-specific diff for: {source_file_path}") - file_specific_diff = extract_file_diff_from_pr(pr_diff, source_file_path) - - if not file_specific_diff: - print(f" āš ļø No diff found for {source_file_path}, skipping...") - continue - - print(f" šŸ“Š File-specific diff: {len(file_specific_diff)} chars") - - # Determine file processing approach for modified files - file_type = determine_file_processing_type(source_file_path, file_sections, SPECIAL_FILES) - print(f" šŸ” File processing type: {file_type}") - - if file_type == "special_file_toc": - # Special files should have been processed in Step 3.3, skip here - print(f" ā­ļø Special file already processed in Step 3.3, skipping...") - continue - - elif file_type == "regular_modified": - # Regular markdown files with modifications - success = process_regular_modified_file( - source_file_path, - file_sections, - file_specific_diff, - SOURCE_PR_URL, - github_client, - ai_client, - repo_config, - MAX_NON_SYSTEM_SECTIONS_FOR_AI - ) - - if success: - print(f" āœ… Successfully processed {source_file_path}") - else: - print(f" āŒ Failed to process {source_file_path}") - - else: - print(f" āš ļø Unknown file processing type: {file_type} for {source_file_path}, skipping...") - - # Final summary - print(f"šŸ“Š Summary:") - print(f" šŸ“„ Added files: {len(added_files)} processed") - print(f" šŸ—‘ļø Deleted files: {len(deleted_files)} processed") - print(f" šŸ“‹ TOC files: {len(toc_files)} processed") - print(f" šŸ“ Modified files: {len(modified_sections)} processed") - print(f"šŸŽ‰ Workflow completed successfully!") - -if __name__ == "__main__": - main() diff --git a/scripts/translate_doc_pr/pr_analyzer.py b/scripts/translate_doc_pr/pr_analyzer.py deleted file mode 100644 index c164da1520163..0000000000000 --- a/scripts/translate_doc_pr/pr_analyzer.py +++ /dev/null @@ -1,1447 +0,0 @@ -#!/usr/bin/env python3 -""" -PR Analyzer Module -Handles PR analysis, diff parsing, content getting, hierarchy building, and section getting -""" - -import json -import os -import re -import threading -from github import Github - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - """Thread-safe print function""" - with print_lock: - print(*args, **kwargs) - - -def parse_pr_url(pr_url): - """Parse PR URL to get repo info""" - parts = pr_url.split('/') - return parts[-4], parts[-3], int(parts[-1]) # owner, repo, pr_number - -def get_repo_config(pr_url, repo_configs): - """Get repository configuration based on source repo""" - owner, repo, 
pr_number = parse_pr_url(pr_url) - source_repo = f"{owner}/{repo}" - - if source_repo not in repo_configs: - raise ValueError(f"Unsupported source repository: {source_repo}. Supported: {list(repo_configs.keys())}") - - config = repo_configs[source_repo].copy() - config['source_repo'] = source_repo - config['pr_number'] = pr_number - - return config - -def get_pr_diff(pr_url, github_client): - """Get the diff content from a GitHub PR""" - try: - owner, repo, pr_number = parse_pr_url(pr_url) - repository = github_client.get_repo(f"{owner}/{repo}") - pr = repository.get_pull(pr_number) - - # Get files and their patches - files = pr.get_files() - diff_content = [] - - for file in files: - if file.filename.endswith('.md') and file.patch: - diff_content.append(f"File: {file.filename}") - diff_content.append(file.patch) - diff_content.append("-" * 80) - - return "\n".join(diff_content) - - except Exception as e: - print(f" āŒ Error getting PR diff: {e}") - return None - -def get_changed_line_ranges(file): - """Get the ranges of lines that were changed in the PR""" - changed_ranges = [] - patch = file.patch - if not patch: - return changed_ranges - - lines = patch.split('\n') - current_line = 0 - - for line in lines: - if line.startswith('@@'): - # Parse the hunk header to get line numbers - match = re.search(r'\+(\d+),?(\d+)?', line) - if match: - current_line = int(match.group(1)) - elif line.startswith('+') and not line.startswith('+++'): - # This is an added line - changed_ranges.append(current_line) - current_line += 1 - elif line.startswith('-') and not line.startswith('---'): - # This is a deleted line, also consider as changed - changed_ranges.append(current_line) - # Don't increment current_line for deleted lines - continue - elif line.startswith(' '): - # Context line - current_line += 1 - - return changed_ranges - -def analyze_diff_operations(file): - """Analyze diff to categorize operations as added, modified, or deleted (improved GitHub-like approach)""" - operations = { - 'added_lines': [], # Lines that were added - 'deleted_lines': [], # Lines that were deleted - 'modified_lines': [] # Lines that were modified (both added and deleted content) - } - - patch = file.patch - if not patch: - return operations - - lines = patch.split('\n') - current_line = 0 - deleted_line = 0 - - # Parse diff and keep track of sequence order for better modification detection - diff_sequence = [] # Track the order of operations in diff - - for i, line in enumerate(lines): - if line.startswith('@@'): - # Parse the hunk header to get line numbers - # Format: @@ -old_start,old_count +new_start,new_count @@ - match = re.search(r'-(\d+),?(\d+)?\s+\+(\d+),?(\d+)?', line) - if match: - deleted_line = int(match.group(1)) - current_line = int(match.group(3)) - elif line.startswith('+') and not line.startswith('+++'): - # This is an added line - added_entry = { - 'line_number': current_line, - 'content': line[1:], # Remove the '+' prefix - 'is_header': line[1:].strip().startswith('#'), - 'diff_index': i # Track position in diff - } - operations['added_lines'].append(added_entry) - diff_sequence.append(('added', added_entry)) - current_line += 1 - elif line.startswith('-') and not line.startswith('---'): - # This is a deleted line - deleted_entry = { - 'line_number': deleted_line, - 'content': line[1:], # Remove the '-' prefix - 'is_header': line[1:].strip().startswith('#'), - 'diff_index': i # Track position in diff - } - operations['deleted_lines'].append(deleted_entry) - diff_sequence.append(('deleted', 
deleted_entry))
-            deleted_line += 1
-        elif line.startswith(' '):
-            # Context line (unchanged)
-            current_line += 1
-            deleted_line += 1
-
-    # GitHub-like modification detection: based on diff sequence proximity
-    modified_pairs = []
-    deleted_headers = [d for d in operations['deleted_lines'] if d['is_header']]
-    added_headers = [a for a in operations['added_lines'] if a['is_header']]
-
-    used_added_indices = set()
-    used_deleted_indices = set()
-
-    # Helper function for semantic similarity
-    def are_headers_similar(old, new):
-        # Remove markdown markers
-        old_clean = old.replace('#', '').replace('`', '').strip()
-        new_clean = new.replace('#', '').replace('`', '').strip()
-
-        # Check if one is a substring/extension of the other
-        if old_clean in new_clean or new_clean in old_clean:
-            return True
-
-        # Check for similar patterns (like appending -pu, -new, etc.)
-        old_base = old_clean.split('-')[0]
-        new_base = new_clean.split('-')[0]
-        if old_base and new_base and old_base == new_base:
-            return True
-
-        return False
-
-    # GitHub-like approach: Look for adjacent or close operations in diff sequence
-    for i, deleted_header in enumerate(deleted_headers):
-        if i in used_deleted_indices:
-            continue
-
-        for j, added_header in enumerate(added_headers):
-            if j in used_added_indices:
-                continue
-
-            deleted_content = deleted_header['content'].strip()
-            added_content = added_header['content'].strip()
-
-            # Check if they are close in the diff sequence (GitHub's approach)
-            diff_distance = abs(added_header['diff_index'] - deleted_header['diff_index'])
-            is_close_in_diff = diff_distance <= 5  # Allow small gap for context lines
-
-            # Check semantic similarity
-            is_similar = are_headers_similar(deleted_content, added_content)
-
-            # GitHub-like logic: prioritize diff proximity + semantic similarity
-            if is_close_in_diff and is_similar:
-                modified_pairs.append({
-                    'deleted': deleted_header,
-                    'added': added_header,
-                    'original_content': deleted_header['content']
-                })
-                used_added_indices.add(j)
-                used_deleted_indices.add(i)
-                break
-            # Fallback: strong semantic similarity even if not adjacent
-            elif is_similar and abs(added_header['line_number'] - deleted_header['line_number']) <= 20:
-                modified_pairs.append({
-                    'deleted': deleted_header,
-                    'added': added_header,
-                    'original_content': deleted_header['content']
-                })
-                used_added_indices.add(j)
-                used_deleted_indices.add(i)
-                break
-
-    # Remove identified modifications from pure additions/deletions
-    for pair in modified_pairs:
-        if pair['deleted'] in operations['deleted_lines']:
-            operations['deleted_lines'].remove(pair['deleted'])
-        if pair['added'] in operations['added_lines']:
-            operations['added_lines'].remove(pair['added'])
-        # Store both new and original content for modified headers
-        modified_entry = pair['added'].copy()
-        modified_entry['original_content'] = pair['original_content']
-        operations['modified_lines'].append(modified_entry)
-
-    return operations
-
-def build_hierarchy_dict(file_content):
-    """Build hierarchy dictionary from file content, excluding content inside code blocks"""
-    lines = file_content.split('\n')
-    level_stack = []
-    all_hierarchy_dict = {}
-
-    # Track code block state
-    in_code_block = False
-    code_block_delimiter = None  # Track the type of code block (``` or ~~~)
-
-    # Build complete hierarchy for all headers
-    for line_num, line in enumerate(lines, 1):
-        original_line = line
-        line = line.strip()
-
-        # Check for code block delimiters
-        if line.startswith('```') or line.startswith('~~~'):
-            if not in_code_block:
-                # Entering
a code block - in_code_block = True - code_block_delimiter = line[:3] # Store the delimiter type - continue - elif line.startswith(code_block_delimiter): - # Exiting a code block - in_code_block = False - code_block_delimiter = None - continue - - # Skip processing if we're inside a code block - if in_code_block: - continue - - # Process headers only if not in code block - if line.startswith('#'): - match = re.match(r'^(#{1,10})\s+(.+)', line) - if match: - level = len(match.group(1)) - title = match.group(2).strip() - - # Remove items from stack that are at same or deeper level - while level_stack and level_stack[-1][0] >= level: - level_stack.pop() - - # Build hierarchy with special handling for top-level titles - if level == 1: - # Top-level titles are included directly without hierarchy path - hierarchy_line = line - elif level_stack: - # For other levels, build path but skip the top-level title (level 1) - path_parts = [item[1] for item in level_stack if item[0] > 1] # Skip level 1 items - path_parts.append(line) - hierarchy_line = " > ".join(path_parts) - else: - # Fallback for other cases - hierarchy_line = line - - if hierarchy_line: # Only add non-empty hierarchies - all_hierarchy_dict[line_num] = hierarchy_line - - level_stack.append((level, line)) - - return all_hierarchy_dict - -def build_hierarchy_path(lines, line_num, all_headers): - """Build the full hierarchy path for a header at given line""" - if line_num not in all_headers: - return [] - - current_header = all_headers[line_num] - current_level = current_header['level'] - hierarchy_path = [] - - # Find all parent headers - for check_line in sorted(all_headers.keys()): - if check_line >= line_num: - break - - header = all_headers[check_line] - if header['level'] < current_level: - # This is a potential parent - # Remove any headers at same or deeper level - while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']: - hierarchy_path.pop() - hierarchy_path.append(header) - - # Add current header - hierarchy_path.append(current_header) - - return hierarchy_path - -def build_hierarchy_for_modified_section(file_content, target_line_num, original_line, base_hierarchy_dict): - """Build hierarchy path for a modified section using original content""" - lines = file_content.split('\n') - - # Get the level of the original header - original_match = re.match(r'^(#{1,10})\s+(.+)', original_line) - if not original_match: - return None - - original_level = len(original_match.group(1)) - original_title = original_match.group(2).strip() - - # Find parent sections by looking backwards from target line - level_stack = [] - - for line_num in range(1, target_line_num): - if line_num in base_hierarchy_dict: - # This is a header line - line_content = lines[line_num - 1].strip() - if line_content.startswith('#'): - match = re.match(r'^(#{1,10})\s+(.+)', line_content) - if match: - level = len(match.group(1)) - title = match.group(2).strip() - - # Remove items from stack that are at same or deeper level - while level_stack and level_stack[-1][0] >= level: - level_stack.pop() - - # Add this header to stack if it's a potential parent - if level < original_level: - level_stack.append((level, line_content)) - - # Build hierarchy path using original content - if level_stack: - path_parts = [item[1] for item in level_stack[1:]] # Skip first level - path_parts.append(original_line) - hierarchy_line = " > ".join(path_parts) - else: - hierarchy_line = original_line if original_level > 1 else "" - - return hierarchy_line if hierarchy_line else 
None - -def find_section_boundaries(lines, hierarchy_dict): - """Find the start and end line for each section based on hierarchy""" - section_boundaries = {} - - # Sort sections by line number - sorted_sections = sorted(hierarchy_dict.items(), key=lambda x: int(x[0])) - - for i, (line_num, hierarchy) in enumerate(sorted_sections): - start_line = int(line_num) - 1 # Convert to 0-based index - - # Find end line (start of next section at same or higher level) - end_line = len(lines) # Default to end of document - - if start_line >= len(lines): - continue - - # Get current section level - current_line = lines[start_line].strip() - if not current_line.startswith('#'): - continue - - current_level = len(current_line.split()[0]) # Count # characters - - # Look for next section at same or higher level - for j in range(start_line + 1, len(lines)): - line = lines[j].strip() - if line.startswith('#'): - line_level = len(line.split()[0]) if line.split() else 0 - if line_level <= current_level: - end_line = j - break - - section_boundaries[line_num] = { - 'start': start_line, - 'end': end_line, - 'hierarchy': hierarchy, - 'level': current_level - } - - return section_boundaries - -def extract_section_content(lines, start_line, hierarchy_dict): - """Extract the content of a section starting from start_line (includes sub-sections)""" - if not lines or start_line < 1 or start_line > len(lines): - return "" - - start_index = start_line - 1 # Convert to 0-based index - section_content = [] - - # Find the header at start_line - current_line = lines[start_index].strip() - if not current_line.startswith('#'): - return "" - - # Get the level of current header - current_level = len(current_line.split()[0]) # Count # characters - section_content.append(current_line) - - # Special handling for top-level titles (level 1) - if current_level == 1: - # For top-level titles, only extract content until the first next-level header (##) - for i in range(start_index + 1, len(lines)): - line = lines[i].strip() - - if line.startswith('#'): - # Check if this is a header of next level (##, ###, etc.) 
- line_level = len(line.split()[0]) if line.split() else 0 - if line_level > current_level: - # Found first subsection, stop here for top-level titles - break - elif line_level <= current_level: - # Found same or higher level header, also stop - break - - section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace - else: - # For non-top-level titles, use the original logic - # Extract content until we hit the next header of same or higher level - for i in range(start_index + 1, len(lines)): - line = lines[i].strip() - - if line.startswith('#'): - # Check if this is a header of same or higher level - line_level = len(line.split()[0]) if line.split() else 0 - if line_level <= current_level: - # Found a header of same or higher level, stop here regardless - # Each section should be extracted individually - break - - section_content.append(lines[i].rstrip()) # Keep original line without trailing whitespace - - return '\n'.join(section_content) - -def extract_section_direct_content(lines, start_line): - """Extract ONLY the direct content of a section (excluding sub-sections) - for source diff dict""" - if not lines or start_line < 1 or start_line > len(lines): - return "" - - start_index = start_line - 1 # Convert to 0-based index - section_content = [] - - # Find the header at start_line - current_line = lines[start_index].strip() - if not current_line.startswith('#'): - return "" - - # Add the header line - section_content.append(current_line) - - # Only extract until the first header (any level) - # This means we stop at ANY header - whether it's a sub-section OR same/higher level - for i in range(start_index + 1, len(lines)): - line = lines[i].strip() - if line.startswith('#'): - # Stop at ANY header to get only direct content - break - section_content.append(lines[i].rstrip()) - - return '\n'.join(section_content) - -def extract_frontmatter_content(file_lines): - """Extract content from the beginning of file to the first top-level header""" - if not file_lines: - return "" - - frontmatter_lines = [] - for i, line in enumerate(file_lines): - line_stripped = line.strip() - # Stop when we hit the first top-level header - if line_stripped.startswith('# '): - break - frontmatter_lines.append(line.rstrip()) - - return '\n'.join(frontmatter_lines) - - -def extract_affected_sections(hierarchy_dict, file_lines): - """Extract all affected sections based on hierarchy dict""" - affected_sections = {} - - for line_num, hierarchy in hierarchy_dict.items(): - if line_num == "0" and hierarchy == "frontmatter": - # Special handling for frontmatter - frontmatter_content = extract_frontmatter_content(file_lines) - if frontmatter_content: - affected_sections[line_num] = frontmatter_content - else: - line_number = int(line_num) - section_content = extract_section_content(file_lines, line_number, hierarchy_dict) - - if section_content: - affected_sections[line_num] = section_content - - return affected_sections - -def find_containing_section(line_num, all_headers): - """Find which section a line belongs to""" - current_section = None - for header_line_num in sorted(all_headers.keys()): - if header_line_num <= line_num: - current_section = header_line_num - else: - break - return current_section - -def find_affected_sections(lines, changed_lines, all_headers): - """Find which sections are affected by the changes""" - affected_sections = set() - - for changed_line in changed_lines: - # Find the section this changed line belongs to - current_section = None - - # Find the most recent 
header before or at the changed line - for line_num in sorted(all_headers.keys()): - if line_num <= changed_line: - current_section = line_num - else: - break - - if current_section: - # Only add the directly affected section (the one that directly contains the change) - affected_sections.add(current_section) - - return affected_sections - -def find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict=None): - """Find sections affected by different types of operations""" - sections = { - 'added': set(), - 'modified': set(), - 'deleted': set() - } - - # Process added lines - for added_line in operations['added_lines']: - line_num = added_line['line_number'] - if added_line['is_header']: - # This is a new header - only mark the section as added if the header itself is new - sections['added'].add(line_num) - # Note: We don't mark sections as "added" just because they contain new non-header content - # That would be a "modified" section, not an "added" section - - # Process modified lines - for modified_line in operations['modified_lines']: - line_num = modified_line['line_number'] - if modified_line['is_header']: - sections['modified'].add(line_num) - else: - section = find_containing_section(line_num, all_headers) - if section: - sections['modified'].add(section) - - # Process deleted lines - use base hierarchy to find deleted sections - for deleted_line in operations['deleted_lines']: - if deleted_line['is_header']: - # Find this header in the base file hierarchy (before deletion) - deleted_title = clean_title_for_matching(deleted_line['content']) - # Use base hierarchy if available, otherwise fall back to current headers - search_hierarchy = base_hierarchy_dict if base_hierarchy_dict else all_headers - - found_deleted = False - for line_num, hierarchy_line in search_hierarchy.items(): - # Extract title from hierarchy line - if ' > ' in hierarchy_line: - original_title = clean_title_for_matching(hierarchy_line.split(' > ')[-1]) - else: - original_title = clean_title_for_matching(hierarchy_line) - - if deleted_title == original_title: - sections['deleted'].add(line_num) - print(f" šŸ—‘ļø Detected deleted section: {deleted_line['content']} (line {line_num})") - found_deleted = True - break - - if not found_deleted: - # If not found by exact match, try partial matching for renamed sections - print(f" āš ļø Could not find deleted section: {deleted_line['content']}") - - return sections - - -def get_target_hierarchy_and_content(file_path, github_client, target_repo): - """Get target hierarchy and content""" - try: - repository = github_client.get_repo(target_repo) - file_content = repository.get_contents(file_path, ref="master").decoded_content.decode('utf-8') - lines = file_content.split('\n') - - # Build hierarchy using same method - hierarchy = build_hierarchy_dict(file_content) - - return hierarchy, lines - except Exception as e: - print(f" āŒ Error getting target file: {e}") - return {}, [] - -def get_source_sections_content(pr_url, file_path, source_affected, github_client): - """Get the content of source sections for better context""" - try: - owner, repo, pr_number = parse_pr_url(pr_url) - repository = github_client.get_repo(f"{owner}/{repo}") - pr = repository.get_pull(pr_number) - - # Get the source file content - file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8') - lines = file_content.split('\n') - - # Extract source sections - source_sections = {} - - for line_num, hierarchy in source_affected.items(): - if 
line_num == "0" and hierarchy == "frontmatter": - # Special handling for frontmatter - frontmatter_content = extract_frontmatter_content(lines) - if frontmatter_content: - source_sections[line_num] = frontmatter_content - else: - line_number = int(line_num) - section_content = extract_section_content(lines, line_number, source_affected) - if section_content: - source_sections[line_num] = section_content - - return source_sections - except Exception as e: - thread_safe_print(f" āš ļø Could not get source sections: {e}") - return {} - -def get_source_file_hierarchy(file_path, pr_url, github_client, get_base_version=False): - """Get source file hierarchy from PR head or base""" - try: - owner, repo, pr_number = parse_pr_url(pr_url) - repository = github_client.get_repo(f"{owner}/{repo}") - pr = repository.get_pull(pr_number) - - if get_base_version: - # Get the source file content before PR changes (base version) - source_file_content = repository.get_contents(file_path, ref=pr.base.sha).decoded_content.decode('utf-8') - else: - # Get the source file content after PR changes (head version) - source_file_content = repository.get_contents(file_path, ref=pr.head.sha).decoded_content.decode('utf-8') - - source_hierarchy = build_hierarchy_dict(source_file_content) - - return source_hierarchy - - except Exception as e: - thread_safe_print(f" āŒ Error getting source file hierarchy: {e}") - return {} - -# Helper function needed for find_sections_by_operation_type -def clean_title_for_matching(title): - """Clean title for matching by removing markdown formatting and span elements""" - if not title: - return "" - - # Remove span elements like New in v5.0 - title = re.sub(r']*>.*?', '', title) - - # Remove markdown header prefix (# ## ### etc.) - title = re.sub(r'^#{1,6}\s*', '', title.strip()) - - # Remove backticks - title = title.replace('`', '') - - # Strip whitespace - title = title.strip() - - return title - -def find_previous_section_for_added(added_sections, hierarchy_dict): - """Find the previous section hierarchy for each added section group""" - insertion_points = {} - - if not added_sections: - return insertion_points - - # Group consecutive added sections - added_list = sorted(list(added_sections)) - groups = [] - current_group = [added_list[0]] - - for i in range(1, len(added_list)): - if added_list[i] - added_list[i-1] <= 10: # Consider sections within 10 lines as consecutive - current_group.append(added_list[i]) - else: - groups.append(current_group) - current_group = [added_list[i]] - groups.append(current_group) - - # For each group, find the previous section hierarchy - for group in groups: - first_new_section = min(group) - - # Find the section that comes before this group - previous_section_line = None - previous_section_hierarchy = None - - for line_num_str in sorted(hierarchy_dict.keys(), key=int): - line_num = int(line_num_str) - if line_num < first_new_section: - previous_section_line = line_num - previous_section_hierarchy = hierarchy_dict[line_num_str] - else: - break - - if previous_section_hierarchy: - insertion_points[f"group_{groups.index(group)}"] = { - 'previous_section_hierarchy': previous_section_hierarchy, - 'previous_section_line': previous_section_line, - 'new_sections': group, - 'insertion_type': 'multiple' if len(group) > 1 else 'single' - } - print(f" šŸ“ Added section group: {len(group)} sections after '{previous_section_hierarchy}'") - else: - print(f" āš ļø Could not find previous section for added sections starting at line {first_new_section}") - - return 
insertion_points - -def build_source_diff_dict(modified_sections, added_sections, deleted_sections, all_hierarchy_dict, base_hierarchy_dict, operations, file_content, base_file_content): - """Build source diff dictionary with correct structure for matching""" - from section_matcher import clean_title_for_matching - source_diff_dict = {} - - # Helper function to extract section content (only direct content, no sub-sections) - def extract_section_content_for_diff(line_num, hierarchy_dict): - if str(line_num) == "0": - # Handle frontmatter - return extract_frontmatter_content(file_content.split('\n')) - else: - return extract_section_direct_content(file_content.split('\n'), line_num) - - # Helper function to extract old content from base file (only direct content, no sub-sections) - def extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content): - if str(line_num) == "0": - # Handle frontmatter from base file - return extract_frontmatter_content(base_file_content.split('\n')) - else: - return extract_section_direct_content(base_file_content.split('\n'), line_num) - - # Helper function to extract old content by hierarchy (for modified sections that may have moved) - def extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content): - """Extract old content by finding the section with matching hierarchy in base file (only direct content)""" - if original_hierarchy == "frontmatter": - return extract_frontmatter_content(base_file_content.split('\n')) - - # Find the line number in base file that matches the original hierarchy - for base_line_num_str, base_hierarchy in base_hierarchy_dict.items(): - if base_hierarchy == original_hierarchy: - base_line_num = int(base_line_num_str) if base_line_num_str != "0" else 0 - if base_line_num == 0: - return extract_frontmatter_content(base_file_content.split('\n')) - else: - return extract_section_direct_content(base_file_content.split('\n'), base_line_num) - - # If exact match not found, return empty string - print(f" āš ļø Could not find matching hierarchy in base file: {original_hierarchy}") - return "" - - # Helper function to build complete hierarchy for a section using base file info - def build_complete_original_hierarchy(line_num, current_hierarchy, base_hierarchy_dict, operations): - """Build complete hierarchy path for original section""" - line_num_str = str(line_num) - - # Special cases: frontmatter and top-level titles - if line_num_str == "0": - return "frontmatter" - - # Check if this line was modified and has original content - for modified_line in operations.get('modified_lines', []): - if (modified_line.get('is_header') and - modified_line.get('line_number') == line_num and - 'original_content' in modified_line): - original_line = modified_line['original_content'].strip() - - # For top-level titles, return the original content directly - if ' > ' not in current_hierarchy: - return original_line - - # For nested sections, build the complete hierarchy using original content - # Find the hierarchy path using base hierarchy dict and replace the leaf with original - if line_num_str in base_hierarchy_dict: - base_hierarchy = base_hierarchy_dict[line_num_str] - if ' > ' in base_hierarchy: - # Replace the leaf (last part) with original content - hierarchy_parts = base_hierarchy.split(' > ') - hierarchy_parts[-1] = original_line - return ' > '.join(hierarchy_parts) - else: - # Single level, return original content - return original_line - - # Fallback: return original content - return original_line 
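# Illustrative sketch of the leaf-replacement step above, assuming the
# ' > '-joined hierarchy strings used elsewhere in this script; the sample
# titles below are hypothetical, not taken from any real TOC.
def _replace_hierarchy_leaf(base_hierarchy, original_header_line):
    """Return base_hierarchy with its last segment swapped for the original header text."""
    original_line = original_header_line.strip()
    if ' > ' not in base_hierarchy:
        return original_line
    parts = base_hierarchy.split(' > ')
    parts[-1] = original_line
    return ' > '.join(parts)

# Example: a renamed third-level header keeps its parent path.
assert _replace_hierarchy_leaf(
    "# TiDB > ## Deploy > ### Upgrade guide",
    "### Upgrade notes",
) == "# TiDB > ## Deploy > ### Upgrade notes"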
- - # If not modified, use base hierarchy if available - if line_num_str in base_hierarchy_dict: - return base_hierarchy_dict[line_num_str] - - # If not found in base (new section), use current hierarchy - return current_hierarchy - - # Process modified sections - for line_num_str, hierarchy in modified_sections.items(): - line_num = int(line_num_str) if line_num_str != "0" else 0 - - # Build complete original hierarchy - original_hierarchy = build_complete_original_hierarchy(line_num, hierarchy, base_hierarchy_dict, operations) - - # Extract both old and new content - new_content = extract_section_content_for_diff(line_num, all_hierarchy_dict) - # Use hierarchy-based lookup for old content instead of line number - old_content = extract_old_content_by_hierarchy(original_hierarchy, base_hierarchy_dict, base_file_content) - - # Only include if content actually changed - if new_content != old_content: - # Check if this is a bottom modified section (no next section in base file) - is_bottom_modified = False - if line_num_str in base_hierarchy_dict: - # Get all sections in base file sorted by line number - base_sections = sorted([(int(ln), hier) for ln, hier in base_hierarchy_dict.items() if ln != "0"]) - - # Check if there's any section after this line in base file - has_next_section = any(base_line > line_num for base_line, _ in base_sections) - - if not has_next_section: - is_bottom_modified = True - print(f" āœ… Bottom modified section detected at line {line_num_str}: no next section in base file") - - # Use special marker for bottom modified sections - if is_bottom_modified: - final_original_hierarchy = f"bottom-modified-{line_num}" - else: - final_original_hierarchy = original_hierarchy - - source_diff_dict[f"modified_{line_num_str}"] = { - "new_line_number": line_num, - "original_hierarchy": final_original_hierarchy, - "operation": "modified", - "new_content": new_content, - "old_content": old_content - } - print(f" āœ… Real modification detected at line {line_num_str}: content changed") - else: - print(f" 🚫 Filtered out false positive at line {line_num_str}: content unchanged (likely line shift artifact)") - - # Process added sections - find next section from current document hierarchy - for line_num_str, hierarchy in added_sections.items(): - line_num = int(line_num_str) - - print(f" šŸ” Finding next section for added section at line {line_num}: {hierarchy}") - - # Strategy: Find the next section directly from the current document (post-PR) - # Get all current sections sorted by line number - current_sections = sorted([(int(ln), curr_hierarchy) for ln, curr_hierarchy in all_hierarchy_dict.items()]) - print(f" šŸ“‹ Current sections around line {line_num}: {[(ln, h.split(' > ')[-1] if ' > ' in h else h) for ln, h in current_sections if abs(ln - line_num) <= 15]}") - - next_section_original_hierarchy = None - - # Find the next section that comes after the added section in the current document - for curr_line_num, curr_hierarchy in current_sections: - if curr_line_num > line_num: - # Found the next section in current document - # Now find its original hierarchy in base document - curr_line_str = str(curr_line_num) - - # Get the original hierarchy for this next section - # Use the same logic as build_complete_original_hierarchy to get original content - if curr_line_str in base_hierarchy_dict: - # Check if this section was modified - was_modified = False - for modified_line in operations.get('modified_lines', []): - if (modified_line.get('is_header') and - modified_line.get('line_number') 
== curr_line_num and - 'original_content' in modified_line): - # This section was modified, use original content - original_line = modified_line['original_content'].strip() - base_hierarchy = base_hierarchy_dict[curr_line_str] - - if ' > ' in base_hierarchy: - # Replace the leaf with original content - hierarchy_parts = base_hierarchy.split(' > ') - hierarchy_parts[-1] = original_line - next_section_original_hierarchy = ' > '.join(hierarchy_parts) - else: - next_section_original_hierarchy = original_line - - print(f" āœ… Found next section (modified): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") - was_modified = True - break - - if not was_modified: - # Section was not modified, use base hierarchy directly - next_section_original_hierarchy = base_hierarchy_dict[curr_line_str] - print(f" āœ… Found next section (unchanged): line {curr_line_num} -> {next_section_original_hierarchy.split(' > ')[-1] if ' > ' in next_section_original_hierarchy else next_section_original_hierarchy}") - - break - else: - # This next section might also be new or modified - # Try to find it by content matching in base hierarchy - found_match = False - for base_line_str, base_hierarchy in base_hierarchy_dict.items(): - # Compare the leaf titles (last part of hierarchy) - curr_leaf = curr_hierarchy.split(' > ')[-1] if ' > ' in curr_hierarchy else curr_hierarchy - base_leaf = base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy - - # Clean titles for comparison - curr_clean = clean_title_for_matching(curr_leaf) - base_clean = clean_title_for_matching(base_leaf) - - if curr_clean == base_clean: - next_section_original_hierarchy = base_hierarchy - print(f" āœ… Found next section (by content): {base_hierarchy.split(' > ')[-1] if ' > ' in base_hierarchy else base_hierarchy}") - found_match = True - break - - if found_match: - break - else: - print(f" āš ļø Next section at line {curr_line_num} not found in base, continuing search...") - - # If no next section found, this is being added at the end - if not next_section_original_hierarchy: - print(f" āœ… Bottom section detected: this section is added at the end of document") - # Use special marker for bottom added sections - no matching needed - next_section_original_hierarchy = f"bottom-added-{line_num}" - - source_diff_dict[f"added_{line_num_str}"] = { - "new_line_number": line_num, - "original_hierarchy": next_section_original_hierarchy, - "operation": "added", - "new_content": extract_section_content_for_diff(line_num, all_hierarchy_dict), - "old_content": None # Added sections have no old content - } - - # Process deleted sections - use original hierarchy from base file - for line_num_str, hierarchy in deleted_sections.items(): - line_num = int(line_num_str) - # Use complete hierarchy from base file - original_hierarchy = base_hierarchy_dict.get(line_num_str, hierarchy) - - # Extract old content for deleted sections - old_content = extract_old_content_for_diff(line_num, base_hierarchy_dict, base_file_content) - - source_diff_dict[f"deleted_{line_num_str}"] = { - "new_line_number": line_num, - "original_hierarchy": original_hierarchy, - "operation": "deleted", - "new_content": None, # No new content for deleted sections - "old_content": old_content # Show what was deleted - } - - # Sort the dictionary by new_line_number for better readability - sorted_items = sorted(source_diff_dict.items(), key=lambda x: x[1]['new_line_number']) - 
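# Illustrative sketch of the sort applied here: diff entries keyed by
# operation and line number are reordered by their new_line_number field
# before the dictionary is rebuilt. The sample entries are hypothetical.
sample_diff = {
    "added_40":    {"new_line_number": 40, "operation": "added"},
    "modified_12": {"new_line_number": 12, "operation": "modified"},
    "deleted_88":  {"new_line_number": 88, "operation": "deleted"},
}
sample_diff = dict(sorted(sample_diff.items(), key=lambda x: x[1]["new_line_number"]))
assert list(sample_diff) == ["modified_12", "added_40", "deleted_88"]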
source_diff_dict = dict(sorted_items) - - return source_diff_dict - -def analyze_source_changes(pr_url, github_client, special_files=None, ignore_files=None, repo_configs=None, max_non_system_sections=120, pr_diff=None): - """Analyze source language changes and categorize them as added, modified, or deleted""" - # Import modules needed in this function - import os - import json - from toc_processor import process_toc_operations - - owner, repo, pr_number = parse_pr_url(pr_url) - repository = github_client.get_repo(f"{owner}/{repo}") - pr = repository.get_pull(pr_number) - - # Get repository configuration for target repo info - repo_config = get_repo_config(pr_url, repo_configs) - - print(f"šŸ“‹ Processing PR #{pr_number}: {pr.title}") - - # Get markdown files - files = pr.get_files() - markdown_files = [f for f in files if f.filename.endswith('.md')] - - print(f"šŸ“„ Found {len(markdown_files)} markdown files") - - # Return dictionaries for different operation types - added_sections = {} # New sections that were added - modified_sections = {} # Existing sections that were modified - deleted_sections = {} # Sections that were deleted - added_files = {} # Completely new files that were added - deleted_files = [] # Completely deleted files - ignored_files = [] # Files that were ignored - toc_files = {} # Special TOC files requiring special processing - - for file in markdown_files: - print(f"\nšŸ” Analyzing {file.filename}") - - # Check if this file should be ignored - if file.filename in ignore_files: - print(f" ā­ļø Skipping ignored file: {file.filename}") - ignored_files.append(file.filename) - continue - - # Check if this is a completely new file or deleted file - if file.status == 'added': - print(f" āž• Detected new file: {file.filename}") - try: - file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8') - added_files[file.filename] = file_content - print(f" āœ… Added complete file for translation") - continue - except Exception as e: - print(f" āŒ Error getting new file content: {e}") - continue - - elif file.status == 'removed': - print(f" šŸ—‘ļø Detected deleted file: {file.filename}") - deleted_files.append(file.filename) - print(f" āœ… Marked file for deletion") - continue - - # For modified files, check if it's a special file like TOC.md - try: - file_content = repository.get_contents(file.filename, ref=pr.head.sha).decoded_content.decode('utf-8') - except Exception as e: - print(f" āŒ Error getting content: {e}") - continue - - # Check if this is a TOC.md file requiring special processing - if os.path.basename(file.filename) in special_files: - print(f" šŸ“‹ Detected special file: {file.filename}") - - # Get target file content for comparison - try: - target_repository = github_client.get_repo(repo_config['target_repo']) - target_file_content = target_repository.get_contents(file.filename, ref="master").decoded_content.decode('utf-8') - target_lines = target_file_content.split('\n') - except Exception as e: - print(f" āš ļø Could not get target file content: {e}") - continue - - # Analyze diff operations for TOC.md - operations = analyze_diff_operations(file) - source_lines = file_content.split('\n') - - # Process with special TOC logic - toc_results = process_toc_operations(file.filename, operations, source_lines, target_lines, "") # Local path will be determined later - - # Store TOC operations for later processing - if any([toc_results['added'], toc_results['modified'], toc_results['deleted']]): - # Combine all operations 
for processing - all_toc_operations = [] - all_toc_operations.extend(toc_results['added']) - all_toc_operations.extend(toc_results['modified']) - all_toc_operations.extend(toc_results['deleted']) - - # Add to special TOC processing queue (separate from regular sections) - toc_files[file.filename] = { - 'type': 'toc', - 'operations': all_toc_operations - } - - print(f" šŸ“‹ TOC operations queued for processing:") - if toc_results['added']: - print(f" āž• Added: {len(toc_results['added'])} entries") - if toc_results['modified']: - print(f" āœļø Modified: {len(toc_results['modified'])} entries") - if toc_results['deleted']: - print(f" āŒ Deleted: {len(toc_results['deleted'])} entries") - else: - print(f" ā„¹ļø No TOC operations found") - - continue # Skip regular processing for TOC files - - # Analyze diff operations - operations = analyze_diff_operations(file) - print(f" šŸ“ Diff analysis: {len(operations['added_lines'])} added, {len(operations['modified_lines'])} modified, {len(operations['deleted_lines'])} deleted lines") - - lines = file_content.split('\n') - all_headers = {} - - # Track code block state - in_code_block = False - code_block_delimiter = None - - # First pass: collect all headers (excluding those in code blocks) - for line_num, line in enumerate(lines, 1): - original_line = line - line = line.strip() - - # Check for code block delimiters - if line.startswith('```') or line.startswith('~~~'): - if not in_code_block: - # Entering a code block - in_code_block = True - code_block_delimiter = line[:3] - continue - elif line.startswith(code_block_delimiter): - # Exiting a code block - in_code_block = False - code_block_delimiter = None - continue - - # Skip processing if we're inside a code block - if in_code_block: - continue - - # Process headers only if not in code block - if line.startswith('#'): - match = re.match(r'^(#{1,10})\s+(.+)', line) - if match: - level = len(match.group(1)) - title = match.group(2).strip() - all_headers[line_num] = { - 'level': level, - 'title': title, - 'line': line - } - - # Build complete hierarchy from HEAD (after changes) - all_hierarchy_dict = build_hierarchy_dict(file_content) - - # For deletion detection, we also need the base file hierarchy - try: - base_file_content = repository.get_contents(file.filename, ref=f"{repository.default_branch}").decoded_content.decode('utf-8') - base_hierarchy_dict = build_hierarchy_dict(base_file_content) - except Exception as e: - print(f" āš ļø Could not get base file content: {e}") - base_hierarchy_dict = all_hierarchy_dict - base_file_content = file_content # Fallback to current content - - # Find sections by operation type with corrected logic - sections_by_type = find_sections_by_operation_type(lines, operations, all_headers, base_hierarchy_dict) - - # Prioritize modified headers over added ones (fix for header changes like --host -> --hosts) - modified_header_lines = set() - for modified_line in operations['modified_lines']: - if modified_line['is_header']: - modified_header_lines.add(modified_line['line_number']) - - # Remove modified header lines from added set - sections_by_type['added'] = sections_by_type['added'] - modified_header_lines - - # Enhanced logic: check for actual content changes within sections - # This helps detect changes in section content (not just headers) - print(f" šŸ” Enhanced detection: checking for actual section content changes...") - - # Get only lines that have actual content changes (exclude headers) - real_content_changes = set() - - # Added lines (new content, 
excluding headers) - for added_line in operations['added_lines']: - if not added_line['is_header']: - real_content_changes.add(added_line['line_number']) - - # Deleted lines (removed content, excluding headers) - for deleted_line in operations['deleted_lines']: - if not deleted_line['is_header']: - real_content_changes.add(deleted_line['line_number']) - - # Modified lines (changed content, excluding headers) - for modified_line in operations['modified_lines']: - if not modified_line['is_header']: - real_content_changes.add(modified_line['line_number']) - - print(f" šŸ“ Real content changes (non-header): {sorted(real_content_changes)}") - - # Find sections that contain actual content changes - content_affected_sections = set() - for changed_line in real_content_changes: - # Find which section this changed line belongs to - containing_section = None - for line_num in sorted(all_headers.keys()): - if line_num <= changed_line: - containing_section = line_num - else: - break - - if containing_section and containing_section not in sections_by_type['added']: - # Additional check: make sure this is not just a line number shift - # Only add if the change is within reasonable distance from the section header - # AND if the changed line is not part of a completely deleted section header - is_deleted_header = False - for deleted_line in operations['deleted_lines']: - if (deleted_line['is_header'] and - abs(changed_line - deleted_line['line_number']) <= 2): - is_deleted_header = True - print(f" āš ļø Skipping change at line {changed_line} (deleted header near line {deleted_line['line_number']})") - break - - # More precise filtering: check if this change is actually meaningful - # Skip changes that are part of deleted content or line shifts due to deletions - should_include = True - - # Skip exact deleted headers - for deleted_line in operations['deleted_lines']: - if (deleted_line['is_header'] and - changed_line == deleted_line['line_number']): - should_include = False - print(f" āš ļø Skipping change at line {changed_line} (exact deleted header)") - break - - # Skip changes that are very close to deleted content AND far from their containing section - # This helps filter out line shift artifacts while keeping real content changes - if should_include: - for deleted_line in operations['deleted_lines']: - # Only skip if both conditions are met: - # 1. Very close to deleted content (within 5 lines) - # 2. 
The change is far from its containing section (likely a shift artifact) - distance_to_deletion = abs(changed_line - deleted_line['line_number']) - distance_to_section = changed_line - containing_section - - if (distance_to_deletion <= 5 and distance_to_section > 100): - should_include = False - print(f" āš ļø Skipping change at line {changed_line} (likely line shift: {distance_to_deletion} lines from deletion, {distance_to_section} from section)") - break - - if should_include and changed_line - containing_section <= 30: - content_affected_sections.add(containing_section) - print(f" šŸ“ Content change at line {changed_line} affects section at line {containing_section}") - elif should_include: - print(f" āš ļø Skipping distant change at line {changed_line} from section {containing_section}") - - # Add content-modified sections to the modified set, but exclude sections that are already marked as added or deleted - for line_num in content_affected_sections: - if (line_num not in sections_by_type['modified'] and - line_num not in sections_by_type['added'] and - line_num not in sections_by_type['deleted']): # āœ… Critical fix: exclude deleted sections - sections_by_type['modified'].add(line_num) - print(f" šŸ“ Added content-modified section at line {line_num}") - elif line_num in sections_by_type['deleted']: - print(f" 🚫 Skipping content-modified section at line {line_num}: already marked as deleted") - - # Prepare sections data for source_diff_dict - file_modified = {} - file_added = {} - file_deleted = {} - - # Build modified sections - for line_num in sections_by_type['modified']: - if line_num in all_hierarchy_dict: - file_modified[str(line_num)] = all_hierarchy_dict[line_num] - - # Build added sections - for line_num in sections_by_type['added']: - if line_num in all_hierarchy_dict: - file_added[str(line_num)] = all_hierarchy_dict[line_num] - - # Build deleted sections - for line_num in sections_by_type['deleted']: - if line_num in base_hierarchy_dict: - file_deleted[str(line_num)] = base_hierarchy_dict[line_num] - - # Check for frontmatter changes (content before first top-level header) - print(f" šŸ” Checking for frontmatter changes...") - frontmatter_changed = False - - # Check if any changes occur before the first top-level header - first_header_line = None - for line_num in sorted(all_headers.keys()): - header_info = all_headers[line_num] - if header_info['level'] == 1: # First top-level header - first_header_line = line_num - break - - print(f" šŸ“Š First header line: {first_header_line}") - print(f" šŸ“Š Real content changes: {sorted(real_content_changes)}") - - if first_header_line: - # Check if any real content changes are before the first header - for line_num in real_content_changes: - #print(f" šŸ” Checking line {line_num} vs first header {first_header_line}") - if line_num < first_header_line: - frontmatter_changed = True - print(f" šŸ“„ Frontmatter change detected: line {line_num} < {first_header_line}") - break - - print(f" šŸ“Š Frontmatter changed: {frontmatter_changed}") - - if frontmatter_changed: - print(f" šŸ“„ Frontmatter changes detected (before line {first_header_line})") - # Add frontmatter as a special section with line number 0 - file_modified["0"] = "frontmatter" - print(f" āœ… Added frontmatter section to modified sections") - - # Build source diff dictionary - source_diff_dict = build_source_diff_dict( - file_modified, file_added, file_deleted, - all_hierarchy_dict, base_hierarchy_dict, - operations, file_content, base_file_content - ) - - # Breakpoint: 
Output source_diff_dict to file for review with file prefix - - # Ensure temp_output directory exists - script_dir = os.path.dirname(os.path.abspath(__file__)) - temp_dir = os.path.join(script_dir, "temp_output") - os.makedirs(temp_dir, exist_ok=True) - - file_prefix = file.filename.replace('/', '-').replace('.md', '') - output_file = os.path.join(temp_dir, f"{file_prefix}-source-diff-dict.json") - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(source_diff_dict, f, ensure_ascii=False, indent=2) - - print(f" šŸ’¾ Saved source diff dictionary to: {output_file}") - print(f" šŸ“Š Source diff dictionary contains {len(source_diff_dict)} sections:") - for key, diff_info in source_diff_dict.items(): - print(f" {diff_info['operation']}: {key} -> original_hierarchy: {diff_info['original_hierarchy']}") - - # source-diff-dict.json generation is complete, continue to next step in main.py - - # For modified headers, we need to build a mapping using original titles for matching - original_hierarchy_dict = all_hierarchy_dict.copy() - - # Update hierarchy dict to use original content for modified headers when needed for matching - for line_num in sections_by_type['modified']: - if line_num in all_headers: - header_info = all_headers[line_num] - # Check if this header was modified and has original content - for op in operations['modified_lines']: - if (op['is_header'] and - op['line_number'] == line_num and - 'original_content' in op): - # Create hierarchy path using original content for matching - original_line = op['original_content'].strip() - if original_line.startswith('#'): - # Build original hierarchy for matching - original_hierarchy = build_hierarchy_for_modified_section( - file_content, line_num, original_line, all_hierarchy_dict) - if original_hierarchy: - original_hierarchy_dict[line_num] = original_hierarchy - break - - # Process added sections - if sections_by_type['added']: - file_added = {} - # Find insertion points using the simplified logic: - # Record the previous section hierarchy for each added section - insertion_points = find_previous_section_for_added(sections_by_type['added'], all_hierarchy_dict) - - # Get actual content for added sections - for line_num in sections_by_type['added']: - if line_num in all_hierarchy_dict: - file_added[str(line_num)] = all_hierarchy_dict[line_num] - - # Get source sections content (actual content, not just hierarchy) - if file_added: - source_sections_content = get_source_sections_content(pr_url, file.filename, file_added, github_client) - file_added = source_sections_content # Replace hierarchy with actual content - - if file_added: - added_sections[file.filename] = { - 'sections': file_added, - 'insertion_points': insertion_points - } - print(f" āž• Found {len(file_added)} added sections with {len(insertion_points)} insertion points") - - # Process modified sections - if sections_by_type['modified']: - file_modified = {} - for line_num in sections_by_type['modified']: - if line_num in original_hierarchy_dict: - file_modified[str(line_num)] = original_hierarchy_dict[line_num] - - if file_modified: - modified_sections[file.filename] = { - 'sections': file_modified, - 'original_hierarchy': original_hierarchy_dict, - 'current_hierarchy': all_hierarchy_dict - } - print(f" āœļø Found {len(file_modified)} modified sections") - - # Process deleted sections - if sections_by_type['deleted']: - file_deleted = {} - for line_num in sections_by_type['deleted']: - # Use base hierarchy to get the deleted section info - if line_num in 
base_hierarchy_dict: - file_deleted[str(line_num)] = base_hierarchy_dict[line_num] - - if file_deleted: - deleted_sections[file.filename] = file_deleted - print(f" āŒ Found {len(file_deleted)} deleted sections") - - # Enhanced logic: also check content-level changes using legacy detection - # This helps detect changes in section content (not just headers) - print(f" šŸ” Enhanced detection: checking content-level changes...") - changed_lines = get_changed_line_ranges(file) - affected_sections = find_affected_sections(lines, changed_lines, all_headers) - - legacy_modified = {} - for line_num in affected_sections: - if line_num in all_hierarchy_dict: - section_hierarchy = all_hierarchy_dict[line_num] - # Only add if not already detected by operation-type analysis - already_detected = False - if file.filename in modified_sections: - for existing_line, existing_hierarchy in modified_sections[file.filename].get('sections', {}).items(): - if existing_hierarchy == section_hierarchy: - already_detected = True - break - - if not already_detected: - legacy_modified[str(line_num)] = section_hierarchy - - if legacy_modified: - print(f" āœ… Enhanced detection found {len(legacy_modified)} additional content-modified sections") - # Merge with existing modified sections - if file.filename in modified_sections: - # Merge the sections - existing_sections = modified_sections[file.filename].get('sections', {}) - existing_sections.update(legacy_modified) - modified_sections[file.filename]['sections'] = existing_sections - else: - # Create new entry - modified_sections[file.filename] = { - 'sections': legacy_modified, - 'original_hierarchy': all_hierarchy_dict, - 'current_hierarchy': all_hierarchy_dict - } - - print(f"\nšŸ“Š Summary:") - #print(f" āœļø Modified files: {} files") - print(f" šŸ“„ Added files: {len(added_files)} files") - print(f" šŸ—‘ļø Deleted files: {len(deleted_files)} files") - print(f" šŸ“‹ TOC files: {len(toc_files)} files") - if ignored_files: - print(f" ā­ļø Ignored files: {len(ignored_files)} files") - for ignored_file in ignored_files: - print(f" - {ignored_file}") - - return added_sections, modified_sections, deleted_sections, added_files, deleted_files, toc_files diff --git a/scripts/translate_doc_pr/requirements.txt b/scripts/translate_doc_pr/requirements.txt deleted file mode 100644 index d8336cf8cebe7..0000000000000 --- a/scripts/translate_doc_pr/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -PyGithub>=1.55.0 -openai>=1.0.0 -tiktoken>=0.4.0 -google-generativeai>=0.3.0 diff --git a/scripts/translate_doc_pr/section_matcher.py b/scripts/translate_doc_pr/section_matcher.py deleted file mode 100644 index ce4ef61116c89..0000000000000 --- a/scripts/translate_doc_pr/section_matcher.py +++ /dev/null @@ -1,973 +0,0 @@ -""" -Section Matcher Module -Handles section hierarchy matching including direct matching and AI matching -""" - -import os -import re -import json -import threading -from github import Github -from openai import OpenAI - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def clean_title_for_matching(title): - """Clean title for matching by removing markdown formatting and span elements""" - if not title: - return "" - - # Remove span elements like New in v5.0 - title = re.sub(r']*>.*?', '', title) - - # Remove markdown header prefix (# ## ### etc.) 
- title = re.sub(r'^#{1,6}\s*', '', title.strip()) - - # Remove backticks - title = title.replace('`', '') - - # Strip whitespace - title = title.strip() - - return title - -def is_system_variable_or_config(title): - """Check if a title represents a system variable or configuration item""" - cleaned_title = clean_title_for_matching(title) - - if not cleaned_title: - return False - - # Check if original title had backticks (indicating code/config item) - original_has_backticks = '`' in title - - # System variables and config items are typically: - # 1. Alphanumeric characters with underscores, hyphens, dots, or percent signs - # 2. No spaces in the middle - # 3. Often contain underscores, hyphens, dots, or percent signs - # 4. May contain uppercase letters (like alert rule names) - # 5. Single words wrapped in backticks (like `capacity`, `engine`) - - # Check if it contains only allowed characters (including % for metrics/alerts) - allowed_chars = re.match(r'^[a-zA-Z0-9_\-\.%]+$', cleaned_title) - - # Check if it contains at least one separator (common in system vars/config/alerts) - has_separator = ('_' in cleaned_title or '-' in cleaned_title or - '.' in cleaned_title or '%' in cleaned_title) - - # Check if it doesn't contain spaces (spaces would indicate it's likely a regular title) - no_spaces = ' ' not in cleaned_title - - # Additional patterns for alert rules and metrics - is_alert_rule = (cleaned_title.startswith('PD_') or - cleaned_title.startswith('TiDB_') or - cleaned_title.startswith('TiKV_') or - cleaned_title.endswith('_alert') or - '%' in cleaned_title) - - # NEW: Check if it's a single word in backticks (config/variable name) - # Examples: `capacity`, `engine`, `enable`, `dirname` etc. - is_single_backticked_word = (original_has_backticks and - allowed_chars and - no_spaces and - len(cleaned_title.split()) == 1) - - return bool(allowed_chars and (has_separator or is_alert_rule or is_single_backticked_word) and no_spaces) - -def find_toplevel_title_matches(source_sections, target_lines): - """Find matches for top-level titles (# Level) by direct pattern matching""" - matched_dict = {} - failed_matches = [] - skipped_sections = [] - - thread_safe_print(f"šŸ” Searching for top-level title matches") - - for source_line_num, source_hierarchy in source_sections.items(): - # Extract the leaf title from hierarchy - source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy - - # Only process top-level titles - if not source_leaf_title.startswith('# '): - skipped_sections.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'Not a top-level title' - }) - continue - - thread_safe_print(f" šŸ“ Looking for top-level match: {source_leaf_title}") - - # Find the first top-level title in target document - target_match = None - for line_num, line in enumerate(target_lines, 1): - line = line.strip() - if line.startswith('# '): - target_match = { - 'line_num': line_num, - 'title': line, - 'hierarchy_string': line[2:].strip() # Remove '# ' prefix for hierarchy - } - thread_safe_print(f" āœ“ Found target top-level at line {line_num}: {line}") - break - - if target_match: - matched_dict[str(target_match['line_num'])] = target_match['hierarchy_string'] - thread_safe_print(f" āœ… Top-level match: line {target_match['line_num']}") - else: - thread_safe_print(f" āŒ No top-level title found in target") - failed_matches.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'No top-level title 
found in target' - }) - - thread_safe_print(f"šŸ“Š Top-level matching result: {len(matched_dict)} matches found") - if failed_matches: - thread_safe_print(f"āš ļø {len(failed_matches)} top-level sections failed to match:") - for failed in failed_matches: - thread_safe_print(f" āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}") - - return matched_dict, failed_matches, skipped_sections - - -def find_direct_matches_for_special_files(source_sections, target_hierarchy, target_lines): - """Find direct matches for system variables/config items without using AI""" - matched_dict = {} - failed_matches = [] - skipped_sections = [] - - # Build target headers with hierarchy paths - target_headers = {} - for line_num, line in enumerate(target_lines, 1): - line = line.strip() - if line.startswith('#'): - match = re.match(r'^(#{1,10})\s+(.+)', line) - if match: - level = len(match.group(1)) - title = match.group(2).strip() - target_headers[line_num] = { - 'level': level, - 'title': title, - 'line': line - } - - thread_safe_print(f" šŸ” Searching for direct matches among {len(target_headers)} target headers") - - for source_line_num, source_hierarchy in source_sections.items(): - # Extract the leaf title from hierarchy - source_leaf_title = source_hierarchy.split(' > ')[-1] if ' > ' in source_hierarchy else source_hierarchy - source_clean_title = clean_title_for_matching(source_leaf_title) - - thread_safe_print(f" šŸ“ Looking for match: {source_clean_title}") - - if not is_system_variable_or_config(source_leaf_title): - thread_safe_print(f" āš ļø Not a system variable/config, skipping direct match") - skipped_sections.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'Not a system variable or config item' - }) - continue - - # Find potential matches in target - potential_matches = [] - for target_line_num, target_header in target_headers.items(): - target_clean_title = clean_title_for_matching(target_header['title']) - - if source_clean_title == target_clean_title: - # Build hierarchy path for this target header - hierarchy_path = build_hierarchy_path(target_lines, target_line_num, target_headers) - potential_matches.append({ - 'line_num': target_line_num, - 'header': target_header, - 'hierarchy_path': hierarchy_path, - 'hierarchy_string': ' > '.join([f"{'#' * h['level']} {h['title']}" for h in hierarchy_path if h['level'] > 1 or len(hierarchy_path) == 1]) - }) - thread_safe_print(f" āœ“ Found potential match at line {target_line_num}: {target_header['title']}") - - if len(potential_matches) == 1: - # Single match found - match = potential_matches[0] - matched_dict[str(match['line_num'])] = match['hierarchy_string'] - thread_safe_print(f" āœ… Direct match: line {match['line_num']}") - elif len(potential_matches) > 1: - # Multiple matches, need to use parent hierarchy to disambiguate - thread_safe_print(f" šŸ”€ Multiple matches found ({len(potential_matches)}), using parent hierarchy") - - # Extract parent hierarchy from source - source_parts = source_hierarchy.split(' > ') - if len(source_parts) > 1: - source_parent_titles = [clean_title_for_matching(part) for part in source_parts[:-1]] - - best_match = None - best_score = -1 - - for match in potential_matches: - # Compare parent hierarchy - target_parent_titles = [clean_title_for_matching(h['title']) for h in match['hierarchy_path'][:-1]] - - # Calculate similarity score - score = 0 - min_len = min(len(source_parent_titles), len(target_parent_titles)) - - for i in range(min_len): - if i < 
len(source_parent_titles) and i < len(target_parent_titles): - if source_parent_titles[-(i+1)] == target_parent_titles[-(i+1)]: # Compare from end - score += 1 - else: - break - - thread_safe_print(f" šŸ“Š Match at line {match['line_num']} score: {score}") - - if score > best_score: - best_score = score - best_match = match - - if best_match and best_score > 0: - matched_dict[str(best_match['line_num'])] = best_match['hierarchy_string'] - thread_safe_print(f" āœ… Best match: line {best_match['line_num']} (score: {best_score})") - else: - thread_safe_print(f" āŒ No good parent hierarchy match found") - failed_matches.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'Multiple matches found but no good parent hierarchy match' - }) - else: - thread_safe_print(f" āš ļø No parent hierarchy in source, cannot disambiguate") - failed_matches.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'Multiple matches found but no parent hierarchy to disambiguate' - }) - else: - thread_safe_print(f" āŒ No matches found for: {source_clean_title}") - # Try fuzzy matching for similar titles (e.g., --host vs --hosts) - fuzzy_matched = False - source_clean_lower = source_clean_title.lower() - for target_header in target_headers: - # Handle both dict and tuple formats - if isinstance(target_header, dict): - target_clean = clean_title_for_matching(target_header['title']) - elif isinstance(target_header, (list, tuple)) and len(target_header) >= 2: - target_clean = clean_title_for_matching(target_header[1]) # title is at index 1 - else: - continue # Skip invalid entries - target_clean_lower = target_clean.lower() - # Check for similar titles (handle plural/singular and minor differences) - # Case 1: One is substring of another (e.g., --host vs --hosts) - # Case 2: Small character difference (1-2 characters) - len_diff = abs(len(source_clean_lower) - len(target_clean_lower)) - if (len_diff <= 2 and - (source_clean_lower in target_clean_lower or - target_clean_lower in source_clean_lower)): - thread_safe_print(f" ā‰ˆ Fuzzy match found: {source_clean_title} ā‰ˆ {target_clean}") - if isinstance(target_header, dict): - matched_dict[str(target_header['line_num'])] = target_header['hierarchy_string'] - thread_safe_print(f" āœ… Fuzzy match: line {target_header['line_num']}") - elif isinstance(target_header, (list, tuple)) and len(target_header) >= 3: - matched_dict[str(target_header[0])] = target_header[2] # line_num at index 0, hierarchy at index 2 - thread_safe_print(f" āœ… Fuzzy match: line {target_header[0]}") - fuzzy_matched = True - break - - if not fuzzy_matched: - failed_matches.append({ - 'line_num': source_line_num, - 'hierarchy': source_hierarchy, - 'reason': 'No matching section found in target' - }) - - thread_safe_print(f" šŸ“Š Direct matching result: {len(matched_dict)} matches found") - - if failed_matches: - thread_safe_print(f" āš ļø {len(failed_matches)} sections failed to match:") - for failed in failed_matches: - thread_safe_print(f" āŒ Line {failed['line_num']}: {failed['hierarchy']} - {failed['reason']}") - - if skipped_sections: - thread_safe_print(f" ā„¹ļø {len(skipped_sections)} sections skipped (not system variables/config):") - for skipped in skipped_sections: - thread_safe_print(f" ā­ļø Line {skipped['line_num']}: {skipped['hierarchy']} - {skipped['reason']}") - - return matched_dict, failed_matches, skipped_sections - -def filter_non_system_sections(target_hierarchy): - """Filter out system variable/config sections 
from target hierarchy for AI mapping""" - filtered_hierarchy = {} - system_sections_count = 0 - - for line_num, hierarchy in target_hierarchy.items(): - # Extract the leaf title from hierarchy - leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy - - if is_system_variable_or_config(leaf_title): - system_sections_count += 1 - else: - filtered_hierarchy[line_num] = hierarchy - - thread_safe_print(f" šŸ”§ Filtered target hierarchy: {len(filtered_hierarchy)} non-system sections (removed {system_sections_count} system sections)") - - return filtered_hierarchy - -def get_corresponding_sections(source_sections, target_sections, ai_client, source_language, target_language, max_tokens=20000): - """Use AI to find corresponding sections between different languages""" - - # Format source sections - source_text = "\n".join(source_sections) - target_text = "\n".join(target_sections) - number_of_sections = len(source_sections) - - prompt = f"""I am aligning the {source_language} and {target_language} documentation for TiDB. I have modified the following {number_of_sections} sections in the {source_language} file: - -{source_text} - -Here is the section structure of the corresponding {target_language} file. Please select the corresponding {number_of_sections} sections in {target_language} from the following list that I should modify. Do not output any other text, return the Markdown code block enclosed in three backticks. - -{target_text}""" - - thread_safe_print(f"\n šŸ“¤ AI Mapping Prompt ({source_language} → {target_language}):") - thread_safe_print(f" " + "="*80) - thread_safe_print(f" {prompt}") - thread_safe_print(f" " + "="*80) - - # Import token estimation function from main - try: - from main import print_token_estimation - print_token_estimation(prompt, f"Section mapping ({source_language} → {target_language})") - except ImportError: - # Fallback if import fails - use tiktoken - try: - import tiktoken - enc = tiktoken.get_encoding("cl100k_base") - tokens = enc.encode(prompt) - actual_tokens = len(tokens) - char_count = len(prompt) - thread_safe_print(f" šŸ’° Section mapping ({source_language} → {target_language})") - thread_safe_print(f" šŸ“ Input: {char_count:,} characters") - thread_safe_print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") - except Exception: - # Final fallback to character approximation - estimated_tokens = len(prompt) // 4 - char_count = len(prompt) - thread_safe_print(f" šŸ’° Section mapping ({source_language} → {target_language})") - thread_safe_print(f" šŸ“ Input: {char_count:,} characters") - thread_safe_print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") - - try: - ai_response = ai_client.chat_completion( - messages=[ - {"role": "user", "content": prompt} - ], - temperature=0.1, - max_tokens=max_tokens - ) - - thread_safe_print(f"\n šŸ“„ AI Mapping Response:") - thread_safe_print(f" " + "-"*80) - thread_safe_print(f" {ai_response}") - thread_safe_print(f" " + "-"*80) - - return ai_response - except Exception as e: - print(f" āŒ AI mapping error: {e}") - return None - -def parse_ai_response(ai_response): - """Parse AI response to extract section names""" - sections = [] - lines = ai_response.split('\n') - - for line in lines: - line = line.strip() - # Skip markdown code block markers and empty lines - if line and not line.startswith('```'): - # Remove leading "## " if present and clean up - if line.startswith('## '): - sections.append(line) - elif line.startswith('- '): - # 
Handle cases where AI returns a list - sections.append(line[2:].strip()) - - return sections - -def find_matching_line_numbers(ai_sections, target_hierarchy_dict): - """Find line numbers in target hierarchy dict that match AI sections""" - matched_dict = {} - - for ai_section in ai_sections: - # Look for exact matches first - found = False - for line_num, hierarchy in target_hierarchy_dict.items(): - if hierarchy == ai_section: - matched_dict[str(line_num)] = hierarchy - found = True - break - - if not found: - # Look for partial matches (in case of slight differences) - for line_num, hierarchy in target_hierarchy_dict.items(): - # Remove common variations and compare - ai_clean = ai_section.replace('### ', '').replace('## ', '').strip() - hierarchy_clean = hierarchy.replace('### ', '').replace('## ', '').strip() - - if ai_clean in hierarchy_clean or hierarchy_clean in ai_clean: - matched_dict[str(line_num)] = hierarchy - thread_safe_print(f" ā‰ˆ Partial match found at line {line_num}: {hierarchy}") - found = True - break - - if not found: - thread_safe_print(f" āœ— No match found for: {ai_section}") - - return matched_dict - -def build_hierarchy_path(lines, line_num, all_headers): - """Build the full hierarchy path for a header at given line (from auto-sync-pr-changes.py)""" - if line_num not in all_headers: - return [] - - current_header = all_headers[line_num] - current_level = current_header['level'] - hierarchy_path = [] - - # Find all parent headers - for check_line in sorted(all_headers.keys()): - if check_line >= line_num: - break - - header = all_headers[check_line] - if header['level'] < current_level: - # This is a potential parent - # Remove any headers at same or deeper level - while hierarchy_path and hierarchy_path[-1]['level'] >= header['level']: - hierarchy_path.pop() - hierarchy_path.append(header) - - # Add current header - hierarchy_path.append(current_header) - - return hierarchy_path - -def map_insertion_points_to_target(insertion_points, target_hierarchy, target_lines, file_path, pr_url, github_client, ai_client, repo_config, max_non_system_sections=120): - """Map source insertion points to target language locations""" - target_insertion_points = {} - - thread_safe_print(f" šŸ“ Mapping {len(insertion_points)} insertion points to target...") - - for group_key, point_info in insertion_points.items(): - previous_section_hierarchy = point_info['previous_section_hierarchy'] - thread_safe_print(f" šŸ” Finding target location for: {previous_section_hierarchy}") - - # Extract title for system variable checking - if ' > ' in previous_section_hierarchy: - title = previous_section_hierarchy.split(' > ')[-1] - else: - title = previous_section_hierarchy - - # Check if this is a system variable/config that can be directly matched - cleaned_title = clean_title_for_matching(title) - if is_system_variable_or_config(cleaned_title): - thread_safe_print(f" šŸŽÆ Direct matching for system var/config: {cleaned_title}") - - # Direct matching for system variables - temp_source = {point_info['previous_section_line']: previous_section_hierarchy} - matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( - temp_source, target_hierarchy, target_lines - ) - - if matched_dict: - # Get the first (and should be only) matched target line - target_line = list(matched_dict.keys())[0] - - # Find the end of this section - target_line_num = int(target_line) - insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines) - - 
target_insertion_points[group_key] = { - 'insertion_after_line': insertion_after_line, - 'target_hierarchy': target_hierarchy.get(str(target_line_num), ''), - 'insertion_type': point_info['insertion_type'], - 'new_sections': point_info['new_sections'] - } - thread_safe_print(f" āœ… Direct match found, insertion after line {insertion_after_line}") - continue - - # If not a system variable or direct matching failed, use AI - thread_safe_print(f" šŸ¤– Using AI mapping for: {cleaned_title}") - - # Filter target hierarchy for AI (remove system sections) - filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) - - # Check if filtered hierarchy is too large for AI - # Use provided max_non_system_sections parameter - if len(filtered_target_hierarchy) > max_non_system_sections: - thread_safe_print(f" āŒ Target hierarchy too large for AI: {len(filtered_target_hierarchy)} > {max_non_system_sections}") - continue - - # Prepare source for AI mapping - temp_source = {str(point_info['previous_section_line']): previous_section_hierarchy} - - # Get AI mapping - ai_response = get_corresponding_sections( - list(temp_source.values()), - list(filtered_target_hierarchy.values()), - ai_client, - repo_config['source_language'], - repo_config['target_language'], - max_tokens=20000 # Use default value since this function doesn't accept max_tokens yet - ) - - if ai_response: - # Parse AI response and find matching line numbers - ai_sections = parse_ai_response(ai_response) - ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) - - if ai_matched and len(ai_matched) > 0: - # Get the first match (we only have one source section) - target_line = list(ai_matched.keys())[0] - target_line_num = int(target_line) - - # Find the end of this section - insertion_after_line = find_section_end_line(target_line_num, target_hierarchy, target_lines) - - target_insertion_points[group_key] = { - 'insertion_after_line': insertion_after_line, - 'target_hierarchy': target_hierarchy.get(target_line, ''), - 'insertion_type': point_info['insertion_type'], - 'new_sections': point_info['new_sections'] - } - thread_safe_print(f" āœ… AI match found, insertion after line {insertion_after_line}") - else: - thread_safe_print(f" āŒ No AI matching sections found for: {previous_section_hierarchy}") - else: - thread_safe_print(f" āŒ No AI response received for: {previous_section_hierarchy}") - - return target_insertion_points - -def extract_hierarchies_from_diff_dict(source_diff_dict): - """Extract original_hierarchy from source_diff_dict for section matching""" - extracted_hierarchies = {} - - for key, diff_info in source_diff_dict.items(): - operation = diff_info.get('operation', '') - original_hierarchy = diff_info.get('original_hierarchy', '') - - # Process all sections: modified, deleted, and added - if operation in ['modified', 'deleted', 'added'] and original_hierarchy: - # Use the key as the identifier for the hierarchy - extracted_hierarchies[key] = original_hierarchy - - thread_safe_print(f"šŸ“„ Extracted {len(extracted_hierarchies)} hierarchies from source diff dict:") - for key, hierarchy in extracted_hierarchies.items(): - thread_safe_print(f" {key}: {hierarchy}") - - return extracted_hierarchies - -def match_source_diff_to_target(source_diff_dict, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections=120, max_tokens=20000): - """ - Match source_diff_dict original_hierarchy to target file sections - Uses direct matching for system variables/config and AI matching for others 
- - Returns: - dict: Matched sections with enhanced information including: - - target_line: Line number in target file - - target_hierarchy: Target section hierarchy - - insertion_type: For added sections only - - source_original_hierarchy: Original hierarchy from source - - source_operation: Operation type (modified/added/deleted) - - source_old_content: Old content from source diff - - source_new_content: New content from source diff - """ - thread_safe_print(f"šŸ”— Starting source diff to target matching...") - - # Extract hierarchies from source diff dict - source_hierarchies = extract_hierarchies_from_diff_dict(source_diff_dict) - - if not source_hierarchies: - thread_safe_print(f"āš ļø No hierarchies to match") - return {} - - # Process sections in original order to maintain consistency - # Initialize final matching results with ordered dict to preserve order - from collections import OrderedDict - all_matched_sections = OrderedDict() - - # Categorize sections for processing strategy but maintain order - direct_match_sections = OrderedDict() - ai_match_sections = OrderedDict() - added_sections = OrderedDict() - bottom_sections = OrderedDict() # New category for bottom sections - - for key, hierarchy in source_hierarchies.items(): - # Check if this is a bottom section (no matching needed) - if hierarchy.startswith('bottom-'): - bottom_sections[key] = hierarchy - # Check if this is an added section - elif key.startswith('added_'): - added_sections[key] = hierarchy - else: - # Extract the leaf title from hierarchy for checking - leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy - - # Check if this is suitable for direct matching - if (hierarchy == "frontmatter" or - leaf_title.startswith('# ') or # Top-level titles - is_system_variable_or_config(leaf_title)): # System variables/config - direct_match_sections[key] = hierarchy - else: - ai_match_sections[key] = hierarchy - - thread_safe_print(f"šŸ“Š Section categorization:") - thread_safe_print(f" šŸŽÆ Direct matching: {len(direct_match_sections)} sections") - thread_safe_print(f" šŸ¤– AI matching: {len(ai_match_sections)} sections") - thread_safe_print(f" āž• Added sections: {len(added_sections)} sections") - thread_safe_print(f" šŸ”š Bottom sections: {len(bottom_sections)} sections (no matching needed)") - - # Process each section in original order - thread_safe_print(f"\nšŸ”„ Processing sections in original order...") - - for key, hierarchy in source_hierarchies.items(): - thread_safe_print(f" šŸ” Processing {key}: {hierarchy}") - - # Determine processing strategy based on section type and content - if hierarchy.startswith('bottom-'): - # Bottom section - no matching needed, append to end - thread_safe_print(f" šŸ”š Bottom section - append to end of document") - result = { - "target_line": "-1", # Special marker for bottom sections - "target_hierarchy": hierarchy # Keep the bottom-xxx marker - } - elif key.startswith('added_'): - # Added section - find insertion point - thread_safe_print(f" āž• Added section - finding insertion point") - result = process_added_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens) - else: - # Modified or deleted section - find matching section - operation = source_diff_dict[key].get('operation', 'unknown') - thread_safe_print(f" {operation.capitalize()} section - finding target match") - result = process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, 
max_non_system_sections, max_tokens) - - if result: - # Add source language information from source_diff_dict - source_info = source_diff_dict.get(key, {}) - - # Extract target content from target_lines - target_line = result.get('target_line', 'unknown') - target_content = "" - if target_line != 'unknown' and target_line != '0': - try: - target_line_num = int(target_line) - # For ALL operations, only extract direct content (no sub-sections) - # This avoids duplication when both parent and child sections have operations - target_content = extract_section_direct_content(target_line_num, target_lines) - except (ValueError, IndexError): - target_content = "" - elif target_line == '0': - # For frontmatter, extract content from beginning to first header - target_content = extract_frontmatter_content(target_lines) - - enhanced_result = { - **result, # Include existing target matching info - 'target_content': target_content, # Add target section content - 'source_original_hierarchy': source_info.get('original_hierarchy', ''), - 'source_operation': source_info.get('operation', ''), - 'source_old_content': source_info.get('old_content', ''), - 'source_new_content': source_info.get('new_content', '') - } - all_matched_sections[key] = enhanced_result - thread_safe_print(f" āœ… {key}: -> line {target_line}") - else: - thread_safe_print(f" āŒ {key}: matching failed") - - thread_safe_print(f"\nšŸ“Š Final matching results: {len(all_matched_sections)} total matches") - return all_matched_sections - -def process_modified_or_deleted_section(key, hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000): - """Process modified or deleted sections to find target matches""" - # Extract the leaf title from hierarchy for checking - leaf_title = hierarchy.split(' > ')[-1] if ' > ' in hierarchy else hierarchy - - # Check if this is suitable for direct matching - if (hierarchy == "frontmatter" or - leaf_title.startswith('# ') or # Top-level titles - is_system_variable_or_config(leaf_title)): # System variables/config - - if hierarchy == "frontmatter": - return {"target_line": "0", "target_hierarchy": "frontmatter"} - - elif leaf_title.startswith('# '): - # Top-level title matching - temp_sections = {key: hierarchy} - matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches( - temp_sections, target_lines - ) - if matched_dict: - target_line = list(matched_dict.keys())[0] - # For top-level titles, add # prefix to the hierarchy - return { - "target_line": target_line, - "target_hierarchy": f"# {matched_dict[target_line]}" - } - - else: - # System variable/config matching - temp_sections = {key: hierarchy} - matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( - temp_sections, target_hierarchy, target_lines - ) - if matched_dict: - target_line = list(matched_dict.keys())[0] - target_hierarchy_str = list(matched_dict.values())[0] - - # Extract the leaf title and add # prefix, remove top-level title from hierarchy - if ' > ' in target_hierarchy_str: - # Remove top-level title and keep only the leaf with ## prefix - leaf_title = target_hierarchy_str.split(' > ')[-1] - formatted_hierarchy = f"## {leaf_title}" - else: - # Single level, add ## prefix - formatted_hierarchy = f"## {target_hierarchy_str}" - - return { - "target_line": target_line, - "target_hierarchy": formatted_hierarchy - } - else: - # AI matching for non-system sections - filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) - - if 
len(filtered_target_hierarchy) <= max_non_system_sections: - temp_sections = {key: hierarchy} - - ai_response = get_corresponding_sections( - list(temp_sections.values()), - list(filtered_target_hierarchy.values()), - ai_client, - repo_config['source_language'], - repo_config['target_language'], - max_tokens - ) - - if ai_response: - ai_sections = parse_ai_response(ai_response) - ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) - - if ai_matched: - target_line = list(ai_matched.keys())[0] - target_hierarchy_str = list(ai_matched.values())[0] - - # Format AI matched hierarchy with # prefix and remove top-level title - formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) - - return { - "target_line": target_line, - "target_hierarchy": formatted_hierarchy - } - - return None - -def format_target_hierarchy(target_hierarchy_str): - """Format target hierarchy to preserve complete hierarchy structure""" - if target_hierarchy_str.startswith('##') or target_hierarchy_str.startswith('#'): - # Already formatted, return as is - return target_hierarchy_str - elif ' > ' in target_hierarchy_str: - # Keep complete hierarchy structure, just ensure proper formatting - return target_hierarchy_str - else: - # Single level, add ## prefix for compatibility - return f"## {target_hierarchy_str}" - -def process_added_section(key, reference_hierarchy, target_hierarchy, target_lines, ai_client, repo_config, max_non_system_sections, max_tokens=20000): - """Process added sections to find insertion points""" - # For added sections, hierarchy points to the next section (where to insert before) - reference_leaf = reference_hierarchy.split(' > ')[-1] if ' > ' in reference_hierarchy else reference_hierarchy - - if (reference_hierarchy == "frontmatter" or - reference_leaf.startswith('# ') or - is_system_variable_or_config(reference_leaf)): - - # Use direct matching for the reference section - temp_reference = {f"ref_{key}": reference_hierarchy} - - if reference_hierarchy == "frontmatter": - return { - "target_line": "0", - "target_hierarchy": "frontmatter", - "insertion_type": "before_reference" - } - - elif reference_leaf.startswith('# '): - matched_dict, failed_matches, skipped_sections = find_toplevel_title_matches( - temp_reference, target_lines - ) - if matched_dict: - target_line = list(matched_dict.keys())[0] - formatted_hierarchy = f"# {matched_dict[target_line]}" - return { - "target_line": target_line, - "target_hierarchy": formatted_hierarchy, - "insertion_type": "before_reference" - } - - else: - # System variable/config - matched_dict, failed_matches, skipped_sections = find_direct_matches_for_special_files( - temp_reference, target_hierarchy, target_lines - ) - if matched_dict: - target_line = list(matched_dict.keys())[0] - target_hierarchy_str = list(matched_dict.values())[0] - formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) - return { - "target_line": target_line, - "target_hierarchy": formatted_hierarchy, - "insertion_type": "before_reference" - } - else: - # Use AI matching for the reference section - filtered_target_hierarchy = filter_non_system_sections(target_hierarchy) - - if len(filtered_target_hierarchy) <= max_non_system_sections: - temp_reference = {f"ref_{key}": reference_hierarchy} - - ai_response = get_corresponding_sections( - list(temp_reference.values()), - list(filtered_target_hierarchy.values()), - ai_client, - repo_config['source_language'], - repo_config['target_language'], - max_tokens - ) - - if ai_response: - ai_sections = 
parse_ai_response(ai_response) - ai_matched = find_matching_line_numbers(ai_sections, target_hierarchy) - - if ai_matched: - target_line = list(ai_matched.keys())[0] - target_hierarchy_str = list(ai_matched.values())[0] - formatted_hierarchy = format_target_hierarchy(target_hierarchy_str) - return { - "target_line": target_line, - "target_hierarchy": formatted_hierarchy, - "insertion_type": "before_reference" - } - - return None - -def extract_target_section_content(target_line_num, target_lines): - """Extract target section content from target_lines (includes sub-sections)""" - if target_line_num >= len(target_lines): - return "" - - start_line = target_line_num - 1 # Convert to 0-based index - - # Find the end of the section by looking for the next header - current_line = target_lines[start_line].strip() - if not current_line.startswith('#'): - return current_line - - current_level = len(current_line.split()[0]) # Count # characters - end_line = len(target_lines) # Default to end of file - - # For top-level headers (# level 1), stop at first sublevel (## level 2) - # For other headers, stop at same or higher level - if current_level == 1: - # Top-level header: stop at first ## (level 2) or higher - for i in range(start_line + 1, len(target_lines)): - line = target_lines[i].strip() - if line.startswith('#'): - line_level = len(line.split()[0]) - if line_level >= 2: # Stop at ## or higher level - end_line = i - break - else: - # Sub-level header: stop at same or higher level (traditional behavior) - for i in range(start_line + 1, len(target_lines)): - line = target_lines[i].strip() - if line.startswith('#'): - line_level = len(line.split()[0]) - if line_level <= current_level: - end_line = i - break - - # Extract content from start_line to end_line - section_content = '\n'.join(target_lines[start_line:end_line]) - return section_content.strip() - -def extract_section_direct_content(target_line_num, target_lines): - """Extract ONLY the direct content of a section (excluding sub-sections)""" - if target_line_num >= len(target_lines): - return "" - - start_line = target_line_num - 1 # Convert to 0-based index - - # Find the end of the section by looking for the next header - current_line = target_lines[start_line].strip() - if not current_line.startswith('#'): - return current_line - - current_level = len(current_line.split()[0]) # Count # characters - end_line = len(target_lines) # Default to end of file - - # Only extract until the first header (any level) - # This means we stop at ANY header - whether it's a sub-section OR same/higher level - for i in range(start_line + 1, len(target_lines)): - line = target_lines[i].strip() - if line.startswith('#'): - # Stop at ANY header to get only direct content - end_line = i - break - - # Extract content from start_line to end_line - section_content = '\n'.join(target_lines[start_line:end_line]) - return section_content.strip() - -def extract_frontmatter_content(target_lines): - """Extract frontmatter content from beginning to first header""" - if not target_lines: - return "" - - frontmatter_lines = [] - for i, line in enumerate(target_lines): - line_stripped = line.strip() - # Stop when we hit the first top-level header - if line_stripped.startswith('# '): - break - frontmatter_lines.append(line.rstrip()) - - return '\n'.join(frontmatter_lines) - -def find_section_end_line(section_start_line, target_hierarchy, target_lines): - """Find the end line of a section to determine insertion point (from auto-sync-pr-changes.py)""" - - # Get the current 
section's level - current_section_line = target_lines[section_start_line - 1].strip() - current_level = len(current_section_line.split()[0]) if current_section_line.startswith('#') else 5 - - # Find the next section at the same level or higher (lower number) - next_section_line = None - for line_num_str in sorted(target_hierarchy.keys(), key=int): - line_num = int(line_num_str) - if line_num > section_start_line: - # Check the level of this section - section_line = target_lines[line_num - 1].strip() - if section_line.startswith('#'): - section_level = len(section_line.split()[0]) - if section_level <= current_level: - next_section_line = line_num - break - - if next_section_line: - # Insert before the next same-level or higher-level section - return next_section_line - 1 - else: - # This is the last section at this level, insert at the end of the file - return len(target_lines) diff --git a/scripts/translate_doc_pr/toc_processor.py b/scripts/translate_doc_pr/toc_processor.py deleted file mode 100644 index 71cce4a17f8bb..0000000000000 --- a/scripts/translate_doc_pr/toc_processor.py +++ /dev/null @@ -1,434 +0,0 @@ -""" -TOC Processor Module -Handles special processing logic for TOC.md files -""" - -import os -import re -import json -import threading -from github import Github -from openai import OpenAI - -# Thread-safe printing -print_lock = threading.Lock() - -def thread_safe_print(*args, **kwargs): - with print_lock: - print(*args, **kwargs) - -def extract_toc_link_from_line(line): - """Extract the link part (including parentheses) from a TOC line""" - # Pattern to match [text](link) format - pattern = r'\[([^\]]+)\]\(([^)]+)\)' - match = re.search(pattern, line) - if match: - return f"({match.group(2)})" # Return (link) including parentheses - return None - -def is_toc_translation_needed(line): - """Check if a TOC line needs translation based on content in square brackets""" - # Extract content within square brackets [content] - pattern = r'\[([^\]]+)\]' - match = re.search(pattern, line) - if match: - content = match.group(1) - # Skip translation if content has no Chinese and no spaces - has_chinese = bool(re.search(r'[\u4e00-\u9fff]', content)) - has_spaces = ' ' in content - - # Need translation if has Chinese OR has spaces - # Skip translation only if it's alphanumeric/technical term without spaces - return has_chinese or has_spaces - return True # Default to translate if can't parse - -def find_best_toc_match(target_link, target_lines, source_line_num): - """Find the best matching line in target TOC based on link content and line proximity""" - matches = [] - - for i, line in enumerate(target_lines): - line_link = extract_toc_link_from_line(line.strip()) - if line_link and line_link == target_link: - matches.append({ - 'line_num': i + 1, # Convert to 1-based - 'line': line.strip(), - 'distance': abs((i + 1) - source_line_num) - }) - - if not matches: - return None - - # Sort by distance to source line number, choose the closest one - matches.sort(key=lambda x: x['distance']) - return matches[0] - -def group_consecutive_lines(lines): - """Group consecutive lines together""" - if not lines: - return [] - - # Sort lines by line number - sorted_lines = sorted(lines, key=lambda x: x['line_number']) - - groups = [] - current_group = [sorted_lines[0]] - - for i in range(1, len(sorted_lines)): - current_line = sorted_lines[i] - prev_line = sorted_lines[i-1] - - # Consider lines consecutive if they are within 2 lines of each other - if current_line['line_number'] - prev_line['line_number'] <= 
2: - current_group.append(current_line) - else: - groups.append(current_group) - current_group = [current_line] - - groups.append(current_group) - return groups - -def process_toc_operations(file_path, operations, source_lines, target_lines, target_local_path): - """Process TOC.md file operations with special logic""" - thread_safe_print(f"\nšŸ“‹ Processing TOC.md with special logic...") - - results = { - 'added': [], - 'modified': [], - 'deleted': [] - } - - # Process deleted lines first - for deleted_line in operations['deleted_lines']: - if not deleted_line['is_header']: # TOC lines are not headers - deleted_content = deleted_line['content'] - deleted_link = extract_toc_link_from_line(deleted_content) - - if deleted_link: - thread_safe_print(f" šŸ—‘ļø Processing deleted TOC line with link: {deleted_link}") - - # Find matching line in target - match = find_best_toc_match(deleted_link, target_lines, deleted_line['line_number']) - if match: - thread_safe_print(f" āœ… Found target line {match['line_num']}: {match['line']}") - results['deleted'].append({ - 'source_line': deleted_line['line_number'], - 'target_line': match['line_num'], - 'content': deleted_content - }) - else: - thread_safe_print(f" āŒ No matching line found for {deleted_link}") - - # Process added lines - added_groups = group_consecutive_lines(operations['added_lines']) - for group in added_groups: - if group: # Skip empty groups - first_added_line = group[0] - thread_safe_print(f" āž• Processing added TOC group starting at line {first_added_line['line_number']}") - - # Find the previous line in source to determine insertion point - previous_line_num = first_added_line['line_number'] - 1 - if previous_line_num > 0 and previous_line_num <= len(source_lines): - previous_line_content = source_lines[previous_line_num - 1] - previous_link = extract_toc_link_from_line(previous_line_content) - - if previous_link: - thread_safe_print(f" šŸ“ Previous line link: {previous_link}") - - # Find matching previous line in target - match = find_best_toc_match(previous_link, target_lines, previous_line_num) - if match: - thread_safe_print(f" āœ… Found target insertion point after line {match['line_num']}") - - # Process each line in the group - for added_line in group: - added_content = added_line['content'] - if is_toc_translation_needed(added_content): - results['added'].append({ - 'source_line': added_line['line_number'], - 'target_insertion_after': match['line_num'], - 'content': added_content, - 'needs_translation': True - }) - thread_safe_print(f" šŸ“ Added for translation: {added_content.strip()}") - else: - results['added'].append({ - 'source_line': added_line['line_number'], - 'target_insertion_after': match['line_num'], - 'content': added_content, - 'needs_translation': False - }) - thread_safe_print(f" ā­ļø Added without translation: {added_content.strip()}") - else: - thread_safe_print(f" āŒ No target insertion point found for {previous_link}") - else: - thread_safe_print(f" āŒ No link found in previous line: {previous_line_content.strip()}") - - # Process modified lines - modified_groups = group_consecutive_lines(operations['modified_lines']) - for group in modified_groups: - if group: # Skip empty groups - first_modified_line = group[0] - thread_safe_print(f" āœļø Processing modified TOC group starting at line {first_modified_line['line_number']}") - - # Find the previous line in source to determine target location - previous_line_num = first_modified_line['line_number'] - 1 - if previous_line_num > 0 and 
previous_line_num <= len(source_lines): - previous_line_content = source_lines[previous_line_num - 1] - previous_link = extract_toc_link_from_line(previous_line_content) - - if previous_link: - thread_safe_print(f" šŸ“ Previous line link: {previous_link}") - - # Find matching previous line in target - match = find_best_toc_match(previous_link, target_lines, previous_line_num) - if match: - # Process each line in the group - for modified_line in group: - modified_content = modified_line['content'] - if is_toc_translation_needed(modified_content): - results['modified'].append({ - 'source_line': modified_line['line_number'], - 'target_line_context': match['line_num'], - 'content': modified_content, - 'needs_translation': True - }) - thread_safe_print(f" šŸ“ Modified for translation: {modified_content.strip()}") - else: - results['modified'].append({ - 'source_line': modified_line['line_number'], - 'target_line_context': match['line_num'], - 'content': modified_content, - 'needs_translation': False - }) - thread_safe_print(f" ā­ļø Modified without translation: {modified_content.strip()}") - else: - thread_safe_print(f" āŒ No target context found for {previous_link}") - else: - thread_safe_print(f" āŒ No link found in previous line: {previous_line_content.strip()}") - - return results - -def find_toc_modification_line(mod_op, target_lines): - """Find the actual line number to modify in target TOC based on context""" - # This function helps find the exact line to modify in target TOC - # based on the modification operation context - - target_line_context = mod_op.get('target_line_context', 0) - - # Look for the line after the context line that should be modified - # This is a simplified approach - in practice, you might need more sophisticated logic - - if target_line_context > 0 and target_line_context < len(target_lines): - # Check if the next line is the one to modify - return target_line_context + 1 - - return target_line_context - -def translate_toc_lines(toc_operations, ai_client, repo_config): - """Translate multiple TOC lines at once""" - lines_to_translate = [] - - # Collect all lines that need translation - for op in toc_operations: - if op.get('needs_translation', False): - lines_to_translate.append({ - 'operation_type': 'added' if 'target_insertion_after' in op else 'modified', - 'content': op['content'], - 'source_line': op['source_line'] - }) - - if not lines_to_translate: - thread_safe_print(f" ā­ļø No TOC lines need translation") - return {} - - thread_safe_print(f" šŸ¤– Translating {len(lines_to_translate)} TOC lines...") - - # Prepare content for AI translation - content_dict = {} - for i, line_info in enumerate(lines_to_translate): - content_dict[f"line_{i}"] = line_info['content'] - - source_lang = repo_config['source_language'] - target_lang = repo_config['target_language'] - - prompt = f"""You are a professional translator. Please translate the following TOC (Table of Contents) lines from {source_lang} to {target_lang}. - -IMPORTANT INSTRUCTIONS: -1. Preserve ALL formatting, indentation, spaces, and dashes exactly as they appear -2. Only translate the text content within square brackets [text] -3. Keep all markdown links, parentheses, and special characters unchanged -4. Maintain the exact same indentation and spacing structure - -Input lines to translate: -{json.dumps(content_dict, indent=2, ensure_ascii=False)} - -Please return the translated lines in the same JSON format, preserving all formatting and only translating the text within square brackets. 
- -Return format: -{{ - "line_0": "translated line with preserved formatting", - "line_1": "translated line with preserved formatting" -}}""" - - #print(prompt) #DEBUG - # Add token estimation - try: - from main import print_token_estimation - print_token_estimation(prompt, "TOC translation") - except ImportError: - # Fallback if import fails - use tiktoken - try: - import tiktoken - enc = tiktoken.get_encoding("cl100k_base") - tokens = enc.encode(prompt) - actual_tokens = len(tokens) - char_count = len(prompt) - print(f" šŸ’° TOC translation") - print(f" šŸ“ Input: {char_count:,} characters") - print(f" šŸ”¢ Actual tokens: {actual_tokens:,} (using tiktoken cl100k_base)") - except Exception: - # Final fallback to character approximation - estimated_tokens = len(prompt) // 4 - char_count = len(prompt) - print(f" šŸ’° TOC translation") - print(f" šŸ“ Input: {char_count:,} characters") - print(f" šŸ”¢ Estimated tokens: ~{estimated_tokens:,} (fallback: 4 chars/token approximation)") - - try: - ai_response = ai_client.chat_completion( - messages=[{"role": "user", "content": prompt}], - temperature=0.1 - ) - #print(ai_response) #DEBUG - thread_safe_print(f" šŸ“ AI translation response received") - - # Parse AI response - try: - json_start = ai_response.find('{') - json_end = ai_response.rfind('}') + 1 - - if json_start != -1 and json_end > json_start: - json_str = ai_response[json_start:json_end] - translated_lines = json.loads(json_str) - - # Map back to original operations - translation_mapping = {} - for i, line_info in enumerate(lines_to_translate): - key = f"line_{i}" - if key in translated_lines: - translation_mapping[line_info['source_line']] = translated_lines[key] - - thread_safe_print(f" āœ… Successfully translated {len(translation_mapping)} TOC lines") - return translation_mapping - - except json.JSONDecodeError as e: - thread_safe_print(f" āŒ Failed to parse AI translation response: {e}") - return {} - - except Exception as e: - thread_safe_print(f" āŒ AI translation failed: {e}") - return {} - -def process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config): - """Process a single TOC.md file with special logic""" - thread_safe_print(f"\nšŸ“‹ Processing TOC file: {file_path}") - - try: - target_local_path = repo_config['target_local_path'] - target_file_path = os.path.join(target_local_path, file_path) - - # Read current target file - with open(target_file_path, 'r', encoding='utf-8') as f: - target_content = f.read() - - target_lines = target_content.split('\n') - operations = toc_data['operations'] - - # Separate operations by type - deleted_ops = [op for op in operations if 'target_line' in op] - added_ops = [op for op in operations if 'target_insertion_after' in op] - modified_ops = [op for op in operations if 'target_line_context' in op] - - thread_safe_print(f" šŸ“Š TOC operations: {len(deleted_ops)} deleted, {len(added_ops)} added, {len(modified_ops)} modified") - - # Process deletions first (work backwards to maintain line numbers) - if deleted_ops: - thread_safe_print(f" šŸ—‘ļø Processing {len(deleted_ops)} deletions...") - deleted_ops.sort(key=lambda x: x['target_line'], reverse=True) - - for del_op in deleted_ops: - target_line_num = del_op['target_line'] - 1 # Convert to 0-based - if 0 <= target_line_num < len(target_lines): - thread_safe_print(f" āŒ Deleting line {del_op['target_line']}: {target_lines[target_line_num].strip()}") - del target_lines[target_line_num] - - # Process modifications - if modified_ops: - thread_safe_print(f" āœļø 
Processing {len(modified_ops)} modifications...") - - # Get translations for operations that need them - translations = translate_toc_lines(modified_ops, ai_client, repo_config) - - for mod_op in modified_ops: - target_line_num = find_toc_modification_line(mod_op, target_lines) - 1 # Convert to 0-based - - if 0 <= target_line_num < len(target_lines): - if mod_op.get('needs_translation', False) and mod_op['source_line'] in translations: - new_content = translations[mod_op['source_line']] - thread_safe_print(f" āœļø Modifying line {target_line_num + 1} with translation") - else: - new_content = mod_op['content'] - thread_safe_print(f" āœļø Modifying line {target_line_num + 1} without translation") - - target_lines[target_line_num] = new_content - - # Process additions last - if added_ops: - thread_safe_print(f" āž• Processing {len(added_ops)} additions...") - - # Get translations for operations that need them - translations = translate_toc_lines(added_ops, ai_client, repo_config) - - # Group additions by insertion point and process in reverse order - added_ops.sort(key=lambda x: x['target_insertion_after'], reverse=True) - - for add_op in added_ops: - insertion_after = add_op['target_insertion_after'] - - if add_op.get('needs_translation', False) and add_op['source_line'] in translations: - new_content = translations[add_op['source_line']] - thread_safe_print(f" āž• Inserting after line {insertion_after} with translation") - else: - new_content = add_op['content'] - thread_safe_print(f" āž• Inserting after line {insertion_after} without translation") - - # Insert the new line - if insertion_after < len(target_lines): - target_lines.insert(insertion_after, new_content) - else: - target_lines.append(new_content) - - # Write updated content back to file - updated_content = '\n'.join(target_lines) - with open(target_file_path, 'w', encoding='utf-8') as f: - f.write(updated_content) - - thread_safe_print(f" āœ… TOC file updated: {file_path}") - - except Exception as e: - thread_safe_print(f" āŒ Error processing TOC file {file_path}: {e}") - -def process_toc_files(toc_files, pr_url, github_client, ai_client, repo_config): - """Process all TOC files""" - if not toc_files: - return - - thread_safe_print(f"\nšŸ“‹ Processing {len(toc_files)} TOC files...") - - for file_path, toc_data in toc_files.items(): - if toc_data['type'] == 'toc': - process_toc_file(file_path, toc_data, pr_url, github_client, ai_client, repo_config) - else: - thread_safe_print(f" āš ļø Unknown TOC data type: {toc_data['type']} for {file_path}") - - thread_safe_print(f" āœ… All TOC files processed") From bed0d76f4ab497d95643502c1e68820d609555e9 Mon Sep 17 00:00:00 2001 From: qiancai Date: Tue, 21 Oct 2025 18:49:07 +0800 Subject: [PATCH 14/18] Delete sync-en-cloud-toc-changes-to-zh.py --- .../sync-en-cloud-toc-changes-to-zh.py | 616 ------------------ 1 file changed, 616 deletions(-) delete mode 100644 .github/workflows/sync-en-cloud-toc-changes-to-zh.py diff --git a/.github/workflows/sync-en-cloud-toc-changes-to-zh.py b/.github/workflows/sync-en-cloud-toc-changes-to-zh.py deleted file mode 100644 index da2c16b10da20..0000000000000 --- a/.github/workflows/sync-en-cloud-toc-changes-to-zh.py +++ /dev/null @@ -1,616 +0,0 @@ -# This script is used to sync the changes from the English TOC files to the Chinese TOC files. Detailed steps are as follows: -# 1. 
The script automatically gets the latest commit of the English TOC file from GitHub and the earlier commit of the English TOC file from the Chinese TOC file in the same repository. -# 2. It compares two English commits and performs the following operations: -# - If the commit numbers are the same, skip the update for that TOC file. -# - If the commit numbers are different, update the Chinese TOC with the following operations: -# a. Updates the Chinese TOC according to the English diff. -# b. Generates bilingual terms based on the old version of the Chinese and English TOC files. -# c. Update the modified English lines in the Chinese TOC with Chinese based on the bilingual terms. -# d. Translate the remaining English in the Chinese TOC using AI. - -import re -import os -import sys -import json -import logging -from urllib.request import urlopen, Request -from urllib.error import URLError, HTTPError -from google import genai - -REPO_OWNER = "qiancai" -REPO_NAME = "docs" -EN_BRANCH = "release-8.5" -ZH_BRANCH = "i18n-zh-release-8.5" -TOC_FILE_NAMES = ["TOC-tidb-cloud-starter.md", "TOC-tidb-cloud-essential.md", "TOC-tidb-cloud.md"] -TOC_HEADER_LINE_COUNT = 3 # The Starting line to create bilingual terms -TEMP_TOC_FILENAME = "en_cloud_toc.md" # The filename of the temporary English TOC content - - -# ========== Logging Configuration ========== -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger(__name__) - -# ========== AI Configuration ========== -MODEL_NAME = "gemini-2.0-flash" -genai_token = os.getenv("GEMINI_API_TOKEN") -if not genai_token: - logger.error("GEMINI_API_TOKEN environment variable must be set") - sys.exit(1) - -client = genai.Client(api_key=genai_token) - -def read_file_from_repo(file_path): - """Read a file from the current repository""" - try: - with open(file_path, 'r', encoding='utf-8') as f: - return f.read() - except IOError as e: - logger.error(f"Error reading file {file_path}: {e}") - return None - -def write_file_to_repo(file_path, content): - """Write content to a file in the current repository""" - try: - with open(file_path, 'w', encoding='utf-8') as f: - f.write(content) - return True - except IOError as e: - logger.error(f"Error writing file {file_path}: {e}") - return False - -def extract_commit_from_target_file(target_file): - """Extract the EN commit SHA from the target TOC file comment""" - try: - content = read_file_from_repo(target_file) - if not content: - return None - - lines = content.split('\n') - for i, line in enumerate(lines): - if i > 10: # Only check first 10 lines - break - - # Look for the pattern: - if "EN commit:" in line: - # Extract commit SHA using regex - match = re.search(r'EN commit:\s*([a-f0-9]{40})', line) - if match: - commit_sha = match.group(1) - logger.info(f"Found earlier EN commit in target file: {commit_sha}") - return commit_sha - - logger.error("No EN commit comment found in target file") - return None - - except Exception as e: - logger.error(f"Error reading target file for commit extraction: {e}") - return None - -def get_latest_commit_sha(repo_owner, repo_name, branch, toc_file_name): - """Get the latest commit SHA for a specific file on GitHub""" - try: - # Use GitHub API to get commits for the specific file - url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/commits" - params = f"?sha={branch}&path={toc_file_name}&per_page=1" - full_url = url + params - headers = { - "User-Agent": "tidb-docs-sync/1.0", - "Accept": 
"application/vnd.github.v3+json", - } - gh_token = os.getenv("GITHUB_TOKEN") - if gh_token: - headers["Authorization"] = f"Bearer {gh_token}" - req = Request(full_url, headers=headers) - - with urlopen(req) as resp: - data = json.loads(resp.read().decode('utf-8')) - - if data and len(data) > 0: - latest_commit = data[0]['sha'] - logger.info(f"Latest commit: {latest_commit}") - return latest_commit - else: - logger.warning("No commits found for the specified file") - return None - - except (URLError, HTTPError, json.JSONDecodeError) as e: - logger.error(f"Error fetching latest commit: {e}") - return None - -def get_github_compare_diff(base_commit, head_commit): - """Fetch unified diff from GitHub compare endpoint (.diff) for the repo {REPO_OWNER}/{REPO_NAME}""" - try: - url = f"https://github.com/{REPO_OWNER}/{REPO_NAME}/compare/{base_commit}...{head_commit}.diff" - logger.info(f"Fetching compare diff from: {url}") - headers = { - "User-Agent": "tidb-docs-sync/1.0", - "Accept": "application/vnd.github.v3.diff", - } - gh_token = os.getenv("GITHUB_TOKEN") - if gh_token: - headers["Authorization"] = f"Bearer {gh_token}" - req = Request(url, headers=headers) - with urlopen(req, timeout=20) as resp: - content_bytes = resp.read() - # GitHub serves UTF-8 - return content_bytes.decode("utf-8", errors="replace") - except (URLError, HTTPError) as e: - logger.error(f"Error fetching GitHub compare diff: {e}") - return None - -def parse_github_diff_for_file(diff_text, target_rel_path): - """Parse the multi-file unified diff and return hunks for the specified file. - - Returns a list of hunks: {old_start, old_count, new_start, new_count, lines} - where lines are the raw hunk lines starting with ' ', '+', or '-'. - """ - if not diff_text: - return [] - - lines = diff_text.splitlines() - hunks = [] - collecting_for_file = False - current_hunk = None - current_file_path = None - - # Normalize target path to compare by suffix - target_suffix = target_rel_path.strip("/") - - for line in lines: - if line.startswith("diff --git "): - # finalize any open hunk - if current_hunk is not None and collecting_for_file: - hunks.append(current_hunk) - current_hunk = None - collecting_for_file = False - current_file_path = None - continue - - if line.startswith("+++ "): - path = line[4:].strip() - # Expected formats: 'b/path/to/file' or '/dev/null' - if path == "/dev/null": - current_file_path = None - collecting_for_file = False - else: - # strip the leading 'a/' or 'b/' - if path.startswith("a/") or path.startswith("b/"): - path_clean = path[2:] - else: - path_clean = path - current_file_path = path_clean - collecting_for_file = path_clean.endswith(target_suffix) - continue - - if not collecting_for_file: - continue - - # Within the target file section, parse hunks - if line.startswith("@@ "): - # finalize previous hunk - if current_hunk is not None: - hunks.append(current_hunk) - - m = re.match(r"@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@", line) - if not m: - continue - old_start = int(m.group(1)) - old_count = int(m.group(2)) if m.group(2) else 1 - new_start = int(m.group(3)) - new_count = int(m.group(4)) if m.group(4) else 1 - - current_hunk = { - "old_start": old_start, - "old_count": old_count, - "new_start": new_start, - "new_count": new_count, - "lines": [], - } - continue - - # Collect hunk body lines - if current_hunk is not None and (line.startswith(" ") or line.startswith("+") or line.startswith("-")): - current_hunk["lines"].append(line) - - # finalize last hunk if any - if current_hunk is not None and 
collecting_for_file: - hunks.append(current_hunk) - - return hunks - -def apply_hunks_by_line_numbers(target_file, hunks, earlier_commit, latest_commit): - """Apply unified-diff hunks to target file strictly by old line numbers. - - Only change the lines marked as deletions ('-') and additions ('+'). - Context lines (' ') are used for positioning but are left untouched in the target. - """ - try: - content = read_file_from_repo(target_file) - if not content: - return False, {} - lines = content.splitlines() - - modified = list(lines) - line_offset_delta = 0 - modified_lines = {} - - for hunk_index, hunk in enumerate(hunks): - cursor = hunk["old_start"] - 1 + line_offset_delta - - if cursor < 0: - print(f"Hunk {hunk_index+1}: start cursor {cursor} adjusted to 0") - cursor = 0 - if cursor > len(modified): - print(f"Hunk {hunk_index+1}: start cursor {cursor} beyond EOF {len(modified)}; clamping to EOF") - cursor = len(modified) - - #print(f"Applying hunk {hunk_index+1} at approx line {cursor+1}") - - for raw in hunk["lines"]: - if not raw: - continue - marker = raw[0] - text = raw[1:] - - if marker == ' ': # context: advance cursor, keep original content - cursor += 1 - elif marker == '-': # deletion: remove line at cursor - if cursor < len(modified): - deleted = modified.pop(cursor) - line_offset_delta -= 1 - else: - print(f"Hunk {hunk_index+1}: deletion cursor {cursor} at/after EOF; skipping deletion") - elif marker == '+': # addition: insert line at cursor - modified.insert(cursor, text) - modified_lines[cursor+1] = text - #print(f"Inserted line at line {cursor+1}: {text}") - cursor += 1 - line_offset_delta += 1 - else: - # Unknown marker; ignore - pass - - # replace the earlier commit with the latest commit - for i, line in enumerate(modified): - if "EN commit:" in line and earlier_commit in line: - modified[i] = line.replace(earlier_commit, latest_commit) - break - modified_content = "\n".join(modified) + "\n" - - success = write_file_to_repo(target_file, modified_content) - if not success: - return False, {} - - logger.info(f"Successfully applied {len(hunks)} hunks to {target_file}") - return True, modified_lines - except Exception as e: - logger.error(f"Error applying hunks: {e}") - return False, {} - -def sync_toc_files_using_github_compare(commit1, commit2, source_file, target_file): - """Sync by fetching compare diff from GitHub and applying hunks by line numbers.""" - logger.info(f"Fetching GitHub compare diff between {commit1} and {commit2}...") - diff_text = get_github_compare_diff(commit1, commit2) - if not diff_text: - logger.warning("No diff content retrieved from GitHub") - return False, {} - - logger.info("Parsing diff for target file hunks...") - hunks = parse_github_diff_for_file(diff_text, source_file) - if not hunks: - logger.info(f"No hunks found for file: {source_file}") - return False, {} - - logger.info(f"Found {len(hunks)} hunks for {source_file}. 
Applying to {target_file} by line numbers...") - sync_status, modified_lines = apply_hunks_by_line_numbers(target_file, hunks, commit1, commit2) - return sync_status, modified_lines - -def create_bilingual_comparison(target_toc_file): - """Create bilingual comparison list from TOC files""" - bilingual_list = [] - - # Read both files - zh_content = read_file_from_repo(target_toc_file) - en_content = read_file_from_repo(TEMP_TOC_FILENAME) - - if not zh_content or not en_content: - return [] - - zh_lines = zh_content.splitlines(True) - en_lines = en_content.splitlines(True) - - # Process from line 4 onwards (index 3) - start_line = TOC_HEADER_LINE_COUNT - - # Ensure both files have the same number of lines - min_lines = min(len(zh_lines), len(en_lines)) - - logger.info(f"Processing {min_lines - start_line} lines starting from line {start_line + 1}") - - for i in range(start_line, min_lines): - zh_line = zh_lines[i].rstrip('\n\r') - en_line = en_lines[i].rstrip('\n\r') - - # Skip empty lines - if not zh_line.strip() and not en_line.strip(): - continue - - # Clean the lines consistently using the same pattern as replace function - zh_toc_pattern = re.match(r'^\s*-\s', zh_line) - en_toc_pattern = re.match(r'^\s*-\s', en_line) - - zh_cleaned = zh_line[zh_toc_pattern.end():].rstrip() if zh_toc_pattern else zh_line.rstrip() - en_cleaned = en_line[en_toc_pattern.end():].rstrip() if en_toc_pattern else en_line.rstrip() - - # Only add non-empty cleaned lines - if zh_cleaned.strip() and en_cleaned.strip(): - bilingual_list.append([zh_cleaned, en_cleaned, i + 1]) - logger.debug(f"Bilingual items: Line {i + 1}: '{en_cleaned}' -> '{zh_cleaned}'") - - logger.info(f"Created bilingual list with {len(bilingual_list)} entries") - return bilingual_list - -def replace_content_with_translation(bilingual_list, modified_lines, target_toc_file): - """Replace English content with existing Chinese translations, return unmatched lines""" - # Read the target file - content = read_file_from_repo(target_toc_file) - if not content: - return modified_lines - target_lines = content.splitlines(True) - - # Optimize lookup by creating a dictionary for O(1) lookups - bilingual_map = {en_text: zh_text for zh_text, en_text, _ in bilingual_list} - - replaced_count = 0 - matched_lines = set() - - logger.info(f"Found {len(modified_lines)} modified lines to process.") - logger.debug(f"Modified lines: {list(modified_lines.keys())}") - - # Process each modified line - for line_number in modified_lines.keys(): - line_index = line_number - 1 # Convert to 0-based - - if 0 <= line_index < len(target_lines): - line_content = target_lines[line_index].rstrip('\n\r') - - # Clean the line content for matching - toc_pattern = re.match(r'^\s*-\s', line_content) - if toc_pattern: - prefix = toc_pattern.group(0) - cleaned_content = line_content[toc_pattern.end():].rstrip() - else: - prefix = '' - cleaned_content = line_content.rstrip() - - # Try to find exact match in bilingual map (O(1) lookup) - if cleaned_content in bilingual_map: - # Found match! 
Replace with Chinese translation - zh_text = bilingual_map[cleaned_content] - new_line = prefix + zh_text - target_lines[line_index] = new_line + '\n' - replaced_count += 1 - matched_lines.add(line_number) - logger.debug(f"Matched line {line_number}: '{cleaned_content}' -> '{zh_text}'") - - # Write back the updated content - if replaced_count > 0: - updated_content = ''.join(target_lines) - write_file_to_repo(target_toc_file, updated_content) - logger.info(f"Applied {replaced_count} existing translations.") - - # Return unmatched lines for AI translation - unmatched_lines = {k: v for k, v in modified_lines.items() if k not in matched_lines} - logger.info(f"Lines needing AI translation: {len(unmatched_lines)}") - - return unmatched_lines - -def translate_content(modified_lines, target_file): - """Translate English content to Chinese using Gemini API with JSON format""" - if not modified_lines: - logger.info("No content to translate.") - return {} - - logger.info(f"Translating {len(modified_lines)} lines using Gemini API...") - - # Read the target file to get original formatted lines - content = read_file_from_repo(target_file) - if not content: - return {} - target_lines = content.splitlines(True) - - # Create JSON input with original formatted lines - translation_json = {} - for line_num in modified_lines.keys(): - line_index = line_num - 1 - if 0 <= line_index < len(target_lines): - original_line = target_lines[line_index] - translation_json[str(line_num)] = original_line - - if not translation_json: - logger.warning("No valid content to translate after processing.") - return {} - - # Create JSON string for the prompt - json_input = json.dumps(translation_json, ensure_ascii=False, indent=2) - logger.debug(f"Translation JSON input: {json_input}") - - # Create translation prompt - prompt = f"""Please translate the following TOC (Table of Contents) entries from English to Chinese. -These are navigation items for TiDB Cloud documentation with original formatting. - -IMPORTANT: -1. Return the result in the EXACT SAME JSON format with the same keys (line numbers) -2. Keep ALL original formatting: indentation, spaces, dashes, brackets, etc. -3. Only translate the English text content to Chinese, preserve everything else exactly -4. Maintain technical terms appropriately (like "TiDB Cloud", "HTAP", "CLI", etc.) 
- -Input JSON: -{json_input} - -Return only the JSON with Chinese translations that preserve all original formatting.""" - - try: - logger.info("Sending translation request to Gemini API...") - response = client.models.generate_content( - model=MODEL_NAME, contents=prompt - ) - - if response.text: - # Extract JSON from response - response_text = response.text.strip() - logger.debug(f"Translation JSON response: {response_text}") - - # Try to find and parse JSON from the response - try: - # Use regex to find JSON block more robustly - json_text = response_text - match = re.search(r"```json\s*([\s\S]*?)\s*```", response_text) - if match: - json_text = match.group(1).strip() - elif '```' in response_text: - start = response_text.find('```') + 3 - end = response_text.find('```', start) - json_text = response_text[start:end].strip() - - # Parse the JSON - translated_json = json.loads(json_text) - - # Convert back to integer keys and return - zh_modified_lines = {} - for line_num_str, translated_text in translated_json.items(): - line_num = int(line_num_str) - zh_modified_lines[line_num] = translated_text - original_text = modified_lines.get(line_num, "") - logger.debug(f"Line {line_num}: '{original_text}' -> '{translated_text}'") - - logger.info(f"Translation completed. Processed {len(zh_modified_lines)} lines.") - return zh_modified_lines - - except (json.JSONDecodeError, ValueError) as e: - logger.error(f"Error parsing JSON response: {e}") - logger.error(f"Response was: {response_text}") - # Fallback: return empty dict to prevent writing untranslated content - return {} - else: - logger.error("Empty response from Gemini API") - return {} - - except Exception as e: - logger.error(f"Error during translation: {e}") - # Fallback: return empty dict to prevent writing untranslated content - return {} - -def update_toc_file(zh_modified_lines, target_file): - """Apply translated content to specific lines in the target TOC file""" - if not zh_modified_lines: - logger.info("No translated content to apply.") - return - - logger.info(f"Applying {len(zh_modified_lines)} translated lines to {target_file}...") - - try: - # Read the target file - content = read_file_from_repo(target_file) - if not content: - logger.error(f"Could not read target file {target_file}") - return - target_lines = content.splitlines(True) - - # Apply translations to specific lines - applied_count = 0 - for line_num, translated_content in zh_modified_lines.items(): - # Convert to 0-based index - line_index = line_num - 1 - - if 0 <= line_index < len(target_lines): - # AI has already provided the complete formatted line, use it directly - target_lines[line_index] = translated_content - applied_count += 1 - else: - logger.warning(f"Line number {line_num} is out of range (file has {len(target_lines)} lines)") - - # Write the updated content back to the file - updated_content = ''.join(target_lines) - write_file_to_repo(target_file, updated_content) - - logger.info(f"Successfully applied {applied_count} translations to {target_file}") - - except Exception as e: - logger.error(f"Error updating TOC file: {e}") - raise - -def cleanup_temp_files(): - """Clean up temporary files""" - try: - if os.path.exists(TEMP_TOC_FILENAME): - os.remove(TEMP_TOC_FILENAME) - logger.info(f"Cleaned up temporary file: {TEMP_TOC_FILENAME}") - except Exception as e: - logger.warning(f"Could not clean up temporary files: {e}") - -def process_toc_file(toc_file_name): - """Process a single TOC file for synchronization""" - target_toc_file = toc_file_name - - 
logger.info("-" * 50) - logger.info(f"Processing {toc_file_name}...") - - logger.info("Extracting EN commit SHA from target file...") - earlier_commit = extract_commit_from_target_file(target_toc_file) - - logger.info("Fetching latest commit SHA for TOC file...") - latest_commit = get_latest_commit_sha(REPO_OWNER, REPO_NAME, EN_BRANCH, toc_file_name) - - # If earlier_commit is different from latest_commit, sync the TOC file. - if earlier_commit and latest_commit and earlier_commit != latest_commit: - # Download the EN TOC content from the earlier commit for comparison - en_toc_path = f"https://raw.githubusercontent.com/{REPO_OWNER}/{REPO_NAME}/{earlier_commit}/{toc_file_name}" - logger.info(f"Downloading EN TOC content from: {en_toc_path}") - en_toc_content = urlopen(en_toc_path).read().decode("utf-8") - - # Write en_toc_content to a file for bilingual comparison - write_file_to_repo(TEMP_TOC_FILENAME, en_toc_content) - - logger.info("Creating bilingual comparison...") - bilingual_list = create_bilingual_comparison(target_toc_file) - - logger.info("Running TOC sync using GitHub compare diff...") - sync_status, modified_lines = sync_toc_files_using_github_compare( - earlier_commit, - latest_commit, - toc_file_name, - target_toc_file, - ) - - if sync_status: - logger.info("TOC file sync completed successfully!") - - # Match with existing bilingual translations - unmatched_lines = replace_content_with_translation(bilingual_list, modified_lines, target_toc_file) - - # Use AI to translate remaining unmatched lines - if unmatched_lines: - logger.info(f"Using AI to translate {len(unmatched_lines)} unmatched lines...") - zh_modified_lines = translate_content(unmatched_lines, target_toc_file) - update_toc_file(zh_modified_lines, target_toc_file) - logger.info("AI translations have been applied successfully!") - else: - logger.info("All lines were matched with existing translations. No AI translation needed.") - else: - logger.error("TOC file sync failed!") - else: - if earlier_commit == latest_commit: - logger.info(f"Earlier commit is the same as latest commit. No sync needed for {toc_file_name}.") - else: - logger.warning(f"Skipping sync for {toc_file_name} due to missing commit information. 
Check logs for errors.") - -if __name__ == "__main__": - logger.info("Starting TOC synchronization process...") - - for toc_file_name in TOC_FILE_NAMES: - process_toc_file(toc_file_name) - - # Clean up temporary files - cleanup_temp_files() - logger.info("Script execution completed.") From bcea1e93423d68649271e29b86306e4cadbcede9 Mon Sep 17 00:00:00 2001 From: qiancai Date: Tue, 21 Oct 2025 19:01:58 +0800 Subject: [PATCH 15/18] change the path to the scripts --- .github/workflows/sync-docs-cn-to-en.yml | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/.github/workflows/sync-docs-cn-to-en.yml b/.github/workflows/sync-docs-cn-to-en.yml index 9a6865106d047..6ef49cb6fd1b7 100644 --- a/.github/workflows/sync-docs-cn-to-en.yml +++ b/.github/workflows/sync-docs-cn-to-en.yml @@ -25,7 +25,7 @@ on: jobs: sync-docs: runs-on: ubuntu-latest - + steps: - name: Checkout current repository uses: actions/checkout@v4 @@ -33,6 +33,13 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} fetch-depth: 0 + - name: Checkout ai-pr-translator repository + uses: actions/checkout@v4 + with: + repository: "qiancai/ai-pr-translator" + ref: "main" + path: "ai-pr-translator" + - name: Set up Python uses: actions/setup-python@v4 with: @@ -41,7 +48,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install -r scripts/translate_doc_pr/requirements.txt + pip install -r ai-pr-translator/scripts/requirements.txt - name: Extract PR information id: extract_info @@ -101,7 +108,7 @@ jobs: AI_PROVIDER: ${{ github.event.inputs.ai_provider }} TARGET_REPO_PATH: ${{ github.workspace }}/target_repo run: | - cd scripts/translate_doc_pr + cd ai-pr-translator/scripts if python main_workflow.py; then echo "sync_success=true" >> $GITHUB_OUTPUT echo "āœ… Sync script completed successfully" @@ -120,13 +127,13 @@ jobs: echo "No changes to commit" else git commit -m "Auto-sync: Update English docs from Chinese PR ${{ github.event.inputs.source_pr_url }} - + Synced from: ${{ github.event.inputs.source_pr_url }} Target PR: ${{ github.event.inputs.target_pr_url }} AI Provider: ${{ github.event.inputs.ai_provider }} - + Co-authored-by: github-actions[bot] " - + git push origin ${{ steps.target_branch.outputs.target_branch }} echo "Changes pushed to target PR branch: ${{ steps.target_branch.outputs.target_branch }}" fi From 3086f31f99164a2a0a4e9bcdc1373a3f0ed92928 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 21 Oct 2025 19:03:57 +0800 Subject: [PATCH 16/18] Add temp.md --- temp.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 temp.md diff --git a/temp.md b/temp.md new file mode 100644 index 0000000000000..af27ff4986a7b --- /dev/null +++ b/temp.md @@ -0,0 +1 @@ +This is a test file. \ No newline at end of file From 05f680452813766a46625f5148a0cebb49034f73 Mon Sep 17 00:00:00 2001 From: Grace Cai Date: Tue, 21 Oct 2025 19:04:03 +0800 Subject: [PATCH 17/18] Delete temp.md --- temp.md | 1 - 1 file changed, 1 deletion(-) delete mode 100644 temp.md diff --git a/temp.md b/temp.md deleted file mode 100644 index af27ff4986a7b..0000000000000 --- a/temp.md +++ /dev/null @@ -1 +0,0 @@ -This is a test file. 
\ No newline at end of file

From 3edad37ea74ea1a52993dcea1dd1a6f59a0b5eeb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Tue, 21 Oct 2025 11:05:17 +0000
Subject: [PATCH 18/18] Auto-sync: Update English docs from Chinese PR
 https://github.com/qiancai/docs-cn/pull/15

Synced from: https://github.com/qiancai/docs-cn/pull/15
Target PR: https://github.com/qiancai/docs/pull/50
AI Provider: gemini

Co-authored-by: github-actions[bot]
---
 system-variables.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/system-variables.md b/system-variables.md
index 9732f79f6b3ce..ef993d69f32dd 100644
--- a/system-variables.md
+++ b/system-variables.md
@@ -1791,6 +1791,16 @@ Assume that you have a cluster with 4 TiDB nodes and multiple TiKV nodes. In thi
 - This variable is renamed from the variable [`tidb_ddl_version`](https://docs-archive.pingcap.com/tidb/v7.6/system-variables#tidb_ddl_version-new-in-v760) that is introduced in v7.6.0. Starting from v8.0.0, `tidb_ddl_version` no longer takes effect.
 - Starting from TiDB v8.5.0, the accelerated table creation feature is enabled by default for newly created clusters, with `tidb_enable_fast_create_table` set to `ON`. For clusters upgraded from v8.4.0 or earlier versions, the default value of `tidb_enable_fast_create_table` remains unchanged.
 
+### `tidb_opt_selectivity_factor` New in v9.0.0
+
+- Scope: SESSION | GLOBAL
+- Is persisted to the cluster: Yes
+- Is controlled by the hint [SET_VAR](/optimizer-hints.md#set_varvar_namevar_value): Yes
+- Type: Floating-point number
+- Value range: `[0, 1]`
+- Default value: `0.8`
+- This variable specifies the default selectivity used by the TiDB optimizer. When the optimizer cannot derive the selectivity of a predicate from statistics, it falls back to this default value. **It is not recommended** to modify this value.
+
 ### tidb_default_string_match_selectivity New in v6.2.0
 
 - Scope: SESSION | GLOBAL
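
For reference, the following is a minimal SQL sketch of how the variable documented in the final patch above is typically inspected and adjusted. It is not part of the patch itself; the `0.5` value is an arbitrary illustration (the documented default is `0.8`), and the documentation explicitly does not recommend changing it.

```sql
-- Inspect the optimizer's current default selectivity.
SHOW VARIABLES LIKE 'tidb_opt_selectivity_factor';

-- Override it for the current session only (0.5 is illustrative, not a recommendation).
SET SESSION tidb_opt_selectivity_factor = 0.5;

-- Or set it cluster-wide; per the documentation above, the value is persisted to the cluster.
SET GLOBAL tidb_opt_selectivity_factor = 0.5;
```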