diff --git a/README.md b/README.md index ca8f135..a2b1179 100644 --- a/README.md +++ b/README.md @@ -115,7 +115,8 @@ usage: leetcode-export [-h] [--cookies COOKIES] [--folder FOLDER] [--problem-statement-content PROBLEM_STATEMENT_CONTENT] [--submission-filename SUBMISSION_FILENAME] [--only-accepted] [--only-last-submission] - [--language LANGUAGE_UNPROCESSED] [-v] [-vv] [-V] + [--language LANGUAGE_UNPROCESSED] + [--checkpoint-file CHECKPOINT_FILE] [-v] [-vv] [-V] Export LeetCode submissions @@ -144,11 +145,49 @@ options: html, php, golang, scala, pythonml, rust, ruby, bash, swift example: --language=python,cpp,java + --checkpoint-file CHECKPOINT_FILE + path to checkpoint file for incremental backups (stores Unix + timestamp of newest processed submission) -v, --verbose enable verbose logging details -vv, --extra-verbose enable more verbose logging details -V, --version show program's version number and exit ``` +### Incremental Backups + +The `--checkpoint-file` option enables incremental backups by storing the timestamp of the newest processed submission. This allows you to run the script multiple times and only download new submissions since the last run, making it much faster for regular backups. + +#### How it works + +1. **First run**: If the checkpoint file doesn't exist, the script interactively asks whether to create it; answering `y` or `yes` creates it and performs a full backup, while any other answer aborts the run +2. **Subsequent runs**: The script reads the timestamp from the checkpoint file and only processes submissions newer than that timestamp +3. 
**Automatic updates**: The checkpoint file is updated automatically at the end of a successful run with the timestamp of the newest submission written to disk + +#### Example usage + +```bash +# First run - full backup +leetcode-export \ + --folder ./submissions \ + --checkpoint-file ~/.leetcode_checkpoint \ + --only-accepted \ + --cookies "your_cookies_here" + +# Subsequent runs - only new submissions +leetcode-export \ + --folder ./submissions \ + --checkpoint-file ~/.leetcode_checkpoint \ + --only-accepted \ + --cookies "your_cookies_here" +``` + +#### Important notes + +- The checkpoint file stores the Unix timestamp of the newest processed submission +- Only submissions that are actually written to disk (not filtered out, and not skipped because the file already exists) update the checkpoint +- If no new submissions are written, the checkpoint file remains unchanged +- The script stops fetching as soon as it reaches a submission that is not newer than the checkpoint, so incremental runs only request the most recent pages + ### Problem template arguments #### Problem statement filename template diff --git a/leetcode_export/__main__.py b/leetcode_export/__main__.py index 18bf628..83ff147 100644 --- a/leetcode_export/__main__.py +++ b/leetcode_export/__main__.py @@ -8,7 +8,7 @@ import logging import os from string import Template -from typing import Set +from typing import Optional, Set from leetcode_export._version import __version__ from leetcode_export.leetcode import LeetCode @@ -95,6 +95,11 @@ def parse_args(): action="store_true", help="enable more verbose logging details", ) + parser.add_argument( + "--checkpoint-file", + type=str, + help="path to checkpoint file for incremental backups (stores Unix timestamp of newest processed submission)", + ) parser.add_argument( "-V", "--version", @@ -118,6 +123,45 @@ def parse_args(): return args +def load_checkpoint(checkpoint_file: str) -> Optional[int]: + """ + Load timestamp from checkpoint file + :param checkpoint_file: path to checkpoint file + :return: Unix timestamp or None if file doesn't exist or is invalid + 
""" + if not os.path.exists(checkpoint_file): + logging.info(f"Checkpoint file {checkpoint_file} does not exist") + response = input(f"Create checkpoint file at {checkpoint_file} and start from beginning? (y/N): ") + if response.lower() in ['y', 'yes']: + write_checkpoint(checkpoint_file, 0) + return 0 + else: + logging.error("Checkpoint file required for incremental backup. Exiting.") + exit(1) + try: + with open(checkpoint_file, 'r') as f: + timestamp = int(f.read().strip()) + logging.info(f"Loaded checkpoint timestamp: {timestamp}") + return timestamp + except (ValueError, IOError) as e: + logging.error(f"Failed to read checkpoint file {checkpoint_file}: {e}") + exit(1) + + +def write_checkpoint(checkpoint_file: str, timestamp: int) -> None: + """ + Write timestamp to checkpoint file + :param checkpoint_file: path to checkpoint file + :param timestamp: Unix timestamp to write + """ + try: + with open(checkpoint_file, 'w') as f: + f.write(str(timestamp)) + logging.debug(f"Updated checkpoint to timestamp: {timestamp}") + except IOError as e: + logging.error(f"Failed to write checkpoint file {checkpoint_file}: {e}") + + def configure_logging(args): logging_file_handler = logging.FileHandler("debug.log", encoding="UTF8") logging_file_handler.setLevel(logging.DEBUG) @@ -166,15 +210,27 @@ def main(): logging.info("Output folder not found, creating it") os.mkdir(args.folder) os.chdir(args.folder) + base_folder = os.getcwd() title_slug_to_problem_folder_name: dict[str, str] = dict() title_slug_to_exported_languages: dict[str, set[str]] = dict() last_submission_timestamp: Optional[int] = None + # Handle checkpoint functionality + checkpoint_timestamp: Optional[int] = None + newest_processed_timestamp: Optional[int] = None + submissions_processed = 0 + + if args.checkpoint_file: + checkpoint_timestamp = load_checkpoint(args.checkpoint_file) + logging.info(f"Using checkpoint file: {args.checkpoint_file}") + if checkpoint_timestamp > 0: + logging.info(f"Only processing 
submissions newer than timestamp {checkpoint_timestamp}") + print("Exporting LeetCode submissions...") - for submission in leetcode.get_submissions(): + for submission in leetcode.get_submissions(since_timestamp=checkpoint_timestamp): if ( last_submission_timestamp is not None and submission.timestamp > last_submission_timestamp @@ -220,7 +276,7 @@ def main(): problem_folder_name ) if not os.path.exists(problem_folder_name): - os.mkdir(problem_folder_name) + os.makedirs(problem_folder_name, exist_ok=True) os.chdir(problem_folder_name) problem_statement_filename = problem_statement_filename_template.substitute( @@ -241,17 +297,35 @@ def main(): submission_filename = submission_filename_template.substitute( **submission.__dict__ ) + submission_was_written = False if not os.path.exists(submission_filename): logging.info(f"Writing {submission.title_slug}/{submission_filename}") sub_file = open(submission_filename, "w+") sub_file.write(submission.code) sub_file.close() + submission_was_written = True else: logging.info( f"{submission.title_slug}/{submission_filename} already exists, skipping it" ) - os.chdir("..") + # Track processing for checkpoint updates + if submission_was_written: + submissions_processed += 1 + if newest_processed_timestamp is None or submission.timestamp > newest_processed_timestamp: + newest_processed_timestamp = submission.timestamp + + os.chdir(base_folder) + + # Final summary and checkpoint update + if args.checkpoint_file: + if submissions_processed > 0: + # Only update checkpoint after successful completion of all processing + write_checkpoint(args.checkpoint_file, newest_processed_timestamp) + print(f"Processed {submissions_processed} new submissions") + print(f"Updated checkpoint to timestamp: {newest_processed_timestamp}") + else: + logging.info("No new submissions found since last checkpoint") if __name__ == "__main__": diff --git a/leetcode_export/leetcode.py b/leetcode_export/leetcode.py index 0fe0349..a8f37b8 100644 --- 
a/leetcode_export/leetcode.py +++ b/leetcode_export/leetcode.py @@ -5,7 +5,7 @@ import datetime import logging from time import sleep -from typing import Dict, Iterator +from typing import Dict, Iterator, Optional import requests @@ -131,9 +131,10 @@ def get_problem_statement(self, slug: str) -> Problem: ) return Problem.from_dict(problem_dict) - def get_submissions(self) -> Iterator[Submission]: + def get_submissions(self, since_timestamp: Optional[int] = None) -> Iterator[Submission]: """ Get submissions for logged user + :param since_timestamp: Only return submissions newer than this Unix timestamp :return: Iterator[Submission], LeetCode submission """ if not self.is_user_logged(): @@ -152,7 +153,14 @@ def get_submissions(self) -> Iterator[Submission]: logging.debug(response.content) response_json = response.json() if "submissions_dump" in response_json: + found_older_submission = False for submission_dict in response_json["submissions_dump"]: + # Check if this submission is older than our checkpoint + if since_timestamp is not None and submission_dict["timestamp"] <= since_timestamp: + logging.info(f"Reached submissions older than checkpoint timestamp {since_timestamp}, stopping") + found_older_submission = True + break + submission_dict["runtime"] = submission_dict["runtime"].replace( " ", "" ) @@ -177,6 +185,10 @@ def get_submissions(self) -> Iterator[Submission]: submission = Submission.from_dict(submission_dict) yield submission + # If we found an older submission, stop pagination + if found_older_submission: + break + current += 20 sleep(5) # cooldown time for get request if "detail" in response_json: