Increase code syntax diversity, use github permalinks

2026-03-31 04:55:55 +00:00
parent 79021db57f
commit eeb48157c5
4 changed files with 2210 additions and 156 deletions
@@ -0,0 +1,237 @@
 #!/usr/bin/env python3
 """Convert raw.githubusercontent.com URLs in code_syntax.rs from branch refs to commit-SHA permalinks.
 Usage:
    # Dry-run (prints what would change):
    python3 scripts/permalinkify_code_urls.py --dry-run
    # Apply in-place:
    python3 scripts/permalinkify_code_urls.py
    # With a GitHub token for higher rate limits (recommended for 485 URLs):
    GITHUB_TOKEN=ghp_xxx python3 scripts/permalinkify_code_urls.py
 The script resolves each branch ref (master, main, dev, etc.) to the current
 commit SHA via the GitHub API, then rewrites the URLs so they never change when
 upstream repos push new commits or restructure files.
 Before:
    https://raw.githubusercontent.com/tokio-rs/tokio/master/tokio/src/sync/mutex.rs
 After:
    https://raw.githubusercontent.com/tokio-rs/tokio/a1b2c3d.../tokio/src/sync/mutex.rs
 """
 import argparse
 import json
 import os
 import re
 import sys
 import time
 import urllib.error
 import urllib.request
 # Default file to rewrite: ../src/generator/code_syntax.rs, resolved relative
 # to the directory containing this script rather than the current working
 # directory.  Can be overridden on the command line with --file.
 CODE_SYNTAX_PATH = os.path.join(
    os.path.dirname(__file__), "..", "src", "generator", "code_syntax.rs"
 )
 # Matches a ref that is already a full 40-character lowercase-hex commit SHA.
 # Such refs are treated as pinned and are returned as-is instead of being
 # looked up through the GitHub API.
 SHA_RE = re.compile(r"^[0-9a-f]{40}$")
 def github_headers():
    """Build the HTTP headers sent with every GitHub API request.

    The ``Accept`` header is always included.  An ``Authorization`` header is
    added only when the ``GITHUB_TOKEN`` environment variable is set to a
    non-empty value.
    """
    token = os.environ.get("GITHUB_TOKEN")
    # Unset and empty tokens are treated alike: send an anonymous request.
    if not token:
        return {"Accept": "application/vnd.github.v3+json"}
    return {
        "Accept": "application/vnd.github.v3+json",
        "Authorization": f"token {token}",
    }
 def _try_resolve_branch(owner: str, repo: str, ref: str) -> str | None:
    """Look up ``ref`` as a branch of ``owner/repo`` and return its commit SHA.

    Queries the GitHub ``git/ref/heads/<ref>`` endpoint.  Returns ``None`` when
    the API answers with any HTTP error status (for example, the branch does
    not exist).  Other failures, such as connection errors or malformed
    responses, are not caught here.
    """
    endpoint = f"https://api.github.com/repos/{owner}/{repo}/git/ref/heads/{ref}"
    request = urllib.request.Request(endpoint, headers=github_headers())
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            payload = json.loads(response.read())
    except urllib.error.HTTPError:
        return None
    return payload["object"]["sha"]
 def _try_resolve_tag(owner: str, repo: str, ref: str) -> str | None:
    """Look up ``ref`` as a tag of ``owner/repo`` and return the SHA it points to.

    Queries the GitHub ``git/ref/tags/<ref>`` endpoint.  If the ref's object is
    itself of type ``"tag"`` (presumably an annotated tag), the object's URL is
    fetched with a second request and the SHA of *its* target is returned;
    otherwise the ref's object SHA is returned directly.  Returns ``None`` when
    either request fails with an HTTP error status.
    """
    endpoint = f"https://api.github.com/repos/{owner}/{repo}/git/ref/tags/{ref}"
    request = urllib.request.Request(endpoint, headers=github_headers())
    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            target = json.loads(response.read())["object"]
        if target["type"] != "tag":
            # The ref already points straight at the object we want.
            return target["sha"]
        # One extra hop: ask the API what the tag object itself points to.
        deref_request = urllib.request.Request(target["url"], headers=github_headers())
        with urllib.request.urlopen(deref_request, timeout=15) as deref_response:
            return json.loads(deref_response.read())["object"]["sha"]
    except urllib.error.HTTPError:
        return None
 def resolve_ref_to_sha(owner: str, repo: str, ref: str) -> str | None:
    """Resolve a branch or tag name of ``owner/repo`` to a commit SHA.

    A ref that already looks like a full 40-character SHA is returned
    unchanged without contacting the API.  Otherwise the ref is tried as a
    branch first and then as a tag; the first lookup that yields a SHA wins.
    If both lookups fail, a warning is written to stderr and ``None`` is
    returned.  No other ref spellings are attempted.
    """
    if SHA_RE.match(ref):
        return ref
    # Branches take precedence over tags, matching the lookup order above.
    for lookup in (_try_resolve_branch, _try_resolve_tag):
        sha = lookup(owner, repo, ref)
        if sha:
            return sha
    print(f"  WARNING: could not resolve {owner}/{repo} ref={ref}", file=sys.stderr)
    return None
 def check_rate_limit():
    """Report the remaining GitHub "core" API quota and return it.

    Prints a one-line summary to stdout and, when fewer than 50 requests
    remain, a warning to stderr.  Returns the remaining request count, or
    ``None`` if the quota could not be fetched or parsed for any reason; in
    that case the error is printed to stderr rather than raised, because this
    check is best-effort.
    """
    request = urllib.request.Request(
        "https://api.github.com/rate_limit", headers=github_headers()
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            core = json.loads(response.read())["resources"]["core"]
        remaining = core["remaining"]
        limit = core["limit"]
        # "reset" is compared against time.time(); never report a negative wait.
        seconds_left = core["reset"] - int(time.time())
        reset_in = seconds_left if seconds_left > 0 else 0
        print(f"GitHub API rate limit: {remaining}/{limit} remaining, resets in {reset_in}s")
        if remaining < 50:
            print(
                "WARNING: Low rate limit. Set GITHUB_TOKEN env var for 5000 req/hr.",
                file=sys.stderr,
            )
        return remaining
    except Exception as e:
        print(f"Could not check rate limit: {e}", file=sys.stderr)
        return None
 def main():
    """Command-line entry point: pin raw.githubusercontent.com URLs to commit SHAs.

    Reads the target file (``--file``, defaulting to ``CODE_SYNTAX_PATH``),
    collects every unique ``(owner, repo, ref)`` triple that appears in a
    raw.githubusercontent.com URL, resolves each non-SHA ref through the GitHub
    API and rewrites the matching URL prefixes.  With ``--dry-run`` the planned
    replacements are printed and the file is left untouched.

    Only the first path segment after ``owner/repo`` is treated as the ref.
    Branch names that contain a slash (e.g. ``series/3.x``) are therefore not
    resolved: they are reported as failures and their URLs stay unchanged.

    Exits with status 1 if the GitHub rate limit is known to be lower than the
    number of refs that need resolving.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Print changes without modifying the file",
    )
    parser.add_argument(
        "--file",
        default=CODE_SYNTAX_PATH,
        help="Path to code_syntax.rs",
    )
    args = parser.parse_args()
    # The target is Rust source, which is UTF-8; do not depend on the
    # platform's default text encoding.
    with open(args.file, encoding="utf-8") as f:
        content = f.read()
    # Each URL stops at whitespace or a quote so that several URLs on the same
    # line are all found.  (A greedy ".+" would run to the end of the line and
    # hide every URL after the first one.)
    url_prefix_re = re.compile(
        r"https://raw\.githubusercontent\.com/"
        r"(?P<owner>[^/\s\"']+)/(?P<repo>[^/\s\"']+)/(?P<rest>[^\s\"']+)"
    )
    urls_found = url_prefix_re.findall(content)
    # Map each unique (owner, repo, ref) to its commit SHA to minimize API
    # calls.  The ref is taken to be the first path segment of `rest`.  A ref
    # that is already a full SHA maps to itself; everything else starts out as
    # None until it is resolved below.
    ref_keys: dict[tuple[str, str, str], str | None] = {}
    for owner, repo, rest in urls_found:
        first_segment = rest.split("/", 1)[0]
        pinned = first_segment if SHA_RE.match(first_segment) else None
        ref_keys.setdefault((owner, repo, first_segment), pinned)
    already_pinned = sum(1 for sha in ref_keys.values() if sha is not None)
    to_resolve = len(ref_keys) - already_pinned
    print(f"Found {len(urls_found)} URLs across {len(ref_keys)} unique (owner/repo/ref) combos")
    print(f"  Already pinned to SHA: {already_pinned}")
    print(f"  Need resolution: {to_resolve}")
    if to_resolve == 0:
        print("Nothing to do — all URLs already use commit SHAs.")
        return
    # Every unresolved ref costs at least one API call (more when the branch
    # lookup fails and the tag lookup runs), so this is a lower-bound check.
    remaining = check_rate_limit()
    if remaining is not None and remaining < to_resolve:
        print(
            f"ERROR: Need {to_resolve} API calls but only {remaining} remaining. "
            "Set GITHUB_TOKEN or wait for reset.",
            file=sys.stderr,
        )
        sys.exit(1)
    # Resolve each unique ref that is not already pinned, in a stable order.
    resolved = 0
    failed = 0
    pending = sorted(key for key, sha in ref_keys.items() if sha is None)
    for owner, repo, ref in pending:
        sha = resolve_ref_to_sha(owner, repo, ref)
        if sha:
            ref_keys[(owner, repo, ref)] = sha
            resolved += 1
        else:
            failed += 1
        # Progress
        done = resolved + failed
        if done % 10 == 0 or done == to_resolve:
            print(f"  Progress: {done}/{to_resolve} ({resolved} resolved, {failed} failed)")
        # Be polite to the API.  A dry run and a failed lookup issue the same
        # requests as a successful real run, so pace them identically.
        time.sleep(0.1)
    print(f"\nResolved {resolved}/{to_resolve} refs ({failed} failures)")
    # Rewrite every URL prefix whose ref was resolved to a different SHA.
    replacements = 0
    new_content = content
    for (owner, repo, ref), sha in ref_keys.items():
        if not sha or sha == ref:
            # Unresolved, or already pinned: nothing to rewrite.
            continue
        old_prefix = f"raw.githubusercontent.com/{owner}/{repo}/{ref}/"
        new_prefix = f"raw.githubusercontent.com/{owner}/{repo}/{sha}/"
        count = new_content.count(old_prefix)
        if count > 0:
            if args.dry_run:
                print(f"  {owner}/{repo}: {ref} -> {sha[:12]}... ({count} URLs)")
            new_content = new_content.replace(old_prefix, new_prefix)
            replacements += count
    print(f"\nTotal URL replacements: {replacements}")
    if args.dry_run:
        print("\n(dry-run mode — no file modified)")
    else:
        with open(args.file, "w", encoding="utf-8") as f:
            f.write(new_content)
        print(f"Wrote {args.file}")
 if __name__ == "__main__":
    main()
@@ -2089,18 +2089,15 @@ impl App {
        let chosen = self.code_drill_language_override.clone().unwrap();
        // Step 2: Check if we need to download (only if not already attempted)
-        if self.config.code_downloads_enabled
+        if self.config.code_downloads_enabled && !self.code_download_attempted {
-            && !self.code_download_attempted
+            let queue =
-            && !is_language_cached(&self.config.code_download_dir, &chosen)
+                build_code_download_queue(&chosen, &self.config.code_download_dir);
-        {
+            if !queue.is_empty() {
-            if let Some(lang) = language_by_key(&chosen) {
+                self.code_intro_download_total = queue.len();
-                if !lang.repos.is_empty() {
+                self.code_download_queue = queue;
                    let repo_idx = self.rng.gen_range(0..lang.repos.len());
                    self.code_download_queue = vec![(chosen.clone(), repo_idx)];
                    self.code_intro_download_total = 1;
                self.code_intro_downloaded = 0;
                self.code_intro_downloading = true;
-                    self.code_intro_current_repo = lang.repos[repo_idx].key.to_string();
+                self.code_intro_current_repo.clear();
                self.code_download_action = CodeDownloadCompleteAction::StartCodeDrill;
                self.code_download_job = None;
                self.code_download_attempted = true;
@@ -2108,7 +2105,6 @@ impl App {
                return;
            }
        }
        }
        // Step 3: If language has no built-in AND no cache → fallback
        if !is_language_cached(&self.config.code_download_dir, &chosen) {
@@ -5192,8 +5192,8 @@ mod review_tests {
            .expect("de should be selectable");
        let output = render_settings_to_string(&app);
-        assert!(output.contains("German"));
+        assert!(output.contains("Deutsch"));
-        assert!(!output.contains("German (preview)"));
+        assert!(!output.contains("Deutsch (preview)"));
        assert!(output.contains("de_qwertz"));
        assert!(!output.contains("qwerty (preview)"));
    }
@@ -5396,7 +5396,7 @@ mod review_tests {
        assert!(output.contains("default adaptive drill will mix in keys"));
        assert!(output.contains("focus only on this branch"));
        assert!(output.contains("from this branch in the Skill Tree."));
-        assert!(output.contains("Proceed? (y/n)"));
+        assert!(output.contains("[y] Unlock"));
    }
    #[test]
@@ -5408,7 +5408,7 @@ mod review_tests {
        let output = render_skill_tree_to_string_with_size(&app, 90, 24);
        assert!(output.contains("focus only on this branch"));
        assert!(output.contains("from this branch in the Skill Tree."));
-        assert!(output.contains("Proceed? (y/n)"));
+        assert!(output.contains("[y] Unlock"));
    }
    #[test]