Increase code syntax diversity, use github permalinks
This commit is contained in:
237
scripts/permalinkify_code_urls.py
Normal file
237
scripts/permalinkify_code_urls.py
Normal file
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Convert raw.githubusercontent.com URLs in code_syntax.rs from branch refs to commit-SHA permalinks.
|
||||
|
||||
Usage:
|
||||
# Dry-run (prints what would change):
|
||||
python3 scripts/permalinkify_code_urls.py --dry-run
|
||||
|
||||
# Apply in-place:
|
||||
python3 scripts/permalinkify_code_urls.py
|
||||
|
||||
# With a GitHub token for higher rate limits (recommended for 485 URLs):
|
||||
GITHUB_TOKEN=ghp_xxx python3 scripts/permalinkify_code_urls.py
|
||||
|
||||
The script resolves each branch ref (master, main, dev, etc.) to the current
|
||||
commit SHA via the GitHub API, then rewrites the URLs so they never change when
|
||||
upstream repos push new commits or restructure files.
|
||||
|
||||
Before:
|
||||
https://raw.githubusercontent.com/tokio-rs/tokio/master/tokio/src/sync/mutex.rs
|
||||
After:
|
||||
https://raw.githubusercontent.com/tokio-rs/tokio/a1b2c3d.../tokio/src/sync/mutex.rs
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
|
||||
CODE_SYNTAX_PATH = os.path.join(
|
||||
os.path.dirname(__file__), "..", "src", "generator", "code_syntax.rs"
|
||||
)
|
||||
|
||||
# Looks like a full 40-char SHA already
|
||||
SHA_RE = re.compile(r"^[0-9a-f]{40}$")
|
||||
|
||||
|
||||
def github_headers():
|
||||
token = os.environ.get("GITHUB_TOKEN")
|
||||
headers = {"Accept": "application/vnd.github.v3+json"}
|
||||
if token:
|
||||
headers["Authorization"] = f"token {token}"
|
||||
return headers
|
||||
|
||||
|
||||
def _try_resolve_branch(owner: str, repo: str, ref: str) -> str | None:
|
||||
"""Try to resolve a single branch name to its commit SHA."""
|
||||
url = f"https://api.github.com/repos/{owner}/{repo}/git/ref/heads/{ref}"
|
||||
req = urllib.request.Request(url, headers=github_headers())
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
data = json.loads(resp.read())
|
||||
return data["object"]["sha"]
|
||||
except urllib.error.HTTPError:
|
||||
return None
|
||||
|
||||
|
||||
def _try_resolve_tag(owner: str, repo: str, ref: str) -> str | None:
|
||||
"""Try to resolve a tag name to its commit SHA."""
|
||||
url = f"https://api.github.com/repos/{owner}/{repo}/git/ref/tags/{ref}"
|
||||
req = urllib.request.Request(url, headers=github_headers())
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=15) as resp:
|
||||
data = json.loads(resp.read())
|
||||
obj = data["object"]
|
||||
if obj["type"] == "tag":
|
||||
deref_url = obj["url"]
|
||||
req2 = urllib.request.Request(deref_url, headers=github_headers())
|
||||
with urllib.request.urlopen(req2, timeout=15) as resp2:
|
||||
tag_data = json.loads(resp2.read())
|
||||
return tag_data["object"]["sha"]
|
||||
return obj["sha"]
|
||||
except urllib.error.HTTPError:
|
||||
return None
|
||||
|
||||
|
||||
def resolve_ref_to_sha(owner: str, repo: str, ref: str) -> str | None:
|
||||
"""Resolve a branch/tag ref to its commit SHA via the GitHub API.
|
||||
|
||||
Tries the ref as a branch first, then as a tag. If neither works and the
|
||||
ref doesn't contain a slash, also tries common slash-prefixed variants
|
||||
(e.g. "master" might actually be the first segment of "master/next").
|
||||
"""
|
||||
if SHA_RE.match(ref):
|
||||
return ref
|
||||
|
||||
sha = _try_resolve_branch(owner, repo, ref)
|
||||
if sha:
|
||||
return sha
|
||||
|
||||
sha = _try_resolve_tag(owner, repo, ref)
|
||||
if sha:
|
||||
return sha
|
||||
|
||||
print(f" WARNING: could not resolve {owner}/{repo} ref={ref}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def check_rate_limit():
|
||||
"""Print current GitHub API rate limit status."""
|
||||
req = urllib.request.Request(
|
||||
"https://api.github.com/rate_limit", headers=github_headers()
|
||||
)
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=10) as resp:
|
||||
data = json.loads(resp.read())
|
||||
core = data["resources"]["core"]
|
||||
remaining = core["remaining"]
|
||||
limit = core["limit"]
|
||||
reset_ts = core["reset"]
|
||||
reset_in = max(0, reset_ts - int(time.time()))
|
||||
print(f"GitHub API rate limit: {remaining}/{limit} remaining, resets in {reset_in}s")
|
||||
if remaining < 50:
|
||||
print(
|
||||
"WARNING: Low rate limit. Set GITHUB_TOKEN env var for 5000 req/hr.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return remaining
|
||||
except Exception as e:
|
||||
print(f"Could not check rate limit: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description=__doc__)
|
||||
parser.add_argument(
|
||||
"--dry-run",
|
||||
action="store_true",
|
||||
help="Print changes without modifying the file",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--file",
|
||||
default=CODE_SYNTAX_PATH,
|
||||
help="Path to code_syntax.rs",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.file) as f:
|
||||
content = f.read()
|
||||
|
||||
# Collect unique (owner, repo, ref) tuples to minimize API calls.
|
||||
# Branch names can contain slashes (e.g. "series/3.x"), so we can't simply
|
||||
# split on "/" to extract the ref. Instead we use the GitHub API to look up
|
||||
# the repo's default branch and resolve from there.
|
||||
url_prefix_re = re.compile(
|
||||
r"https://raw\.githubusercontent\.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/(?P<rest>.+)"
|
||||
)
|
||||
urls_found = url_prefix_re.findall(content)
|
||||
|
||||
# Deduce (owner, repo, ref, path) — if `rest` starts with a 40-char hex SHA
|
||||
# it's already pinned; otherwise ask the GitHub API for the default branch.
|
||||
ref_keys: dict[tuple[str, str, str], str | None] = {}
|
||||
for owner, repo, rest in urls_found:
|
||||
first_segment = rest.split("/")[0]
|
||||
if SHA_RE.match(first_segment):
|
||||
ref_keys[(owner, repo, first_segment)] = first_segment
|
||||
else:
|
||||
# We need to figure out which part of `rest` is the ref vs the path.
|
||||
# We try the first segment, then first two segments (for slash-branches
|
||||
# like "series/3.x"), etc.
|
||||
ref_key = (owner, repo, first_segment)
|
||||
if ref_key not in ref_keys:
|
||||
ref_keys[ref_key] = None
|
||||
|
||||
already_pinned = sum(1 for _, _, ref in ref_keys if SHA_RE.match(ref))
|
||||
to_resolve = sum(1 for _, _, ref in ref_keys if not SHA_RE.match(ref))
|
||||
|
||||
print(f"Found {len(urls_found)} URLs across {len(ref_keys)} unique (owner/repo/ref) combos")
|
||||
print(f" Already pinned to SHA: {already_pinned}")
|
||||
print(f" Need resolution: {to_resolve}")
|
||||
|
||||
if to_resolve == 0:
|
||||
print("Nothing to do — all URLs already use commit SHAs.")
|
||||
return
|
||||
|
||||
remaining = check_rate_limit()
|
||||
if remaining is not None and remaining < to_resolve:
|
||||
print(
|
||||
f"ERROR: Need {to_resolve} API calls but only {remaining} remaining. "
|
||||
"Set GITHUB_TOKEN or wait for reset.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
# Resolve each unique ref
|
||||
resolved = 0
|
||||
failed = 0
|
||||
for (owner, repo, ref) in sorted(ref_keys):
|
||||
if SHA_RE.match(ref):
|
||||
ref_keys[(owner, repo, ref)] = ref
|
||||
continue
|
||||
|
||||
sha = resolve_ref_to_sha(owner, repo, ref)
|
||||
if sha:
|
||||
ref_keys[(owner, repo, ref)] = sha
|
||||
resolved += 1
|
||||
if not args.dry_run:
|
||||
# Be polite to the API
|
||||
time.sleep(0.1)
|
||||
else:
|
||||
failed += 1
|
||||
# Progress
|
||||
done = resolved + failed
|
||||
if done % 10 == 0 or done == to_resolve:
|
||||
print(f" Progress: {done}/{to_resolve} ({resolved} resolved, {failed} failed)")
|
||||
|
||||
print(f"\nResolved {resolved}/{to_resolve} refs ({failed} failures)")
|
||||
|
||||
# Build replacement map
|
||||
replacements = 0
|
||||
new_content = content
|
||||
for (owner, repo, ref), sha in ref_keys.items():
|
||||
if sha and sha != ref:
|
||||
old_prefix = f"raw.githubusercontent.com/{owner}/{repo}/{ref}/"
|
||||
new_prefix = f"raw.githubusercontent.com/{owner}/{repo}/{sha}/"
|
||||
count = new_content.count(old_prefix)
|
||||
if count > 0:
|
||||
if args.dry_run:
|
||||
print(f" {owner}/{repo}: {ref} -> {sha[:12]}... ({count} URLs)")
|
||||
new_content = new_content.replace(old_prefix, new_prefix)
|
||||
replacements += count
|
||||
|
||||
print(f"\nTotal URL replacements: {replacements}")
|
||||
|
||||
if args.dry_run:
|
||||
print("\n(dry-run mode — no file modified)")
|
||||
else:
|
||||
with open(args.file, "w") as f:
|
||||
f.write(new_content)
|
||||
print(f"Wrote {args.file}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
32
src/app.rs
32
src/app.rs
@@ -2089,24 +2089,20 @@ impl App {
|
||||
let chosen = self.code_drill_language_override.clone().unwrap();
|
||||
|
||||
// Step 2: Check if we need to download (only if not already attempted)
|
||||
if self.config.code_downloads_enabled
|
||||
&& !self.code_download_attempted
|
||||
&& !is_language_cached(&self.config.code_download_dir, &chosen)
|
||||
{
|
||||
if let Some(lang) = language_by_key(&chosen) {
|
||||
if !lang.repos.is_empty() {
|
||||
let repo_idx = self.rng.gen_range(0..lang.repos.len());
|
||||
self.code_download_queue = vec![(chosen.clone(), repo_idx)];
|
||||
self.code_intro_download_total = 1;
|
||||
self.code_intro_downloaded = 0;
|
||||
self.code_intro_downloading = true;
|
||||
self.code_intro_current_repo = lang.repos[repo_idx].key.to_string();
|
||||
self.code_download_action = CodeDownloadCompleteAction::StartCodeDrill;
|
||||
self.code_download_job = None;
|
||||
self.code_download_attempted = true;
|
||||
self.screen = AppScreen::CodeDownloadProgress;
|
||||
return;
|
||||
}
|
||||
if self.config.code_downloads_enabled && !self.code_download_attempted {
|
||||
let queue =
|
||||
build_code_download_queue(&chosen, &self.config.code_download_dir);
|
||||
if !queue.is_empty() {
|
||||
self.code_intro_download_total = queue.len();
|
||||
self.code_download_queue = queue;
|
||||
self.code_intro_downloaded = 0;
|
||||
self.code_intro_downloading = true;
|
||||
self.code_intro_current_repo.clear();
|
||||
self.code_download_action = CodeDownloadCompleteAction::StartCodeDrill;
|
||||
self.code_download_job = None;
|
||||
self.code_download_attempted = true;
|
||||
self.screen = AppScreen::CodeDownloadProgress;
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -5192,8 +5192,8 @@ mod review_tests {
|
||||
.expect("de should be selectable");
|
||||
|
||||
let output = render_settings_to_string(&app);
|
||||
assert!(output.contains("German"));
|
||||
assert!(!output.contains("German (preview)"));
|
||||
assert!(output.contains("Deutsch"));
|
||||
assert!(!output.contains("Deutsch (preview)"));
|
||||
assert!(output.contains("de_qwertz"));
|
||||
assert!(!output.contains("qwerty (preview)"));
|
||||
}
|
||||
@@ -5396,7 +5396,7 @@ mod review_tests {
|
||||
assert!(output.contains("default adaptive drill will mix in keys"));
|
||||
assert!(output.contains("focus only on this branch"));
|
||||
assert!(output.contains("from this branch in the Skill Tree."));
|
||||
assert!(output.contains("Proceed? (y/n)"));
|
||||
assert!(output.contains("[y] Unlock"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -5408,7 +5408,7 @@ mod review_tests {
|
||||
let output = render_skill_tree_to_string_with_size(&app, 90, 24);
|
||||
assert!(output.contains("focus only on this branch"));
|
||||
assert!(output.contains("from this branch in the Skill Tree."));
|
||||
assert!(output.contains("Proceed? (y/n)"));
|
||||
assert!(output.contains("[y] Unlock"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user