Skill tree progression system & whitespace support

This commit is contained in:
2026-02-15 07:30:34 +00:00
parent 13550505c1
commit 6d6815af02
22 changed files with 2883 additions and 238 deletions

123
src/generator/capitalize.rs Normal file
View File

@@ -0,0 +1,123 @@
use rand::rngs::SmallRng;
use rand::Rng;
/// Post-processing pass that capitalizes words in generated text.
///
/// Only letters whose uppercase form is listed in `unlocked_capitals` are ever
/// capitalized; if that slice is empty the text is returned unchanged.
///
/// Rules, applied per character:
/// * The first lowercase ASCII letter of a sentence is always capitalized when
///   its capital is unlocked. A sentence starts at the beginning of the text
///   and after `.`, `?` or `!` followed by a single space.
/// * Any other lowercase ASCII letter directly preceded by a space is
///   capitalized with probability 0.12, or 0.40 when its capital equals the
///   `focused` key.
///
/// `rng` is only consulted for the second rule.
pub fn apply_capitalization(
    text: &str,
    unlocked_capitals: &[char],
    focused: Option<char>,
    rng: &mut SmallRng,
) -> String {
    if unlocked_capitals.is_empty() {
        return text.to_string();
    }

    // Only an uppercase focused key can boost capitalization.
    let focused_upper = focused.filter(|ch| ch.is_ascii_uppercase());

    let mut result = String::with_capacity(text.len());
    let mut at_sentence_start = true;
    // Track the previous *character* explicitly. Indexing `text.as_bytes()`
    // with a char index (as an earlier version did) reads the wrong byte as
    // soon as the text contains a multi-byte character.
    let mut prev: Option<char> = None;

    for ch in text.chars() {
        let before = prev;
        prev = Some(ch);

        if ch.is_ascii_lowercase() {
            let upper = ch.to_ascii_uppercase();
            let unlocked = unlocked_capitals.contains(&upper);

            if at_sentence_start {
                if unlocked {
                    result.push(upper);
                    at_sentence_start = false;
                    continue;
                }
            } else if before == Some(' ') && unlocked {
                // Mid-sentence word start: boosted for the focused key.
                let prob = if focused_upper == Some(upper) { 0.40 } else { 0.12 };
                if rng.gen_bool(prob) {
                    result.push(upper);
                    continue;
                }
            }
        }

        if ch == ' ' && matches!(before, Some('.' | '?' | '!')) {
            // Terminator + space: the next word starts a sentence. The flag
            // must survive this iteration, so it is not cleared below.
            at_sentence_start = true;
        } else if !matches!(ch, '.' | '?' | '!') {
            // Any other non-terminator character ends the "sentence start"
            // window; terminators themselves leave the flag untouched.
            at_sentence_start = false;
        }

        result.push(ch);
    }

    result
}
// Unit tests for `apply_capitalization`. Each test seeds its own `SmallRng`.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
// With no unlocked capitals the input must come back unchanged.
#[test]
fn test_no_caps_when_empty() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_capitalization("hello world", &[], None, &mut rng);
assert_eq!(result, "hello world");
}
// The first letter of the text is capitalized when its capital ('H') is unlocked.
#[test]
fn test_capitalizes_first_word() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_capitalization("hello world", &['H', 'W'], None, &mut rng);
assert!(result.starts_with('H'));
}
// A letter whose capital is not unlocked stays lowercase, even at the start of the text.
#[test]
fn test_only_capitalizes_unlocked() {
let mut rng = SmallRng::seed_from_u64(42);
// Only 'W' is unlocked, not 'H'
let result = apply_capitalization("hello world", &['W'], None, &mut rng);
assert!(result.starts_with('h')); // 'H' not unlocked
}
// Checks the first word of the text is capitalized in a two-sentence input.
// NOTE(review): the second assertion accepts both "Two" and "two", so it does not
// actually verify capitalization after ". " — consider tightening it.
#[test]
fn test_after_period() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_capitalization("one. two", &['O', 'T'], None, &mut rng);
assert!(result.starts_with('O'));
assert!(result.contains("Two") || result.contains("two"));
// At minimum, first word should be capitalized
}
// Statistical check over 200 seeds: passing `Some('W')` as the focused key must
// produce more 'W' characters in total than passing `None` with the same seeds.
#[test]
fn test_focused_capital_boosted() {
// With focused 'W', W capitalization should happen more often
let caps = &['H', 'W'];
let mut focused_count = 0;
let mut unfocused_count = 0;
// Run many trials to check statistical boosting
for seed in 0..200 {
let mut rng = SmallRng::seed_from_u64(seed);
let text = "hello world wide web wonder what where who will work";
let result = apply_capitalization(text, caps, Some('W'), &mut rng);
// Count W capitalizations (skip first word which is always capitalized if 'H' is available)
focused_count += result.matches('W').count();
// Same seed, no focus: the baseline for comparison.
let mut rng2 = SmallRng::seed_from_u64(seed);
let result2 = apply_capitalization(text, caps, None, &mut rng2);
unfocused_count += result2.matches('W').count();
}
assert!(
focused_count > unfocused_count,
"Focused W count ({focused_count}) should exceed unfocused ({unfocused_count})"
);
}
}

View File

@@ -0,0 +1,220 @@
use rand::rngs::SmallRng;
use rand::Rng;
/// Post-processing pass that swaps some words for code-like expressions.
///
/// The text is split on single spaces and each token is independently
/// replaced by the output of `generate_code_expr` with probability 0.20,
/// raised to 0.35 when `focused` is one of the `unlocked_symbols`. Tokens
/// are re-joined with single spaces. Only symbols from `unlocked_symbols`
/// are used; when that slice is empty the text is returned unchanged.
pub fn apply_code_symbols(
    text: &str,
    unlocked_symbols: &[char],
    focused: Option<char>,
    rng: &mut SmallRng,
) -> String {
    if unlocked_symbols.is_empty() {
        return text.to_string();
    }

    // A focused key only counts when it is itself an unlocked symbol.
    let focused_symbol = focused.filter(|ch| unlocked_symbols.contains(ch));
    let insert_prob = match focused_symbol {
        Some(_) => 0.35,
        None => 0.20,
    };

    text.split(' ')
        .map(|token| {
            if rng.gen_bool(insert_prob) {
                generate_code_expr(token, unlocked_symbols, focused_symbol, rng)
            } else {
                token.to_string()
            }
        })
        .collect::<Vec<String>>()
        .join(" ")
}
fn generate_code_expr(
word: &str,
symbols: &[char],
focused_symbol: Option<char>,
rng: &mut SmallRng,
) -> String {
// Categorize available symbols
let has = |ch: char| symbols.contains(&ch);
// Try various patterns based on available symbols
let mut patterns: Vec<Box<dyn Fn(&mut SmallRng) -> String>> = Vec::new();
// Track which patterns use the focused symbol for priority selection
let mut focused_patterns: Vec<usize> = Vec::new();
// Arithmetic & Assignment patterns
if has('=') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} = val")));
if focused_symbol == Some('=') { focused_patterns.push(idx); }
}
if has('+') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} + num")));
if focused_symbol == Some('+') { focused_patterns.push(idx); }
}
if has('*') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} * cnt")));
if focused_symbol == Some('*') { focused_patterns.push(idx); }
}
if has('/') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} / max")));
if focused_symbol == Some('/') { focused_patterns.push(idx); }
}
if has('-') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} - one")));
if focused_symbol == Some('-') { focused_patterns.push(idx); }
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("-{w}")));
if focused_symbol == Some('-') { focused_patterns.push(idx); }
}
if has('=') && has('+') {
let w = word.to_string();
patterns.push(Box::new(move |_| format!("{w} += one")));
}
if has('=') && has('-') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} -= one")));
if focused_symbol == Some('-') { focused_patterns.push(idx); }
}
if has('=') && has('=') {
let w = word.to_string();
patterns.push(Box::new(move |_| format!("{w} == nil")));
}
// Grouping patterns
if has('{') && has('}') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{{ {w} }}")));
if matches!(focused_symbol, Some('{') | Some('}')) { focused_patterns.push(idx); }
}
if has('[') && has(']') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w}[idx]")));
if matches!(focused_symbol, Some('[') | Some(']')) { focused_patterns.push(idx); }
}
if has('<') && has('>') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("Vec<{w}>")));
if matches!(focused_symbol, Some('<') | Some('>')) { focused_patterns.push(idx); }
}
// Logic patterns
if has('&') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("&{w}")));
if focused_symbol == Some('&') { focused_patterns.push(idx); }
}
if has('|') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w} | nil")));
if focused_symbol == Some('|') { focused_patterns.push(idx); }
}
if has('!') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("!{w}")));
if focused_symbol == Some('!') { focused_patterns.push(idx); }
}
// Special patterns
if has('@') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("@{w}")));
if focused_symbol == Some('@') { focused_patterns.push(idx); }
}
if has('#') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("#{w}")));
if focused_symbol == Some('#') { focused_patterns.push(idx); }
}
if has('_') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("{w}_val")));
if focused_symbol == Some('_') { focused_patterns.push(idx); }
}
if has('$') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("${w}")));
if focused_symbol == Some('$') { focused_patterns.push(idx); }
}
if has('\\') {
let w = word.to_string();
let idx = patterns.len();
patterns.push(Box::new(move |_| format!("\\{w}")));
if focused_symbol == Some('\\') { focused_patterns.push(idx); }
}
if patterns.is_empty() {
return word.to_string();
}
// 50% chance to prefer a pattern that uses the focused symbol
let idx = if !focused_patterns.is_empty() && rng.gen_bool(0.50) {
focused_patterns[rng.gen_range(0..focused_patterns.len())]
} else {
rng.gen_range(0..patterns.len())
};
patterns[idx](rng)
}
// Unit tests for `apply_code_symbols`. Each test seeds its own `SmallRng`.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
// With no unlocked symbols the input must come back unchanged.
#[test]
fn test_no_symbols_when_empty() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_code_symbols("hello world", &[], None, &mut rng);
assert_eq!(result, "hello world");
}
// Every non-alphanumeric, non-space character in the output must be one of the unlocked symbols.
#[test]
fn test_uses_only_unlocked_symbols() {
let mut rng = SmallRng::seed_from_u64(42);
let symbols = ['=', '+'];
let text = "a b c d e f g h i j";
let result = apply_code_symbols(text, &symbols, None, &mut rng);
for ch in result.chars() {
if !ch.is_alphanumeric() && ch != ' ' {
assert!(
symbols.contains(&ch),
"Unexpected symbol '{ch}' in: {result}"
);
}
}
}
// With '-' unlocked, a 20-word input is expected to yield at least one dash for seed 42.
#[test]
fn test_dash_patterns_generated() {
let mut rng = SmallRng::seed_from_u64(42);
let symbols = ['-', '='];
let text = "a b c d e f g h i j k l m n o p q r s t";
let result = apply_code_symbols(text, &symbols, None, &mut rng);
assert!(result.contains('-'), "Expected dash in: {result}");
}
}

View File

@@ -245,11 +245,11 @@ impl TextGenerator for CodeSyntaxGenerator {
result.push(snippet.to_string());
}
result.join(" ")
result.join("\n\n")
}
}
/// Extract function-length snippets from raw source code
/// Extract function-length snippets from raw source code, preserving whitespace.
fn extract_code_snippets(source: &str) -> Vec<String> {
let mut snippets = Vec::new();
let lines: Vec<&str> = source.lines().collect();
@@ -285,11 +285,11 @@ fn extract_code_snippets(source: &str) -> Vec<String> {
}
if snippet_lines.len() >= 3 && snippet_lines.len() <= 30 {
let snippet = snippet_lines.join(" ");
// Normalize whitespace
let normalized: String = snippet.split_whitespace().collect::<Vec<_>>().join(" ");
if normalized.len() >= 20 && normalized.len() <= 500 {
snippets.push(normalized);
// Preserve original newlines and indentation
let snippet = snippet_lines.join("\n");
let char_count = snippet.chars().filter(|c| !c.is_whitespace()).count();
if char_count >= 20 && snippet.len() <= 800 {
snippets.push(snippet);
}
}

View File

@@ -1,9 +1,13 @@
pub mod cache;
pub mod capitalize;
pub mod code_patterns;
pub mod code_syntax;
pub mod dictionary;
pub mod github_code;
pub mod numbers;
pub mod passage;
pub mod phonetic;
pub mod punctuate;
pub mod transition_table;
use crate::engine::filter::CharFilter;

132
src/generator/numbers.rs Normal file
View File

@@ -0,0 +1,132 @@
use rand::rngs::SmallRng;
use rand::Rng;
/// Post-processing pass that replaces some words with number expressions.
///
/// The text is split on single spaces and each token is independently
/// replaced by the output of `generate_number_expr` with probability 0.15,
/// raised to 0.30 when `focused` is an unlocked digit. Tokens are re-joined
/// with single spaces.
///
/// Only digits from `unlocked_digits` are emitted; when that slice is empty
/// the text is returned unchanged. `has_dot` enables the version-like
/// pattern that contains a `.`.
pub fn apply_numbers(
    text: &str,
    unlocked_digits: &[char],
    has_dot: bool,
    focused: Option<char>,
    rng: &mut SmallRng,
) -> String {
    if unlocked_digits.is_empty() {
        return text.to_string();
    }

    // The focused key is only honoured when it is a digit that is actually
    // unlocked. Otherwise `random_number` could emit a digit outside
    // `unlocked_digits`, breaking this function's contract.
    let focused_digit =
        focused.filter(|ch| ch.is_ascii_digit() && unlocked_digits.contains(ch));
    let base_prob = if focused_digit.is_some() { 0.30 } else { 0.15 };

    let words: Vec<&str> = text.split(' ').collect();
    let mut result = Vec::with_capacity(words.len());
    for word in &words {
        if rng.gen_bool(base_prob) {
            result.push(generate_number_expr(unlocked_digits, has_dot, focused_digit, rng));
        } else {
            result.push(word.to_string());
        }
    }
    result.join(" ")
}
/// Builds one random number expression from `digits`.
///
/// Picks uniformly among four shapes — a bare 1–3 digit number, a
/// "<number> <unit>" measurement, a 4-digit year-like number, or a
/// "<label> <number>" identifier — plus a fifth "<major>.<minor>" shape that
/// is only offered when `has_dot` is true. `focused_digit` is forwarded to
/// `random_number` for every numeric part.
fn generate_number_expr(
    digits: &[char],
    has_dot: bool,
    focused_digit: Option<char>,
    rng: &mut SmallRng,
) -> String {
    const UNITS: [&str; 6] = ["items", "miles", "days", "lines", "times", "parts"];
    const LABELS: [&str; 6] = ["room", "page", "step", "item", "line", "port"];

    // The version shape contains a dot, so it is excluded without `has_dot`.
    let shape_count = if has_dot { 5 } else { 4 };

    match rng.gen_range(0..shape_count) {
        // Simple count: "3" or "42"
        0 => random_number(digits, 1, 3, focused_digit, rng),
        // Measurement: "7 miles" or "42 items"
        1 => {
            let amount = random_number(digits, 1, 2, focused_digit, rng);
            let unit = UNITS[rng.gen_range(0..UNITS.len())];
            format!("{amount} {unit}")
        }
        // Year-like: "2024"
        2 => random_number(digits, 4, 4, focused_digit, rng),
        // ID: "room 42" or "page 7"
        3 => {
            let label = LABELS[rng.gen_range(0..LABELS.len())];
            let id = random_number(digits, 1, 3, focused_digit, rng);
            format!("{label} {id}")
        }
        // Version-like: "3.14" or "2.0" (only reachable when the dot is available)
        _ => {
            let major = random_number(digits, 1, 1, focused_digit, rng);
            let minor = random_number(digits, 1, 2, focused_digit, rng);
            format!("{major}.{minor}")
        }
    }
}
/// Produces a string of random digits whose length is drawn uniformly from
/// `min_len..=max_len`.
///
/// Each position is `focused_digit` with probability 0.40 (when one is
/// given) and otherwise a uniformly chosen element of `digits`.
/// Indexes into `digits`, so the slice must not be empty.
fn random_number(
    digits: &[char],
    min_len: usize,
    max_len: usize,
    focused_digit: Option<char>,
    rng: &mut SmallRng,
) -> String {
    let len = rng.gen_range(min_len..=max_len);
    let mut number = String::with_capacity(len);
    for _ in 0..len {
        let digit = match focused_digit {
            // The guard draws from the RNG only when a focused digit exists.
            Some(fd) if rng.gen_bool(0.40) => fd,
            _ => digits[rng.gen_range(0..digits.len())],
        };
        number.push(digit);
    }
    number
}
// Unit tests for `apply_numbers`. Each test seeds its own `SmallRng`.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
// With no unlocked digits the input must come back unchanged.
#[test]
fn test_no_numbers_when_empty() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_numbers("hello world", &[], false, None, &mut rng);
assert_eq!(result, "hello world");
}
// Every ASCII digit in the output must come from the unlocked set.
// NOTE(review): `focused` is `None` here, so the focused-digit path is not exercised.
#[test]
fn test_numbers_use_only_unlocked_digits() {
let mut rng = SmallRng::seed_from_u64(42);
let digits = ['1', '2', '3'];
let text = "a b c d e f g h i j k l m n o p q r s t";
let result = apply_numbers(text, &digits, false, None, &mut rng);
for ch in result.chars() {
if ch.is_ascii_digit() {
assert!(digits.contains(&ch), "Unexpected digit {ch} in: {result}");
}
}
}
// With `has_dot = false` the output must not contain a '.'.
#[test]
fn test_no_dot_without_punctuation() {
let mut rng = SmallRng::seed_from_u64(42);
let digits = ['1', '2', '3', '4', '5'];
let text = "a b c d e f g h i j k l m n o p q r s t";
let result = apply_numbers(text, &digits, false, None, &mut rng);
assert!(!result.contains('.'), "Should not contain dot when has_dot=false: {result}");
}
}

213
src/generator/punctuate.rs Normal file
View File

@@ -0,0 +1,213 @@
use rand::rngs::SmallRng;
use rand::Rng;
/// Post-processing pass that inserts punctuation into generated text.
///
/// The text is split on single spaces and each word may be decorated with
/// punctuation drawn only from `unlocked_punct`; when that slice is empty the
/// text is returned unchanged. If `focused` is one of the unlocked
/// punctuation characters, the probability of inserting that character is
/// raised.
///
/// Per word, in order: optional contraction (`'`), trailing dash (`-`),
/// sentence terminator (`?`, `!` or `.`) once enough words have passed, or
/// otherwise comma / semicolon / colon, then an optional opening quote and
/// opening parenthesis. After the loop the text is terminated with `.` (when
/// unlocked), an odd number of quotes is balanced with one closing quote, and
/// every unclosed `(` receives a matching `)`.
pub fn apply_punctuation(
    text: &str,
    unlocked_punct: &[char],
    focused: Option<char>,
    rng: &mut SmallRng,
) -> String {
    if unlocked_punct.is_empty() {
        return text.to_string();
    }

    // If focused key is a punctuation char in our set, boost its insertion probability
    let focused_punct = focused.filter(|ch| unlocked_punct.contains(ch));

    // `split` always yields at least one item, so `words` is never empty and
    // `words.len() - 1` below cannot underflow.
    let words: Vec<&str> = text.split(' ').collect();

    let has_period = unlocked_punct.contains(&'.');
    let has_comma = unlocked_punct.contains(&',');
    let has_apostrophe = unlocked_punct.contains(&'\'');
    let has_semicolon = unlocked_punct.contains(&';');
    let has_colon = unlocked_punct.contains(&':');
    let has_quote = unlocked_punct.contains(&'"');
    let has_dash = unlocked_punct.contains(&'-');
    let has_question = unlocked_punct.contains(&'?');
    let has_exclaim = unlocked_punct.contains(&'!');
    let has_open_paren = unlocked_punct.contains(&'(');
    let has_close_paren = unlocked_punct.contains(&')');

    let mut result: Vec<String> = Vec::with_capacity(words.len());
    let mut words_since_period = 0;
    let mut words_since_comma = 0;

    for (i, word) in words.iter().enumerate() {
        let mut w = word.to_string();

        // Contractions (~8% of words, boosted if apostrophe is focused)
        let apostrophe_prob = if focused_punct == Some('\'') { 0.30 } else { 0.08 };
        if has_apostrophe && w.len() >= 3 && rng.gen_bool(apostrophe_prob) {
            w = make_contraction(&w, rng);
        }

        // Trailing dash (~5% of words, boosted if dash is focused); never on the last word
        let dash_prob = if focused_punct == Some('-') { 0.25 } else { 0.05 };
        if has_dash && i + 1 < words.len() && rng.gen_bool(dash_prob) {
            w.push('-');
        }

        // Sentence ending punctuation: 15% chance per word from the 8th word
        // on, forced at the 12th.
        words_since_period += 1;
        let end_sentence = words_since_period >= 8 && rng.gen_bool(0.15)
            || words_since_period >= 12;

        if end_sentence && i < words.len() - 1 {
            let q_prob = if focused_punct == Some('?') { 0.40 } else { 0.15 };
            let excl_prob = if focused_punct == Some('!') { 0.40 } else { 0.10 };
            if has_question && rng.gen_bool(q_prob) {
                w.push('?');
            } else if has_exclaim && rng.gen_bool(excl_prob) {
                w.push('!');
            } else if has_period {
                w.push('.');
            }
            words_since_period = 0;
            words_since_comma = 0;
        } else {
            // Comma after clause (at the earliest every 4 words)
            words_since_comma += 1;
            let comma_prob = if focused_punct == Some(',') { 0.40 } else { 0.20 };
            if has_comma && words_since_comma >= 4 && rng.gen_bool(comma_prob) && i < words.len() - 1 {
                w.push(',');
                words_since_comma = 0;
            }

            // Semicolon between clauses (rare, boosted if focused)
            let semi_prob = if focused_punct == Some(';') { 0.25 } else { 0.05 };
            if has_semicolon && words_since_comma >= 5 && rng.gen_bool(semi_prob) && i < words.len() - 1 {
                w.push(';');
                words_since_comma = 0;
            }

            // Colon before list-like content (rare, boosted if focused)
            let colon_prob = if focused_punct == Some(':') { 0.20 } else { 0.03 };
            if has_colon && rng.gen_bool(colon_prob) && i < words.len() - 1 {
                w.push(':');
            }
        }

        // Quoted phrases (~4% chance to start a quote, boosted if focused)
        let quote_prob = if focused_punct == Some('"') { 0.20 } else { 0.04 };
        if has_quote && rng.gen_bool(quote_prob) && i + 2 < words.len() {
            w = format!("\"{w}");
        }

        // Parenthetical asides (rare, boosted if focused); both parens must be unlocked
        let paren_prob = if matches!(focused_punct, Some('(' | ')')) { 0.15 } else { 0.03 };
        if has_open_paren && has_close_paren && rng.gen_bool(paren_prob) && i + 2 < words.len() {
            w = format!("({w}");
        }

        result.push(w);
    }

    // End with period if we have it
    if has_period {
        if let Some(last) = result.last_mut() {
            let last_char = last.chars().last();
            if !matches!(last_char, Some('.' | '?' | '!' | '"' | ')')) {
                last.push('.');
            }
        }
    }

    // Count quotes and unbalanced parentheses across the whole output.
    let mut open_quotes = 0i32;
    let mut open_parens = 0i32;
    for ch in result.iter().flat_map(|w| w.chars()) {
        match ch {
            '"' => open_quotes += 1,
            '(' => open_parens += 1,
            ')' => open_parens -= 1,
            _ => {}
        }
    }

    let close_quote = has_quote && open_quotes % 2 != 0;
    // Several asides may have been opened above; close every one of them so
    // the output is balanced (previously only a single ')' was appended).
    let parens_to_close = if has_close_paren { open_parens.max(0) } else { 0 };

    if close_quote || parens_to_close > 0 {
        if let Some(last) = result.last_mut() {
            // Closers go before a trailing period: `word").` rather than `word.")`.
            let had_period = last.ends_with('.');
            if had_period {
                last.pop();
            }
            if close_quote {
                last.push('"');
            }
            for _ in 0..parens_to_close {
                last.push(')');
            }
            if had_period {
                last.push('.');
            }
        }
    }

    result.join(" ")
}
/// Adds an apostrophe form to `word`.
///
/// If `word` exactly equals one of the base words in the table below, the
/// associated suffix is appended to the word itself and returned without
/// consulting `rng`. Any other word gets `'s` appended with probability 0.5
/// and is otherwise returned unchanged.
///
/// NOTE(review): for table hits the result is the base word plus its suffix
/// (e.g. "not" becomes "notn't", "will" becomes "will'll"), which does not
/// look like an intended contraction — confirm the desired output.
fn make_contraction(word: &str, rng: &mut SmallRng) -> String {
    // (base word, contraction suffix)
    const KNOWN: [(&str, &str); 6] = [
        ("not", "n't"),
        ("will", "'ll"),
        ("would", "'d"),
        ("have", "'ve"),
        ("are", "'re"),
        ("is", "'s"),
    ];

    let known_suffix = KNOWN
        .iter()
        .find(|entry| entry.0 == word)
        .map(|entry| entry.1);

    match known_suffix {
        Some(suffix) => format!("{word}{suffix}"),
        // Generic fallback: coin flip between a possessive-style 's and no change.
        None if rng.gen_bool(0.5) => format!("{word}'s"),
        None => word.to_string(),
    }
}
// Unit tests for `apply_punctuation`. Each test seeds its own `SmallRng`.
#[cfg(test)]
mod tests {
use super::*;
use rand::SeedableRng;
// With no unlocked punctuation the input must come back unchanged.
#[test]
fn test_no_punct_when_empty() {
let mut rng = SmallRng::seed_from_u64(42);
let result = apply_punctuation("hello world", &[], None, &mut rng);
assert_eq!(result, "hello world");
}
// With only '.' unlocked, the output must end with a period.
#[test]
fn test_adds_period_at_end() {
let mut rng = SmallRng::seed_from_u64(42);
let text = "one two three four five six seven eight nine ten";
let result = apply_punctuation(text, &['.'], None, &mut rng);
assert!(result.ends_with('.'));
}
// A 20-word input with '.' and ',' unlocked must contain at least one period.
// NOTE(review): the count includes the final trailing period, so this passes even
// when no period is inserted mid-text — consider excluding the last character.
#[test]
fn test_period_appears_mid_text() {
let mut rng = SmallRng::seed_from_u64(42);
let words: Vec<&str> = (0..20).map(|_| "word").collect();
let text = words.join(" ");
let result = apply_punctuation(&text, &['.', ','], None, &mut rng);
// Should have at least one period somewhere in the middle
let period_count = result.chars().filter(|&c| c == '.').count();
assert!(period_count >= 1, "Expected periods in: {result}");
}
}