Skill tree progression system & whitespace support
This commit is contained in:
123
src/generator/capitalize.rs
Normal file
123
src/generator/capitalize.rs
Normal file
@@ -0,0 +1,123 @@
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
/// Post-processing pass that capitalizes words in generated text.
|
||||
/// Only capitalizes using letters from `unlocked_capitals`.
|
||||
pub fn apply_capitalization(
|
||||
text: &str,
|
||||
unlocked_capitals: &[char],
|
||||
focused: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
if unlocked_capitals.is_empty() {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// If focused key is an uppercase letter, boost its probability
|
||||
let focused_upper = focused.filter(|ch| ch.is_ascii_uppercase());
|
||||
|
||||
let mut result = String::with_capacity(text.len());
|
||||
let mut at_sentence_start = true;
|
||||
|
||||
for (i, ch) in text.chars().enumerate() {
|
||||
if at_sentence_start && ch.is_ascii_lowercase() {
|
||||
let upper = ch.to_ascii_uppercase();
|
||||
if unlocked_capitals.contains(&upper) {
|
||||
result.push(upper);
|
||||
at_sentence_start = false;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// After period/question/exclamation + space, next word starts a sentence
|
||||
if ch == ' ' && i > 0 {
|
||||
let prev = text.as_bytes().get(i - 1).map(|&b| b as char);
|
||||
if matches!(prev, Some('.' | '?' | '!')) {
|
||||
at_sentence_start = true;
|
||||
}
|
||||
}
|
||||
|
||||
// Capitalize word starts: boosted for focused key, ~12% for others
|
||||
if ch.is_ascii_lowercase() && !at_sentence_start {
|
||||
let is_word_start = i == 0 || text.as_bytes().get(i - 1).map(|&b| b as char) == Some(' ');
|
||||
if is_word_start {
|
||||
let upper = ch.to_ascii_uppercase();
|
||||
if unlocked_capitals.contains(&upper) {
|
||||
let prob = if focused_upper == Some(upper) { 0.40 } else { 0.12 };
|
||||
if rng.gen_bool(prob) {
|
||||
result.push(upper);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ch != '.' && ch != '?' && ch != '!' {
|
||||
at_sentence_start = false;
|
||||
}
|
||||
|
||||
result.push(ch);
|
||||
}
|
||||
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    /// Builds a deterministic RNG so every test run sees the same sequence.
    fn seeded(seed: u64) -> SmallRng {
        SmallRng::seed_from_u64(seed)
    }

    #[test]
    fn test_no_caps_when_empty() {
        let output = apply_capitalization("hello world", &[], None, &mut seeded(42));
        assert_eq!(output, "hello world");
    }

    #[test]
    fn test_capitalizes_first_word() {
        let output = apply_capitalization("hello world", &['H', 'W'], None, &mut seeded(42));
        assert!(output.starts_with('H'));
    }

    #[test]
    fn test_only_capitalizes_unlocked() {
        // 'W' is unlocked but 'H' is not, so the leading letter must stay lowercase.
        let output = apply_capitalization("hello world", &['W'], None, &mut seeded(42));
        assert!(output.starts_with('h'));
    }

    #[test]
    fn test_after_period() {
        let output = apply_capitalization("one. two", &['O', 'T'], None, &mut seeded(42));
        // The first word is always capitalized; the second may or may not be.
        assert!(output.starts_with('O'));
        assert!(output.contains("Two") || output.contains("two"));
    }

    #[test]
    fn test_focused_capital_boosted() {
        let caps = ['H', 'W'];
        let text = "hello world wide web wonder what where who will work";

        // Total number of capital W's produced over 200 seeds for a given focus.
        let total_w = |focus: Option<char>| -> usize {
            (0..200u64)
                .map(|seed| {
                    apply_capitalization(text, &caps, focus, &mut seeded(seed))
                        .matches('W')
                        .count()
                })
                .sum()
        };

        let focused_count = total_w(Some('W'));
        let unfocused_count = total_w(None);

        assert!(
            focused_count > unfocused_count,
            "Focused W count ({focused_count}) should exceed unfocused ({unfocused_count})"
        );
    }
}
|
||||
220
src/generator/code_patterns.rs
Normal file
220
src/generator/code_patterns.rs
Normal file
@@ -0,0 +1,220 @@
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
/// Post-processing pass that inserts code-like expressions into text.
|
||||
/// Only uses symbols from `unlocked_symbols`.
|
||||
pub fn apply_code_symbols(
|
||||
text: &str,
|
||||
unlocked_symbols: &[char],
|
||||
focused: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
if unlocked_symbols.is_empty() {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// If focused key is a code symbol, boost insertion probability
|
||||
let focused_symbol = focused.filter(|ch| unlocked_symbols.contains(ch));
|
||||
let base_prob = if focused_symbol.is_some() { 0.35 } else { 0.20 };
|
||||
|
||||
let words: Vec<&str> = text.split(' ').collect();
|
||||
let mut result = Vec::new();
|
||||
|
||||
for word in &words {
|
||||
if rng.gen_bool(base_prob) {
|
||||
let expr = generate_code_expr(word, unlocked_symbols, focused_symbol, rng);
|
||||
result.push(expr);
|
||||
} else {
|
||||
result.push(word.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
result.join(" ")
|
||||
}
|
||||
|
||||
fn generate_code_expr(
|
||||
word: &str,
|
||||
symbols: &[char],
|
||||
focused_symbol: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
// Categorize available symbols
|
||||
let has = |ch: char| symbols.contains(&ch);
|
||||
|
||||
// Try various patterns based on available symbols
|
||||
let mut patterns: Vec<Box<dyn Fn(&mut SmallRng) -> String>> = Vec::new();
|
||||
// Track which patterns use the focused symbol for priority selection
|
||||
let mut focused_patterns: Vec<usize> = Vec::new();
|
||||
|
||||
// Arithmetic & Assignment patterns
|
||||
if has('=') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} = val")));
|
||||
if focused_symbol == Some('=') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('+') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} + num")));
|
||||
if focused_symbol == Some('+') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('*') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} * cnt")));
|
||||
if focused_symbol == Some('*') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('/') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} / max")));
|
||||
if focused_symbol == Some('/') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('-') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} - one")));
|
||||
if focused_symbol == Some('-') { focused_patterns.push(idx); }
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("-{w}")));
|
||||
if focused_symbol == Some('-') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('=') && has('+') {
|
||||
let w = word.to_string();
|
||||
patterns.push(Box::new(move |_| format!("{w} += one")));
|
||||
}
|
||||
if has('=') && has('-') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} -= one")));
|
||||
if focused_symbol == Some('-') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('=') && has('=') {
|
||||
let w = word.to_string();
|
||||
patterns.push(Box::new(move |_| format!("{w} == nil")));
|
||||
}
|
||||
|
||||
// Grouping patterns
|
||||
if has('{') && has('}') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{{ {w} }}")));
|
||||
if matches!(focused_symbol, Some('{') | Some('}')) { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('[') && has(']') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w}[idx]")));
|
||||
if matches!(focused_symbol, Some('[') | Some(']')) { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('<') && has('>') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("Vec<{w}>")));
|
||||
if matches!(focused_symbol, Some('<') | Some('>')) { focused_patterns.push(idx); }
|
||||
}
|
||||
|
||||
// Logic patterns
|
||||
if has('&') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("&{w}")));
|
||||
if focused_symbol == Some('&') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('|') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w} | nil")));
|
||||
if focused_symbol == Some('|') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('!') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("!{w}")));
|
||||
if focused_symbol == Some('!') { focused_patterns.push(idx); }
|
||||
}
|
||||
|
||||
// Special patterns
|
||||
if has('@') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("@{w}")));
|
||||
if focused_symbol == Some('@') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('#') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("#{w}")));
|
||||
if focused_symbol == Some('#') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('_') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("{w}_val")));
|
||||
if focused_symbol == Some('_') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('$') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("${w}")));
|
||||
if focused_symbol == Some('$') { focused_patterns.push(idx); }
|
||||
}
|
||||
if has('\\') {
|
||||
let w = word.to_string();
|
||||
let idx = patterns.len();
|
||||
patterns.push(Box::new(move |_| format!("\\{w}")));
|
||||
if focused_symbol == Some('\\') { focused_patterns.push(idx); }
|
||||
}
|
||||
|
||||
if patterns.is_empty() {
|
||||
return word.to_string();
|
||||
}
|
||||
|
||||
// 50% chance to prefer a pattern that uses the focused symbol
|
||||
let idx = if !focused_patterns.is_empty() && rng.gen_bool(0.50) {
|
||||
focused_patterns[rng.gen_range(0..focused_patterns.len())]
|
||||
} else {
|
||||
rng.gen_range(0..patterns.len())
|
||||
};
|
||||
patterns[idx](rng)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    /// Fixed-seed RNG used by every test in this module.
    fn seeded() -> SmallRng {
        SmallRng::seed_from_u64(42)
    }

    #[test]
    fn test_no_symbols_when_empty() {
        let output = apply_code_symbols("hello world", &[], None, &mut seeded());
        assert_eq!(output, "hello world");
    }

    #[test]
    fn test_uses_only_unlocked_symbols() {
        let symbols = ['=', '+'];
        let result = apply_code_symbols("a b c d e f g h i j", &symbols, None, &mut seeded());

        // Every character that is neither alphanumeric nor a space must be unlocked.
        for ch in result.chars().filter(|c| !c.is_alphanumeric() && *c != ' ') {
            assert!(
                symbols.contains(&ch),
                "Unexpected symbol '{ch}' in: {result}"
            );
        }
    }

    #[test]
    fn test_dash_patterns_generated() {
        let symbols = ['-', '='];
        let result = apply_code_symbols(
            "a b c d e f g h i j k l m n o p q r s t",
            &symbols,
            None,
            &mut seeded(),
        );
        assert!(result.contains('-'), "Expected dash in: {result}");
    }
}
|
||||
@@ -245,11 +245,11 @@ impl TextGenerator for CodeSyntaxGenerator {
|
||||
result.push(snippet.to_string());
|
||||
}
|
||||
|
||||
result.join(" ")
|
||||
result.join("\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract function-length snippets from raw source code
|
||||
/// Extract function-length snippets from raw source code, preserving whitespace.
|
||||
fn extract_code_snippets(source: &str) -> Vec<String> {
|
||||
let mut snippets = Vec::new();
|
||||
let lines: Vec<&str> = source.lines().collect();
|
||||
@@ -285,11 +285,11 @@ fn extract_code_snippets(source: &str) -> Vec<String> {
|
||||
}
|
||||
|
||||
if snippet_lines.len() >= 3 && snippet_lines.len() <= 30 {
|
||||
let snippet = snippet_lines.join(" ");
|
||||
// Normalize whitespace
|
||||
let normalized: String = snippet.split_whitespace().collect::<Vec<_>>().join(" ");
|
||||
if normalized.len() >= 20 && normalized.len() <= 500 {
|
||||
snippets.push(normalized);
|
||||
// Preserve original newlines and indentation
|
||||
let snippet = snippet_lines.join("\n");
|
||||
let char_count = snippet.chars().filter(|c| !c.is_whitespace()).count();
|
||||
if char_count >= 20 && snippet.len() <= 800 {
|
||||
snippets.push(snippet);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
pub mod cache;
|
||||
pub mod capitalize;
|
||||
pub mod code_patterns;
|
||||
pub mod code_syntax;
|
||||
pub mod dictionary;
|
||||
pub mod github_code;
|
||||
pub mod numbers;
|
||||
pub mod passage;
|
||||
pub mod phonetic;
|
||||
pub mod punctuate;
|
||||
pub mod transition_table;
|
||||
|
||||
use crate::engine::filter::CharFilter;
|
||||
|
||||
132
src/generator/numbers.rs
Normal file
132
src/generator/numbers.rs
Normal file
@@ -0,0 +1,132 @@
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
/// Post-processing pass that inserts number expressions into text.
|
||||
/// Only uses digits from `unlocked_digits`.
|
||||
pub fn apply_numbers(
|
||||
text: &str,
|
||||
unlocked_digits: &[char],
|
||||
has_dot: bool,
|
||||
focused: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
if unlocked_digits.is_empty() {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// If focused key is a digit, boost number insertion probability
|
||||
let focused_digit = focused.filter(|ch| ch.is_ascii_digit());
|
||||
let base_prob = if focused_digit.is_some() { 0.30 } else { 0.15 };
|
||||
|
||||
let words: Vec<&str> = text.split(' ').collect();
|
||||
let mut result = Vec::new();
|
||||
|
||||
for word in &words {
|
||||
if rng.gen_bool(base_prob) {
|
||||
let expr = generate_number_expr(unlocked_digits, has_dot, focused_digit, rng);
|
||||
result.push(expr);
|
||||
} else {
|
||||
result.push(word.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
result.join(" ")
|
||||
}
|
||||
|
||||
fn generate_number_expr(
|
||||
digits: &[char],
|
||||
has_dot: bool,
|
||||
focused_digit: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
// Determine how many patterns are available (version pattern needs dot)
|
||||
let max_pattern = if has_dot { 5 } else { 4 };
|
||||
let pattern = rng.gen_range(0..max_pattern);
|
||||
let num = match pattern {
|
||||
0 => {
|
||||
// Simple count: "3" or "42"
|
||||
random_number(digits, 1, 3, focused_digit, rng)
|
||||
}
|
||||
1 => {
|
||||
// Measurement: "7 miles" or "42 items"
|
||||
let num = random_number(digits, 1, 2, focused_digit, rng);
|
||||
let units = ["items", "miles", "days", "lines", "times", "parts"];
|
||||
let unit = units[rng.gen_range(0..units.len())];
|
||||
return format!("{num} {unit}");
|
||||
}
|
||||
2 => {
|
||||
// Year-like: "2024"
|
||||
random_number(digits, 4, 4, focused_digit, rng)
|
||||
}
|
||||
3 => {
|
||||
// ID: "room 42" or "page 7"
|
||||
let prefixes = ["room", "page", "step", "item", "line", "port"];
|
||||
let prefix = prefixes[rng.gen_range(0..prefixes.len())];
|
||||
let num = random_number(digits, 1, 3, focused_digit, rng);
|
||||
return format!("{prefix} {num}");
|
||||
}
|
||||
_ => {
|
||||
// Version-like: "3.14" or "2.0" (only when dot is available)
|
||||
let major = random_number(digits, 1, 1, focused_digit, rng);
|
||||
let minor = random_number(digits, 1, 2, focused_digit, rng);
|
||||
return format!("{major}.{minor}");
|
||||
}
|
||||
};
|
||||
num
|
||||
}
|
||||
|
||||
fn random_number(
|
||||
digits: &[char],
|
||||
min_len: usize,
|
||||
max_len: usize,
|
||||
focused_digit: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
let len = rng.gen_range(min_len..=max_len);
|
||||
(0..len)
|
||||
.map(|_| {
|
||||
// 40% chance to use the focused digit if it's a digit
|
||||
if let Some(fd) = focused_digit {
|
||||
if rng.gen_bool(0.40) {
|
||||
return fd;
|
||||
}
|
||||
}
|
||||
digits[rng.gen_range(0..digits.len())]
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    /// Twenty single-letter words, enough for several insertions to happen.
    const LETTERS: &str = "a b c d e f g h i j k l m n o p q r s t";

    /// Fixed-seed RNG used by every test in this module.
    fn seeded() -> SmallRng {
        SmallRng::seed_from_u64(42)
    }

    #[test]
    fn test_no_numbers_when_empty() {
        let output = apply_numbers("hello world", &[], false, None, &mut seeded());
        assert_eq!(output, "hello world");
    }

    #[test]
    fn test_numbers_use_only_unlocked_digits() {
        let digits = ['1', '2', '3'];
        let result = apply_numbers(LETTERS, &digits, false, None, &mut seeded());

        for ch in result.chars().filter(char::is_ascii_digit) {
            assert!(digits.contains(&ch), "Unexpected digit {ch} in: {result}");
        }
    }

    #[test]
    fn test_no_dot_without_punctuation() {
        let digits = ['1', '2', '3', '4', '5'];
        let result = apply_numbers(LETTERS, &digits, false, None, &mut seeded());
        assert!(!result.contains('.'), "Should not contain dot when has_dot=false: {result}");
    }
}
|
||||
213
src/generator/punctuate.rs
Normal file
213
src/generator/punctuate.rs
Normal file
@@ -0,0 +1,213 @@
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
/// Post-processing pass that inserts punctuation into generated text.
|
||||
/// Only uses punctuation chars from `unlocked_punct`.
|
||||
pub fn apply_punctuation(
|
||||
text: &str,
|
||||
unlocked_punct: &[char],
|
||||
focused: Option<char>,
|
||||
rng: &mut SmallRng,
|
||||
) -> String {
|
||||
if unlocked_punct.is_empty() {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
// If focused key is a punctuation char in our set, boost its insertion probability
|
||||
let focused_punct = focused.filter(|ch| unlocked_punct.contains(ch));
|
||||
|
||||
let words: Vec<&str> = text.split(' ').collect();
|
||||
if words.is_empty() {
|
||||
return text.to_string();
|
||||
}
|
||||
|
||||
let has_period = unlocked_punct.contains(&'.');
|
||||
let has_comma = unlocked_punct.contains(&',');
|
||||
let has_apostrophe = unlocked_punct.contains(&'\'');
|
||||
let has_semicolon = unlocked_punct.contains(&';');
|
||||
let has_colon = unlocked_punct.contains(&':');
|
||||
let has_quote = unlocked_punct.contains(&'"');
|
||||
let has_dash = unlocked_punct.contains(&'-');
|
||||
let has_question = unlocked_punct.contains(&'?');
|
||||
let has_exclaim = unlocked_punct.contains(&'!');
|
||||
let has_open_paren = unlocked_punct.contains(&'(');
|
||||
let has_close_paren = unlocked_punct.contains(&')');
|
||||
|
||||
let mut result = Vec::new();
|
||||
let mut words_since_period = 0;
|
||||
let mut words_since_comma = 0;
|
||||
|
||||
for (i, word) in words.iter().enumerate() {
|
||||
let mut w = word.to_string();
|
||||
|
||||
// Contractions (~8% of words, boosted if apostrophe is focused)
|
||||
let apostrophe_prob = if focused_punct == Some('\'') { 0.30 } else { 0.08 };
|
||||
if has_apostrophe && w.len() >= 3 && rng.gen_bool(apostrophe_prob) {
|
||||
w = make_contraction(&w, rng);
|
||||
}
|
||||
|
||||
// Compound words with dash (~5% of words, boosted if dash is focused)
|
||||
let dash_prob = if focused_punct == Some('-') { 0.25 } else { 0.05 };
|
||||
if has_dash && i + 1 < words.len() && rng.gen_bool(dash_prob) {
|
||||
w.push('-');
|
||||
}
|
||||
|
||||
// Sentence ending punctuation
|
||||
words_since_period += 1;
|
||||
let end_sentence = words_since_period >= 8 && rng.gen_bool(0.15)
|
||||
|| words_since_period >= 12;
|
||||
|
||||
if end_sentence && i < words.len() - 1 {
|
||||
let q_prob = if focused_punct == Some('?') { 0.40 } else { 0.15 };
|
||||
let excl_prob = if focused_punct == Some('!') { 0.40 } else { 0.10 };
|
||||
if has_question && rng.gen_bool(q_prob) {
|
||||
w.push('?');
|
||||
} else if has_exclaim && rng.gen_bool(excl_prob) {
|
||||
w.push('!');
|
||||
} else if has_period {
|
||||
w.push('.');
|
||||
}
|
||||
words_since_period = 0;
|
||||
words_since_comma = 0;
|
||||
} else {
|
||||
// Comma after clause (~every 4-6 words)
|
||||
words_since_comma += 1;
|
||||
let comma_prob = if focused_punct == Some(',') { 0.40 } else { 0.20 };
|
||||
if has_comma && words_since_comma >= 4 && rng.gen_bool(comma_prob) && i < words.len() - 1 {
|
||||
w.push(',');
|
||||
words_since_comma = 0;
|
||||
}
|
||||
|
||||
// Semicolon between clauses (rare, boosted if focused)
|
||||
let semi_prob = if focused_punct == Some(';') { 0.25 } else { 0.05 };
|
||||
if has_semicolon && words_since_comma >= 5 && rng.gen_bool(semi_prob) && i < words.len() - 1 {
|
||||
w.push(';');
|
||||
words_since_comma = 0;
|
||||
}
|
||||
|
||||
// Colon before list-like content (rare, boosted if focused)
|
||||
let colon_prob = if focused_punct == Some(':') { 0.20 } else { 0.03 };
|
||||
if has_colon && rng.gen_bool(colon_prob) && i < words.len() - 1 {
|
||||
w.push(':');
|
||||
}
|
||||
}
|
||||
|
||||
// Quoted phrases (~5% chance to start a quote, boosted if focused)
|
||||
let quote_prob = if focused_punct == Some('"') { 0.20 } else { 0.04 };
|
||||
if has_quote && rng.gen_bool(quote_prob) && i + 2 < words.len() {
|
||||
w = format!("\"{w}");
|
||||
}
|
||||
|
||||
// Parenthetical asides (rare, boosted if focused)
|
||||
let paren_prob = if matches!(focused_punct, Some('(' | ')')) { 0.15 } else { 0.03 };
|
||||
if has_open_paren && has_close_paren && rng.gen_bool(paren_prob) && i + 2 < words.len() {
|
||||
w = format!("({w}");
|
||||
}
|
||||
|
||||
result.push(w);
|
||||
}
|
||||
|
||||
// End with period if we have it
|
||||
if has_period {
|
||||
if let Some(last) = result.last_mut() {
|
||||
let last_char = last.chars().last();
|
||||
if !matches!(last_char, Some('.' | '?' | '!' | '"' | ')')) {
|
||||
last.push('.');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close any open quotes/parens
|
||||
let mut open_quotes = 0i32;
|
||||
let mut open_parens = 0i32;
|
||||
for w in &result {
|
||||
for ch in w.chars() {
|
||||
if ch == '"' { open_quotes += 1; }
|
||||
if ch == '(' { open_parens += 1; }
|
||||
if ch == ')' { open_parens -= 1; }
|
||||
}
|
||||
}
|
||||
if let Some(last) = result.last_mut() {
|
||||
if open_quotes % 2 != 0 && has_quote {
|
||||
// Remove trailing period to put quote after
|
||||
let had_period = last.ends_with('.');
|
||||
if had_period {
|
||||
last.pop();
|
||||
}
|
||||
last.push('"');
|
||||
if had_period {
|
||||
last.push('.');
|
||||
}
|
||||
}
|
||||
if open_parens > 0 && has_close_paren {
|
||||
let had_period = last.ends_with('.');
|
||||
if had_period {
|
||||
last.pop();
|
||||
}
|
||||
last.push(')');
|
||||
if had_period {
|
||||
last.push('.');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
result.join(" ")
|
||||
}
|
||||
|
||||
fn make_contraction(word: &str, rng: &mut SmallRng) -> String {
|
||||
// Simple contractions based on common patterns
|
||||
let contractions: &[(&str, &str)] = &[
|
||||
("not", "n't"),
|
||||
("will", "'ll"),
|
||||
("would", "'d"),
|
||||
("have", "'ve"),
|
||||
("are", "'re"),
|
||||
("is", "'s"),
|
||||
];
|
||||
|
||||
for &(base, suffix) in contractions {
|
||||
if word == base {
|
||||
// For "not" -> "don't", "can't", etc. - just return the contraction form
|
||||
return format!("{word}{suffix}");
|
||||
}
|
||||
}
|
||||
|
||||
// Generic: ~chance to add 's
|
||||
if rng.gen_bool(0.5) {
|
||||
format!("{word}'s")
|
||||
} else {
|
||||
word.to_string()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use rand::SeedableRng;

    /// Fixed-seed RNG used by every test in this module.
    fn seeded() -> SmallRng {
        SmallRng::seed_from_u64(42)
    }

    #[test]
    fn test_no_punct_when_empty() {
        let output = apply_punctuation("hello world", &[], None, &mut seeded());
        assert_eq!(output, "hello world");
    }

    #[test]
    fn test_adds_period_at_end() {
        let output = apply_punctuation(
            "one two three four five six seven eight nine ten",
            &['.'],
            None,
            &mut seeded(),
        );
        assert!(output.ends_with('.'));
    }

    #[test]
    fn test_period_appears_mid_text() {
        // Twenty identical words, long enough to force a sentence break.
        let text = vec!["word"; 20].join(" ");
        let result = apply_punctuation(&text, &['.', ','], None, &mut seeded());

        // At least one period must be present somewhere in the output.
        let period_count = result.matches('.').count();
        assert!(period_count >= 1, "Expected periods in: {result}");
    }
}
|
||||
Reference in New Issue
Block a user