First improvement pass

This commit is contained in:
2026-02-10 23:32:57 -05:00
parent f65e3d8413
commit c78a8a90a3
26 changed files with 13200 additions and 207 deletions

49
src/generator/cache.rs Normal file
View File

@@ -0,0 +1,49 @@
use std::fs;
use std::path::PathBuf;
/// Simple file-per-key disk cache rooted at the platform data directory
/// (`<data_dir>/keydr/<subdir>`), used to persist fetched text between runs.
pub struct DiskCache {
    // Directory holding one file per sanitized cache key.
    base_dir: PathBuf,
}
impl DiskCache {
    /// Open (creating if needed) the cache directory
    /// `<platform data dir>/keydr/<subdir>`. Returns `None` when the
    /// platform data directory is unknown or cannot be created.
    pub fn new(subdir: &str) -> Option<Self> {
        let root = dirs::data_dir()?.join("keydr").join(subdir);
        if fs::create_dir_all(&root).is_err() {
            return None;
        }
        Some(Self { base_dir: root })
    }

    /// Read the cached value for `key`; `None` covers both a cache miss
    /// and any other I/O failure.
    pub fn get(&self, key: &str) -> Option<String> {
        fs::read_to_string(self.base_dir.join(Self::sanitize_key(key))).ok()
    }

    /// Store `content` under `key`, reporting success as a bool.
    pub fn put(&self, key: &str, content: &str) -> bool {
        let target = self.base_dir.join(Self::sanitize_key(key));
        fs::write(target, content).is_ok()
    }

    /// Map an arbitrary key to a safe file name: alphanumerics plus
    /// `-`, `_`, and `.` pass through, everything else becomes `_`.
    fn sanitize_key(key: &str) -> String {
        key.chars()
            .map(|c| {
                if c.is_alphanumeric() || matches!(c, '-' | '_' | '.') {
                    c
                } else {
                    '_'
                }
            })
            .collect()
    }
}
/// Fetch `url` over HTTP with a 10-second timeout, returning the body
/// text on a success status. Builder, transport, status, or decoding
/// failures all collapse to `None`.
#[cfg(feature = "network")]
pub fn fetch_url(url: &str) -> Option<String> {
    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .build()
        .ok()?;
    let response = client.get(url).send().ok()?;
    // Guard clause: treat non-2xx statuses as a miss.
    if !response.status().is_success() {
        return None;
    }
    response.text().ok()
}
/// Stub compiled when the `network` feature is disabled: every fetch
/// reports a miss so callers fall back to embedded/cached content.
#[cfg(not(feature = "network"))]
pub fn fetch_url(_url: &str) -> Option<String> {
    None
}

View File

@@ -2,18 +2,81 @@ use rand::rngs::SmallRng;
use rand::Rng;
use crate::engine::filter::CharFilter;
use crate::generator::cache::{DiskCache, fetch_url};
use crate::generator::TextGenerator;
/// Generates typing-practice text out of code snippets for one language.
pub struct CodeSyntaxGenerator {
    // RNG used to pick snippets.
    rng: SmallRng,
    // Language key, e.g. "rust", "python", "javascript"/"js", "go".
    language: String,
    // Snippets loaded from the on-disk cache or fetched from GitHub raw URLs.
    fetched_snippets: Vec<String>,
}
impl CodeSyntaxGenerator {
pub fn new(rng: SmallRng, language: &str) -> Self {
Self {
let mut generator = Self {
rng,
language: language.to_string(),
fetched_snippets: Vec::new(),
};
generator.load_cached_snippets();
generator
}
fn load_cached_snippets(&mut self) {
    // Snippets for a language are stored as one blob joined by a marker;
    // split it back apart, dropping blank fragments.
    let cache = match DiskCache::new("code_cache") {
        Some(c) => c,
        None => return,
    };
    let cache_key = format!("{}_snippets", self.language);
    if let Some(blob) = cache.get(&cache_key) {
        self.fetched_snippets = blob
            .split("\n---SNIPPET---\n")
            .filter(|chunk| !chunk.trim().is_empty())
            .map(str::to_string)
            .collect();
    }
}
fn try_fetch_code(&mut self) {
    // Raw-source URLs to harvest snippets from, keyed by language.
    let urls: Vec<&str> = match self.language.as_str() {
        "rust" => vec![
            "https://raw.githubusercontent.com/tokio-rs/tokio/master/tokio/src/sync/mutex.rs",
            "https://raw.githubusercontent.com/serde-rs/serde/master/serde/src/ser/mod.rs",
        ],
        "python" => vec![
            "https://raw.githubusercontent.com/python/cpython/main/Lib/json/encoder.py",
            "https://raw.githubusercontent.com/python/cpython/main/Lib/pathlib/__init__.py",
        ],
        "javascript" | "js" => vec![
            "https://raw.githubusercontent.com/lodash/lodash/main/src/chunk.ts",
            "https://raw.githubusercontent.com/expressjs/express/master/lib/router/index.js",
        ],
        "go" => vec![
            "https://raw.githubusercontent.com/golang/go/master/src/fmt/print.go",
        ],
        _ => vec![],
    };
    let cache = match DiskCache::new("code_cache") {
        Some(c) => c,
        None => return,
    };
    // Already fetched and cached for this language? Nothing to do.
    let cache_key = format!("{}_snippets", self.language);
    if cache.get(&cache_key).is_some() {
        return;
    }
    let mut harvested: Vec<String> = Vec::new();
    for url in urls {
        if let Some(body) = fetch_url(url) {
            harvested.extend(extract_code_snippets(&body));
        }
    }
    if harvested.is_empty() {
        return;
    }
    // Persist one joined blob, then keep the snippets in memory.
    cache.put(&cache_key, &harvested.join("\n---SNIPPET---\n"));
    self.fetched_snippets = harvested;
}
@@ -35,6 +98,20 @@ impl CodeSyntaxGenerator {
"trait Display { fn show(&self) -> String; }",
"while let Some(item) = stack.pop() { process(item); }",
"#[derive(Debug, Clone)] struct Config { name: String, value: i32 }",
"let handle = std::thread::spawn(|| { println!(\"thread\"); });",
"let mut map = HashMap::new(); map.insert(\"key\", 42);",
"fn factorial(n: u64) -> u64 { if n <= 1 { 1 } else { n * factorial(n - 1) } }",
"impl Iterator for Counter { type Item = u32; fn next(&mut self) -> Option<Self::Item> { None } }",
"async fn fetch(url: &str) -> Result<String> { let body = reqwest::get(url).await?.text().await?; Ok(body) }",
"let closure = |x: i32, y: i32| -> i32 { x + y };",
"mod tests { use super::*; #[test] fn it_works() { assert_eq!(2 + 2, 4); } }",
"pub struct Builder { name: Option<String> } impl Builder { pub fn name(mut self, n: &str) -> Self { self.name = Some(n.into()); self } }",
"use std::sync::{Arc, Mutex}; let data = Arc::new(Mutex::new(vec![1, 2, 3]));",
"if let Ok(value) = \"42\".parse::<i32>() { println!(\"parsed: {}\", value); }",
"fn longest<'a>(x: &'a str, y: &'a str) -> &'a str { if x.len() > y.len() { x } else { y } }",
"type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;",
"macro_rules! vec_of_strings { ($($x:expr),*) => { vec![$($x.to_string()),*] }; }",
"let (tx, rx) = std::sync::mpsc::channel(); tx.send(42).unwrap();",
]
}
@@ -52,6 +129,19 @@ impl CodeSyntaxGenerator {
"from collections import defaultdict",
"lambda x: x * 2 + 1",
"dict_comp = {k: v for k, v in pairs.items()}",
"async def fetch(url): async with aiohttp.ClientSession() as session: return await session.get(url)",
"def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
"@property def name(self): return self._name",
"from dataclasses import dataclass; @dataclass class Config: name: str; value: int = 0",
"yield from range(10)",
"sorted(items, key=lambda x: x.name, reverse=True)",
"from typing import Optional, List, Dict",
"with contextlib.suppress(FileNotFoundError): os.remove(\"temp.txt\")",
"class Meta(type): def __new__(cls, name, bases, attrs): return super().__new__(cls, name, bases, attrs)",
"from functools import lru_cache; @lru_cache(maxsize=128) def expensive(n): return sum(range(n))",
"from pathlib import Path; files = list(Path(\".\").glob(\"**/*.py\"))",
"assert isinstance(result, dict), f\"Expected dict, got {type(result)}\"",
"values = {*set_a, *set_b}; merged = {**dict_a, **dict_b}",
]
}
@@ -69,6 +159,18 @@ impl CodeSyntaxGenerator {
"try { parse(data); } catch (e) { console.error(e); }",
"export default function handler(req, res) { res.send(\"ok\"); }",
"const result = items.filter(x => x > 0).reduce((a, b) => a + b, 0);",
"const promise = new Promise((resolve, reject) => { setTimeout(resolve, 1000); });",
"const [first, ...rest] = array;",
"class EventEmitter { constructor() { this.listeners = new Map(); } }",
"const proxy = new Proxy(target, { get(obj, prop) { return obj[prop]; } });",
"for await (const chunk of stream) { process(chunk); }",
"const memoize = (fn) => { const cache = new Map(); return (...args) => cache.get(args) ?? fn(...args); };",
"import { useState, useEffect } from 'react'; const [state, setState] = useState(null);",
"const pipe = (...fns) => (x) => fns.reduce((v, f) => f(v), x);",
"Object.entries(obj).forEach(([key, value]) => { console.log(key, value); });",
"const debounce = (fn, ms) => { let timer; return (...args) => { clearTimeout(timer); timer = setTimeout(() => fn(...args), ms); }; };",
"const observable = new Observable(subscriber => { subscriber.next(1); subscriber.complete(); });",
"Symbol.iterator",
]
}
@@ -84,6 +186,16 @@ impl CodeSyntaxGenerator {
"switch val { case 1: return \"one\" default: return \"other\" }",
"go func() { ch <- result }()",
"defer file.Close()",
"type Reader interface { Read(p []byte) (n int, err error) }",
"ctx, cancel := context.WithTimeout(context.Background(), time.Second)",
"var wg sync.WaitGroup; wg.Add(1); go func() { defer wg.Done() }()",
"func (p *Point) Distance() float64 { return math.Sqrt(p.X*p.X + p.Y*p.Y) }",
"select { case msg := <-ch: process(msg) case <-time.After(time.Second): timeout() }",
"json.NewEncoder(w).Encode(response)",
"http.HandleFunc(\"/api\", func(w http.ResponseWriter, r *http.Request) { w.Write([]byte(\"ok\")) })",
"func Map[T, U any](s []T, f func(T) U) []U { r := make([]U, len(s)); for i, v := range s { r[i] = f(v) }; return r }",
"var once sync.Once; once.Do(func() { initialize() })",
"buf := bytes.NewBuffer(nil); buf.WriteString(\"hello\")",
]
}
@@ -105,18 +217,88 @@ impl TextGenerator for CodeSyntaxGenerator {
_focused: Option<char>,
word_count: usize,
) -> String {
let snippets = self.get_snippets();
// Try to fetch from GitHub on first use
if self.fetched_snippets.is_empty() {
self.try_fetch_code();
}
let embedded = self.get_snippets();
let mut result = Vec::new();
let target_words = word_count;
let mut current_words = 0;
let total_available = embedded.len() + self.fetched_snippets.len();
while current_words < target_words {
let idx = self.rng.gen_range(0..snippets.len());
let snippet = snippets[idx];
let idx = self.rng.gen_range(0..total_available.max(1));
let snippet = if idx < embedded.len() {
embedded[idx]
} else if !self.fetched_snippets.is_empty() {
let f_idx = (idx - embedded.len()) % self.fetched_snippets.len();
&self.fetched_snippets[f_idx]
} else {
embedded[idx % embedded.len()]
};
current_words += snippet.split_whitespace().count();
result.push(snippet);
result.push(snippet.to_string());
}
result.join(" ")
}
}
/// Extract function-length snippets from raw source code.
///
/// Scans for lines that open a function definition and collects lines
/// until brace/paren depth returns to zero, then whitespace-normalizes
/// the result and keeps only snippets of a comfortable size (3-30 lines,
/// 20-500 characters, at most 50 per source).
///
/// NOTE(review): depth tracking counts only `{}` and `()`, so
/// indentation-delimited bodies (Python `def`) are cut off right after
/// the header line — confirm whether that is acceptable for `.py` sources.
fn extract_code_snippets(source: &str) -> Vec<String> {
    const FUNC_PREFIXES: [&str; 7] = [
        "fn ", "pub fn ", "def ", "func ", "function ", "async fn ", "pub async fn ",
    ];
    let lines: Vec<&str> = source.lines().collect();
    let mut snippets = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        let trimmed = lines[i].trim();
        if !FUNC_PREFIXES.iter().any(|p| trimmed.starts_with(p)) {
            i += 1;
            continue;
        }
        // Collect lines until the opening delimiters are balanced again.
        let mut collected: Vec<&str> = Vec::new();
        let mut depth: i32 = 0;
        let mut j = i;
        while j < lines.len() && collected.len() < 30 {
            let line = lines[j];
            collected.push(line);
            for c in line.chars() {
                match c {
                    '{' | '(' => depth += 1,
                    '}' | ')' => depth -= 1,
                    _ => {}
                }
            }
            if depth <= 0 && j > i {
                break;
            }
            j += 1;
        }
        if (3..=30).contains(&collected.len()) {
            // Collapse the snippet onto one whitespace-normalized line.
            let joined = collected.join(" ");
            let normalized = joined.split_whitespace().collect::<Vec<_>>().join(" ");
            if (20..=500).contains(&normalized.len()) {
                snippets.push(normalized);
            }
        }
        i = j + 1;
    }
    snippets.truncate(50);
    snippets
}

View File

@@ -0,0 +1,45 @@
use crate::engine::filter::CharFilter;
const WORDS_EN: &str = include_str!("../../assets/words-en.json");
/// Embedded English word list (from `assets/words-en.json`), filtered at
/// load time to all-lowercase ASCII words of length >= 3.
pub struct Dictionary {
    // Filtered word list, kept in the order the asset file provides.
    words: Vec<String>,
}
impl Dictionary {
    /// Parse the embedded JSON word list, keeping only all-lowercase
    /// ASCII words of length >= 3 (matching keybr). A malformed asset
    /// degrades to an empty dictionary rather than a panic.
    pub fn load() -> Self {
        let parsed: Vec<String> = serde_json::from_str(WORDS_EN).unwrap_or_default();
        let words: Vec<String> = parsed
            .into_iter()
            .filter(|w| w.len() >= 3)
            .filter(|w| w.chars().all(|c| c.is_ascii_lowercase()))
            .collect();
        Self { words }
    }

    /// Owned copy of every word in the dictionary.
    pub fn words_list(&self) -> Vec<String> {
        self.words.to_vec()
    }

    /// All words whose characters pass `filter`. When `focused` is
    /// given, words containing that letter sort first (stable sort, so
    /// relative order is otherwise preserved).
    pub fn find_matching(
        &self,
        filter: &CharFilter,
        focused: Option<char>,
    ) -> Vec<&str> {
        let mut matching: Vec<&str> = Vec::new();
        for w in &self.words {
            if w.chars().all(|c| filter.is_allowed(c)) {
                matching.push(w.as_str());
            }
        }
        if let Some(focus) = focused {
            matching.sort_by_key(|w| usize::from(!w.contains(focus)));
        }
        matching
    }
}

View File

@@ -1,4 +1,6 @@
pub mod cache;
pub mod code_syntax;
pub mod dictionary;
pub mod github_code;
pub mod passage;
pub mod phonetic;

View File

@@ -1,7 +1,12 @@
use rand::rngs::SmallRng;
use rand::Rng;
use crate::engine::filter::CharFilter;
use crate::generator::cache::{DiskCache, fetch_url};
use crate::generator::TextGenerator;
const PASSAGES: &[&str] = &[
// Classic literature & speeches
"the quick brown fox jumps over the lazy dog and then runs across the field while the sun sets behind the distant hills",
"it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness",
"in the beginning there was nothing but darkness and then the light appeared slowly spreading across the vast empty space",
@@ -17,21 +22,132 @@ const PASSAGES: &[&str] = &[
"he picked up the book and began to read turning the pages slowly as the story drew him deeper and deeper into its world",
"the stars shone brightly in the clear night sky and the moon cast a silver light over the sleeping town below",
"they gathered around the fire telling stories and laughing while the wind howled outside and the snow piled up against the door",
// Pride and Prejudice
"it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife",
"there is a stubbornness about me that never can bear to be frightened at the will of others my courage always rises at every attempt to intimidate me",
"i could easily forgive his pride if he had not mortified mine but vanity not love has been my folly",
// Alice in Wonderland
"alice was beginning to get very tired of sitting by her sister on the bank and of having nothing to do",
"who in the world am i that is the great puzzle she said as she looked around the strange room with wonder",
"but i dont want to go among mad people alice remarked oh you cant help that said the cat were all mad here",
// Great Gatsby
"in my younger and more vulnerable years my father gave me some advice that i have been turning over in my mind ever since",
"so we beat on boats against the current borne back ceaselessly into the past dreaming of that green light",
// Sherlock Holmes
"when you have eliminated the impossible whatever remains however improbable must be the truth my dear watson",
"the world is full of obvious things which nobody by any chance ever observes but which are perfectly visible",
// Moby Dick
"call me ishmael some years ago having little or no money in my purse and nothing particular to interest me on shore",
"it is not down on any map because true places never are and the voyage was long and the sea was deep",
// 1984
"it was a bright cold day in april and the clocks were striking thirteen winston smith his chin nuzzled into his breast",
"who controls the past controls the future and who controls the present controls the past said the voice from the screen",
// Walden
"i went to the woods because i wished to live deliberately to front only the essential facts of life",
"the mass of men lead lives of quiet desperation and go to the grave with the song still in them",
// Science & philosophy
"the only way to do great work is to love what you do and if you have not found it yet keep looking and do not settle",
"imagination is more important than knowledge for while knowledge defines all we currently know imagination points to what we might discover",
"the important thing is not to stop questioning for curiosity has its own reason for existing in this wonderful universe",
"we are all in the gutter but some of us are looking at the stars and dreaming of worlds beyond our own",
"the greatest glory in living lies not in never falling but in rising every time we fall and trying once more",
// Nature & observation
"the autumn wind scattered golden leaves across the garden as the last rays of sunlight painted the clouds in shades of orange and pink",
"deep in the forest where the ancient trees stood tall and silent a small stream wound its way through moss covered stones",
"the ocean stretched endlessly before them its surface catching the light of the setting sun in a thousand shimmering reflections",
"morning mist hung low over the meadow as the first birds began their chorus and dew drops sparkled like diamonds on every blade of grass",
"the mountain peak stood above the clouds its snow covered summit glowing pink and gold in the light of the early morning sun",
// Everyday wisdom
"the best time to plant a tree was twenty years ago and the second best time is now so do not wait any longer to begin",
"a journey of a thousand miles begins with a single step and every great achievement started with the decision to try",
"the more that you read the more things you will know and the more that you learn the more places you will go",
"in three words i can sum up everything i have learned about life it goes on and so must we with hope",
"happiness is not something ready made it comes from your own actions and your choices shape the life you live",
"do not go where the path may lead but go instead where there is no path and leave a trail for others to follow",
"success is not final failure is not fatal it is the courage to continue that counts in the end",
"be yourself because everyone else is already taken and the world needs what only you can bring to it",
"life is what happens when you are busy making other plans so enjoy the journey along the way",
"the secret of getting ahead is getting started and the secret of getting started is breaking your tasks into small steps",
];
/// Gutenberg book IDs for popular public domain works
const GUTENBERG_IDS: &[(u32, &str)] = &[
(1342, "pride_and_prejudice"),
(11, "alice_in_wonderland"),
(1661, "sherlock_holmes"),
(84, "frankenstein"),
(1952, "yellow_wallpaper"),
(2701, "moby_dick"),
(74, "tom_sawyer"),
(345, "dracula"),
(1232, "prince"),
(76, "huckleberry_finn"),
(5200, "metamorphosis"),
(2542, "aesop_fables"),
(174, "dorian_gray"),
(98, "tale_two_cities"),
(1080, "modest_proposal"),
(219, "heart_of_darkness"),
(4300, "ulysses"),
(28054, "brothers_karamazov"),
(2554, "crime_and_punishment"),
(55, "oz"),
];
/// Serves literary passages for typing practice, cycling through an
/// embedded list plus paragraphs harvested from cached Gutenberg books.
pub struct PassageGenerator {
    // Rotating cursor over the combined passage pool.
    current_idx: usize,
    // Paragraphs extracted from downloaded Gutenberg texts.
    fetched_passages: Vec<String>,
    // RNG used to pick which uncached book to download next.
    rng: SmallRng,
}
impl PassageGenerator {
pub fn new() -> Self {
Self { current_idx: 0 }
pub fn new(rng: SmallRng) -> Self {
let mut generator = Self {
current_idx: 0,
fetched_passages: Vec::new(),
rng,
};
generator.load_cached_passages();
generator
}
}
impl Default for PassageGenerator {
fn default() -> Self {
Self::new()
fn load_cached_passages(&mut self) {
    // Harvest paragraphs from every book already downloaded to disk.
    let cache = match DiskCache::new("passages") {
        Some(c) => c,
        None => return,
    };
    for &(_, name) in GUTENBERG_IDS {
        if let Some(text) = cache.get(name) {
            self.fetched_passages.extend(extract_paragraphs(&text));
        }
    }
}
fn try_fetch_gutenberg(&mut self) {
    let cache = match DiskCache::new("passages") {
        Some(c) => c,
        None => return,
    };
    // Collect the books that are not on disk yet.
    let mut pending: Vec<(u32, &str)> = Vec::new();
    for &(id, name) in GUTENBERG_IDS {
        if cache.get(name).is_none() {
            pending.push((id, name));
        }
    }
    if pending.is_empty() {
        return;
    }
    // Download one randomly chosen book, cache the raw text, and fold
    // its paragraphs into the in-memory pool.
    let (book_id, name) = pending[self.rng.gen_range(0..pending.len())];
    let url = format!("https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt");
    if let Some(text) = fetch_url(&url) {
        cache.put(name, &text);
        self.fetched_passages.extend(extract_paragraphs(&text));
    }
}
}
@@ -42,8 +158,87 @@ impl TextGenerator for PassageGenerator {
_focused: Option<char>,
_word_count: usize,
) -> String {
let passage = PASSAGES[self.current_idx % PASSAGES.len()];
// Try to fetch a new Gutenberg book in the background (first few calls)
if self.fetched_passages.len() < 50 && self.current_idx < 3 {
self.try_fetch_gutenberg();
}
let total_passages = PASSAGES.len() + self.fetched_passages.len();
if total_passages == 0 {
self.current_idx += 1;
return PASSAGES[0].to_string();
}
// Mix embedded and fetched passages
let idx = self.current_idx % total_passages;
self.current_idx += 1;
passage.to_string()
if idx < PASSAGES.len() {
PASSAGES[idx].to_string()
} else {
let fetched_idx = idx - PASSAGES.len();
self.fetched_passages[fetched_idx % self.fetched_passages.len()].clone()
}
}
}
/// Extract readable paragraphs from Gutenberg plain text, skipping the
/// Project Gutenberg header/footer boilerplate.
///
/// Paragraphs are lower-cased, reduced to lowercase letters and spaces,
/// and kept only at a comfortable typing length (15-60 source words and
/// at least 10 typable words). At most 100 paragraphs are returned.
fn extract_paragraphs(text: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();
    // Locate the body between the "*** START OF" and "*** END OF" markers.
    let start_markers = ["*** START OF", "***START OF"];
    let end_markers = ["*** END OF", "***END OF"];
    let content_start = start_markers
        .iter()
        .filter_map(|marker| text.find(marker))
        .min()
        .map(|pos| {
            // Skip past the remainder of the header line itself.
            text[pos..].find('\n').map(|nl| pos + nl + 1).unwrap_or(pos)
        })
        .unwrap_or(0);
    let content_end = end_markers
        .iter()
        .filter_map(|marker| text.find(marker))
        .min()
        .unwrap_or(text.len())
        // Guard against a malformed file whose end marker appears before
        // the start marker; an inverted range would panic when slicing.
        .max(content_start);
    let content = &text[content_start..content_end];
    // Normalize CRLF to LF so each paragraph is visited exactly once.
    // (The previous chained split over "\r\n\r\n" AND "\n\n" scanned the
    // text twice and could push the same paragraph twice.)
    let normalized = content.replace("\r\n", "\n");
    for para in normalized.split("\n\n") {
        // Collapse the paragraph to one line of printable ASCII, lower-cased.
        let cleaned: String = para
            .lines()
            .map(|l| l.trim())
            .collect::<Vec<_>>()
            .join(" ")
            .chars()
            .filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace() || c.is_ascii_punctuation())
            .collect::<String>()
            .to_lowercase();
        let word_count = cleaned.split_whitespace().count();
        if word_count >= 15 && word_count <= 60 {
            // Keep only the alpha/space portion for typing practice.
            let typing_text: String = cleaned
                .chars()
                .filter(|c| c.is_ascii_lowercase() || *c == ' ')
                .collect::<String>()
                .split_whitespace()
                .collect::<Vec<_>>()
                .join(" ");
            if typing_text.split_whitespace().count() >= 10 {
                paragraphs.push(typing_text);
            }
        }
    }
    // Cap the per-book contribution.
    paragraphs.truncate(100);
    paragraphs
}

View File

@@ -2,17 +2,27 @@ use rand::rngs::SmallRng;
use rand::Rng;
use crate::engine::filter::CharFilter;
use crate::generator::dictionary::Dictionary;
use crate::generator::transition_table::TransitionTable;
use crate::generator::TextGenerator;
const MIN_WORD_LEN: usize = 3;
const MAX_WORD_LEN: usize = 10;
const MIN_REAL_WORDS: usize = 15;
/// Generates practice words, preferring real dictionary words that match
/// the active character filter and falling back to phonetically plausible
/// pseudo-words built from a character transition table.
pub struct PhoneticGenerator {
    // N-gram transition table driving pseudo-word generation.
    table: TransitionTable,
    // Real-word dictionary consulted first.
    dictionary: Dictionary,
    // RNG for word/character selection.
    rng: SmallRng,
}
impl PhoneticGenerator {
pub fn new(table: TransitionTable, rng: SmallRng) -> Self {
Self { table, rng }
pub fn new(table: TransitionTable, dictionary: Dictionary, rng: SmallRng) -> Self {
Self {
table,
dictionary,
rng,
}
}
fn pick_weighted_from(
@@ -46,28 +56,30 @@ impl PhoneticGenerator {
Some(filtered.last().unwrap().0)
}
fn generate_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
let min_len = 3;
let max_len = 10;
let mut word = String::new();
fn generate_phonetic_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
    // Generation can produce too-short words; retry a handful of times.
    let mut attempts = 0;
    while attempts < 5 {
        let candidate = self.try_generate_word(filter, focused);
        if candidate.len() >= MIN_WORD_LEN {
            return candidate;
        }
        attempts += 1;
    }
    // Give up and fall back to a guaranteed word.
    String::from("the")
}
fn try_generate_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
let mut word = Vec::new();
// Start with space prefix
let start_char = if let Some(focus) = focused {
if self.rng.gen_bool(0.4) {
let probs = self.table.get_next_probs(' ', focus).cloned();
if let Some(probs) = probs {
let filtered: Vec<(char, f64)> = probs
.iter()
.filter(|(ch, _)| filter.is_allowed(*ch))
.copied()
.collect();
if !filtered.is_empty() {
word.push(focus);
Self::pick_weighted_from(&mut self.rng, &filtered, filter)
} else {
None
}
if self.rng.gen_bool(0.4) && filter.is_allowed(focus) {
word.push(focus);
// Get next char from transition table
let prefix = vec![' ', ' ', focus];
if let Some(probs) = self.table.segment(&prefix) {
Self::pick_weighted_from(&mut self.rng, probs, filter)
} else {
Some(focus)
None
}
} else {
None
@@ -77,25 +89,32 @@ impl PhoneticGenerator {
};
if word.is_empty() {
let starters: Vec<(char, f64)> = filter
.allowed
.iter()
.map(|&ch| {
(
ch,
if ch == 'e' || ch == 't' || ch == 'a' {
3.0
} else {
1.0
},
)
})
.collect();
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, &starters, filter) {
word.push(ch);
} else {
return "the".to_string();
// Pick a start from transition table
let prefix = vec![' ', ' ', ' '];
if let Some(probs) = self.table.segment(&prefix) {
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, probs, filter) {
word.push(ch);
}
}
// Fallback: weighted random start
if word.is_empty() {
let starters: Vec<(char, f64)> = filter
.allowed
.iter()
.map(|&ch| {
let w = match ch {
'e' | 't' | 'a' => 3.0,
'o' | 'i' | 'n' | 's' => 2.0,
_ => 1.0,
};
(ch, w)
})
.collect();
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, &starters, filter) {
word.push(ch);
} else {
return "the".to_string();
}
}
}
@@ -103,33 +122,60 @@ impl PhoneticGenerator {
word.push(ch);
}
while word.len() < max_len {
let chars: Vec<char> = word.chars().collect();
let len = chars.len();
let (prev, curr) = if len >= 2 {
(chars[len - 2], chars[len - 1])
while word.len() < MAX_WORD_LEN {
// Build prefix from recent chars, padded with spaces
let prefix_len = self.table.order - 1;
let mut prefix = Vec::new();
let start = if word.len() >= prefix_len {
word.len() - prefix_len
} else {
(' ', chars[len - 1])
0
};
let space_prob = 1.3f64.powi(word.len() as i32 - min_len as i32);
if word.len() >= min_len
&& self
.rng
.gen_bool((space_prob / (space_prob + 5.0)).min(0.8))
{
break;
for _ in 0..(prefix_len.saturating_sub(word.len())) {
prefix.push(' ');
}
for i in start..word.len() {
prefix.push(word[i]);
}
let probs = self.table.get_next_probs(prev, curr).cloned();
if let Some(probs) = probs {
if let Some(next) = Self::pick_weighted_from(&mut self.rng, &probs, filter) {
// Check for word ending (space probability increases with length)
if word.len() >= MIN_WORD_LEN {
if let Some(probs) = self.table.segment(&prefix) {
let space_weight: f64 = probs
.iter()
.filter(|(ch, _)| *ch == ' ')
.map(|(_, w)| w)
.sum();
if space_weight > 0.0 {
let boost = 1.3f64.powi(word.len() as i32 - MIN_WORD_LEN as i32);
let total: f64 = probs.iter().map(|(_, w)| w).sum();
let space_prob = (space_weight * boost) / (total + space_weight * (boost - 1.0));
if self.rng.gen_bool(space_prob.min(0.85)) {
break;
}
}
}
// Even without space in table, use length-based ending
let end_prob = 1.3f64.powi(word.len() as i32 - MIN_WORD_LEN as i32);
if self.rng.gen_bool((end_prob / (end_prob + 5.0)).min(0.8)) {
break;
}
}
// Get next character from transition table
if let Some(probs) = self.table.segment(&prefix) {
let non_space: Vec<(char, f64)> = probs
.iter()
.filter(|(ch, _)| *ch != ' ')
.copied()
.collect();
if let Some(next) = Self::pick_weighted_from(&mut self.rng, &non_space, filter) {
word.push(next);
} else {
break;
}
} else {
// Fallback to vowel
let vowels: Vec<(char, f64)> = ['a', 'e', 'i', 'o', 'u']
.iter()
.filter(|&&v| filter.is_allowed(v))
@@ -143,11 +189,7 @@ impl PhoneticGenerator {
}
}
if word.is_empty() {
"the".to_string()
} else {
word
}
word.iter().collect()
}
}
@@ -158,10 +200,42 @@ impl TextGenerator for PhoneticGenerator {
focused: Option<char>,
word_count: usize,
) -> String {
// keybr's approach: prefer real words when enough match the filter
// Collect matching words into owned Vec to avoid borrow conflict
let matching_words: Vec<String> = self
.dictionary
.find_matching(filter, focused)
.iter()
.map(|s| s.to_string())
.collect();
let use_real_words = matching_words.len() >= MIN_REAL_WORDS;
let mut words: Vec<String> = Vec::new();
let mut last_word = String::new();
for _ in 0..word_count {
words.push(self.generate_word(filter, focused));
if use_real_words {
// Pick a real word (avoid consecutive duplicates)
let mut picked = None;
for _ in 0..3 {
let idx = self.rng.gen_range(0..matching_words.len());
let word = matching_words[idx].clone();
if word != last_word {
picked = Some(word);
break;
}
}
let word = match picked {
Some(w) => w,
None => self.generate_phonetic_word(filter, focused),
};
last_word.clone_from(&word);
words.push(word);
} else {
// Fall back to phonetic pseudo-words
let word = self.generate_phonetic_word(filter, focused);
words.push(word);
}
}
words.join(" ")

View File

@@ -4,29 +4,108 @@ use serde::{Deserialize, Serialize};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct TransitionTable {
pub transitions: HashMap<(char, char), Vec<(char, f64)>>,
pub order: usize,
transitions: HashMap<Vec<char>, Vec<(char, f64)>>,
}
impl TransitionTable {
pub fn new() -> Self {
pub fn new(order: usize) -> Self {
Self {
order,
transitions: HashMap::new(),
}
}
pub fn add(&mut self, prev: char, curr: char, next: char, weight: f64) {
pub fn add(&mut self, prefix: &[char], next: char, weight: f64) {
self.transitions
.entry((prev, curr))
.entry(prefix.to_vec())
.or_default()
.push((next, weight));
}
pub fn get_next_probs(&self, prev: char, curr: char) -> Option<&Vec<(char, f64)>> {
self.transitions.get(&(prev, curr))
/// Look up next-character probabilities for the trailing `order - 1`
/// characters of `prefix`, backing off to progressively shorter
/// suffixes when no exact entry exists. Returns `None` when no suffix
/// of the prefix has an entry.
pub fn segment(&self, prefix: &[char]) -> Option<&Vec<(char, f64)>> {
    // Only the last `order - 1` characters participate in the key;
    // saturating_sub guards a (mis)configured order of 0.
    let key_len = self.order.saturating_sub(1);
    let prefix = if prefix.len() >= key_len {
        &prefix[prefix.len() - key_len..]
    } else {
        prefix
    };
    // Back off: exact suffix first, then shorter ones. `Vec<char>`
    // borrows as `[char]`, so we can probe with the slice directly and
    // avoid allocating a fresh key Vec on every lookup.
    for start in 0..prefix.len() {
        if let Some(entries) = self.transitions.get(&prefix[start..]) {
            return Some(entries);
        }
    }
    None
}
/// Build an order-4 transition table from a word frequency list.
/// Words earlier in the list are higher frequency and get more weight.
pub fn build_from_words(words: &[String]) -> Self {
    let order = 4;
    let prefix_len = order - 1;
    let mut table = Self::new(order);
    for (rank, word) in words.iter().enumerate() {
        // Only all-lowercase ASCII words of length >= 3 contribute.
        if word.len() < 3 || !word.chars().all(|c| c.is_ascii_lowercase()) {
            continue;
        }
        // Weight decays with rank, so frequent words dominate.
        let weight = 1.0 / (1.0 + (rank as f64 / 500.0));
        // Left-pad with spaces so every position has a full-length
        // context: padded[i..i + prefix_len] is the prefix for the
        // character at padded[i + prefix_len].
        let mut padded: Vec<char> = vec![' '; prefix_len];
        padded.extend(word.chars());
        // Interior transitions: context -> character.
        for i in 0..padded.len() - prefix_len {
            table.add(&padded[i..i + prefix_len], padded[i + prefix_len], weight);
        }
        // Terminal transition: final context -> word boundary (space).
        table.add(&padded[padded.len() - prefix_len..], ' ', weight);
    }
    table
}
/// Legacy order-2 table for fallback
#[allow(dead_code)]
pub fn build_english() -> Self {
let mut table = Self::new();
let mut table = Self::new(4);
let common_patterns: &[(&str, f64)] = &[
("the", 10.0), ("and", 8.0), ("ing", 7.0), ("tion", 6.0), ("ent", 5.0),
@@ -40,25 +119,24 @@ impl TransitionTable {
("ght", 2.0), ("whi", 2.0), ("who", 2.0), ("hen", 2.0), ("ter", 2.0),
("man", 2.0), ("men", 2.0), ("ner", 2.0), ("per", 2.0), ("pre", 2.0),
("ran", 2.0), ("lin", 2.0), ("kin", 2.0), ("din", 2.0), ("sin", 2.0),
("out", 2.0), ("ind", 2.0), ("ith", 2.0), ("ber", 2.0), ("der", 2.0),
("out", 2.0), ("ind", 2.0), ("ber", 2.0), ("der", 2.0),
("end", 2.0), ("hin", 2.0), ("old", 2.0), ("ear", 2.0), ("ain", 2.0),
("ant", 2.0), ("urn", 2.0), ("ell", 2.0), ("ill", 2.0), ("ade", 2.0),
("igh", 2.0), ("ong", 2.0), ("ung", 2.0), ("ast", 2.0), ("ist", 2.0),
("ong", 2.0), ("ung", 2.0), ("ast", 2.0), ("ist", 2.0),
("ust", 2.0), ("ost", 2.0), ("ard", 2.0), ("ord", 2.0), ("art", 2.0),
("ort", 2.0), ("ect", 2.0), ("act", 2.0), ("ack", 2.0), ("ick", 2.0),
("ock", 2.0), ("uck", 2.0), ("ash", 2.0), ("ish", 2.0), ("ush", 2.0),
("anc", 1.5), ("enc", 1.5), ("inc", 1.5), ("onc", 1.5), ("unc", 1.5),
("unt", 1.5), ("int", 1.5), ("ont", 1.5), ("ent", 1.5), ("ment", 1.5),
("ness", 1.5), ("less", 1.5), ("able", 1.5), ("ible", 1.5), ("ting", 1.5),
("ring", 1.5), ("sing", 1.5), ("king", 1.5), ("ning", 1.5), ("ling", 1.5),
("wing", 1.5), ("ding", 1.5), ("ping", 1.5), ("ging", 1.5), ("ving", 1.5),
("bing", 1.5), ("ming", 1.5), ("fing", 1.0), ("hing", 1.0), ("cing", 1.0),
];
for &(pattern, weight) in common_patterns {
let chars: Vec<char> = pattern.chars().collect();
for window in chars.windows(3) {
table.add(window[0], window[1], window[2], weight);
let prefix = vec![window[0], window[1]];
table.add(&prefix, window[2], weight);
}
// Also add shorter prefix entries for the start of patterns
if chars.len() >= 2 {
table.add(&[' ', chars[0]], chars[1], weight * 0.5);
}
}
@@ -70,20 +148,14 @@ impl TransitionTable {
for &c in &consonants {
for &v in &vowels {
table.add(' ', c, v, 1.0);
table.add(v, c, 'e', 0.5);
for &v2 in &vowels {
table.add(c, v, v2.to_ascii_lowercase(), 0.3);
}
for &c2 in &consonants {
table.add(v, c, c2, 0.2);
}
table.add(&[' ', c], v, 1.0);
table.add(&[v, c], 'e', 0.5);
}
}
for &v in &vowels {
for &c in &consonants {
table.add(' ', v, c, 0.5);
table.add(&[' ', v], c, 0.5);
}
}
@@ -93,6 +165,6 @@ impl TransitionTable {
impl Default for TransitionTable {
fn default() -> Self {
Self::new()
Self::new(4)
}
}