First improvement pass
This commit is contained in:
49
src/generator/cache.rs
Normal file
49
src/generator/cache.rs
Normal file
@@ -0,0 +1,49 @@
|
||||
use std::fs;
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub struct DiskCache {
|
||||
base_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl DiskCache {
|
||||
pub fn new(subdir: &str) -> Option<Self> {
|
||||
let base = dirs::data_dir()?.join("keydr").join(subdir);
|
||||
fs::create_dir_all(&base).ok()?;
|
||||
Some(Self { base_dir: base })
|
||||
}
|
||||
|
||||
pub fn get(&self, key: &str) -> Option<String> {
|
||||
let path = self.base_dir.join(Self::sanitize_key(key));
|
||||
fs::read_to_string(path).ok()
|
||||
}
|
||||
|
||||
pub fn put(&self, key: &str, content: &str) -> bool {
|
||||
let path = self.base_dir.join(Self::sanitize_key(key));
|
||||
fs::write(path, content).is_ok()
|
||||
}
|
||||
|
||||
fn sanitize_key(key: &str) -> String {
|
||||
key.chars()
|
||||
.map(|c| if c.is_alphanumeric() || c == '-' || c == '_' || c == '.' { c } else { '_' })
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
/// Fetch `url` over blocking HTTP with a 10-second timeout.
///
/// Returns the response body on a success (2xx) status; `None` on any
/// client-build, network, status, or body-decoding failure.
#[cfg(feature = "network")]
pub fn fetch_url(url: &str) -> Option<String> {
    let client = reqwest::blocking::Client::builder()
        .timeout(std::time::Duration::from_secs(10))
        .build()
        .ok()?;
    let response = client.get(url).send().ok()?;
    if response.status().is_success() {
        response.text().ok()
    } else {
        None
    }
}
|
||||
|
||||
/// Stub used when the `network` feature is disabled: never performs I/O
/// and always returns `None`, so callers fall back to embedded content.
#[cfg(not(feature = "network"))]
pub fn fetch_url(_url: &str) -> Option<String> {
    None
}
|
||||
@@ -2,18 +2,81 @@ use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::engine::filter::CharFilter;
|
||||
use crate::generator::cache::{DiskCache, fetch_url};
|
||||
use crate::generator::TextGenerator;
|
||||
|
||||
pub struct CodeSyntaxGenerator {
|
||||
rng: SmallRng,
|
||||
language: String,
|
||||
fetched_snippets: Vec<String>,
|
||||
}
|
||||
|
||||
impl CodeSyntaxGenerator {
|
||||
pub fn new(rng: SmallRng, language: &str) -> Self {
|
||||
Self {
|
||||
let mut generator = Self {
|
||||
rng,
|
||||
language: language.to_string(),
|
||||
fetched_snippets: Vec::new(),
|
||||
};
|
||||
generator.load_cached_snippets();
|
||||
generator
|
||||
}
|
||||
|
||||
fn load_cached_snippets(&mut self) {
|
||||
if let Some(cache) = DiskCache::new("code_cache") {
|
||||
let key = format!("{}_snippets", self.language);
|
||||
if let Some(content) = cache.get(&key) {
|
||||
self.fetched_snippets = content
|
||||
.split("\n---SNIPPET---\n")
|
||||
.filter(|s| !s.trim().is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn try_fetch_code(&mut self) {
|
||||
let urls = match self.language.as_str() {
|
||||
"rust" => vec![
|
||||
"https://raw.githubusercontent.com/tokio-rs/tokio/master/tokio/src/sync/mutex.rs",
|
||||
"https://raw.githubusercontent.com/serde-rs/serde/master/serde/src/ser/mod.rs",
|
||||
],
|
||||
"python" => vec![
|
||||
"https://raw.githubusercontent.com/python/cpython/main/Lib/json/encoder.py",
|
||||
"https://raw.githubusercontent.com/python/cpython/main/Lib/pathlib/__init__.py",
|
||||
],
|
||||
"javascript" | "js" => vec![
|
||||
"https://raw.githubusercontent.com/lodash/lodash/main/src/chunk.ts",
|
||||
"https://raw.githubusercontent.com/expressjs/express/master/lib/router/index.js",
|
||||
],
|
||||
"go" => vec![
|
||||
"https://raw.githubusercontent.com/golang/go/master/src/fmt/print.go",
|
||||
],
|
||||
_ => vec![],
|
||||
};
|
||||
|
||||
let cache = match DiskCache::new("code_cache") {
|
||||
Some(c) => c,
|
||||
None => return,
|
||||
};
|
||||
|
||||
let key = format!("{}_snippets", self.language);
|
||||
if cache.get(&key).is_some() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut all_snippets = Vec::new();
|
||||
for url in urls {
|
||||
if let Some(content) = fetch_url(url) {
|
||||
let snippets = extract_code_snippets(&content);
|
||||
all_snippets.extend(snippets);
|
||||
}
|
||||
}
|
||||
|
||||
if !all_snippets.is_empty() {
|
||||
let combined = all_snippets.join("\n---SNIPPET---\n");
|
||||
cache.put(&key, &combined);
|
||||
self.fetched_snippets = all_snippets;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +98,20 @@ impl CodeSyntaxGenerator {
|
||||
"trait Display { fn show(&self) -> String; }",
|
||||
"while let Some(item) = stack.pop() { process(item); }",
|
||||
"#[derive(Debug, Clone)] struct Config { name: String, value: i32 }",
|
||||
"let handle = std::thread::spawn(|| { println!(\"thread\"); });",
|
||||
"let mut map = HashMap::new(); map.insert(\"key\", 42);",
|
||||
"fn factorial(n: u64) -> u64 { if n <= 1 { 1 } else { n * factorial(n - 1) } }",
|
||||
"impl Iterator for Counter { type Item = u32; fn next(&mut self) -> Option<Self::Item> { None } }",
|
||||
"async fn fetch(url: &str) -> Result<String> { let body = reqwest::get(url).await?.text().await?; Ok(body) }",
|
||||
"let closure = |x: i32, y: i32| -> i32 { x + y };",
|
||||
"mod tests { use super::*; #[test] fn it_works() { assert_eq!(2 + 2, 4); } }",
|
||||
"pub struct Builder { name: Option<String> } impl Builder { pub fn name(mut self, n: &str) -> Self { self.name = Some(n.into()); self } }",
|
||||
"use std::sync::{Arc, Mutex}; let data = Arc::new(Mutex::new(vec![1, 2, 3]));",
|
||||
"if let Ok(value) = \"42\".parse::<i32>() { println!(\"parsed: {}\", value); }",
|
||||
"fn longest<'a>(x: &'a str, y: &'a str) -> &'a str { if x.len() > y.len() { x } else { y } }",
|
||||
"type Result<T> = std::result::Result<T, Box<dyn std::error::Error>>;",
|
||||
"macro_rules! vec_of_strings { ($($x:expr),*) => { vec![$($x.to_string()),*] }; }",
|
||||
"let (tx, rx) = std::sync::mpsc::channel(); tx.send(42).unwrap();",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -52,6 +129,19 @@ impl CodeSyntaxGenerator {
|
||||
"from collections import defaultdict",
|
||||
"lambda x: x * 2 + 1",
|
||||
"dict_comp = {k: v for k, v in pairs.items()}",
|
||||
"async def fetch(url): async with aiohttp.ClientSession() as session: return await session.get(url)",
|
||||
"def fibonacci(n): return n if n <= 1 else fibonacci(n-1) + fibonacci(n-2)",
|
||||
"@property def name(self): return self._name",
|
||||
"from dataclasses import dataclass; @dataclass class Config: name: str; value: int = 0",
|
||||
"yield from range(10)",
|
||||
"sorted(items, key=lambda x: x.name, reverse=True)",
|
||||
"from typing import Optional, List, Dict",
|
||||
"with contextlib.suppress(FileNotFoundError): os.remove(\"temp.txt\")",
|
||||
"class Meta(type): def __new__(cls, name, bases, attrs): return super().__new__(cls, name, bases, attrs)",
|
||||
"from functools import lru_cache; @lru_cache(maxsize=128) def expensive(n): return sum(range(n))",
|
||||
"from pathlib import Path; files = list(Path(\".\").glob(\"**/*.py\"))",
|
||||
"assert isinstance(result, dict), f\"Expected dict, got {type(result)}\"",
|
||||
"values = {*set_a, *set_b}; merged = {**dict_a, **dict_b}",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -69,6 +159,18 @@ impl CodeSyntaxGenerator {
|
||||
"try { parse(data); } catch (e) { console.error(e); }",
|
||||
"export default function handler(req, res) { res.send(\"ok\"); }",
|
||||
"const result = items.filter(x => x > 0).reduce((a, b) => a + b, 0);",
|
||||
"const promise = new Promise((resolve, reject) => { setTimeout(resolve, 1000); });",
|
||||
"const [first, ...rest] = array;",
|
||||
"class EventEmitter { constructor() { this.listeners = new Map(); } }",
|
||||
"const proxy = new Proxy(target, { get(obj, prop) { return obj[prop]; } });",
|
||||
"for await (const chunk of stream) { process(chunk); }",
|
||||
"const memoize = (fn) => { const cache = new Map(); return (...args) => cache.get(args) ?? fn(...args); };",
|
||||
"import { useState, useEffect } from 'react'; const [state, setState] = useState(null);",
|
||||
"const pipe = (...fns) => (x) => fns.reduce((v, f) => f(v), x);",
|
||||
"Object.entries(obj).forEach(([key, value]) => { console.log(key, value); });",
|
||||
"const debounce = (fn, ms) => { let timer; return (...args) => { clearTimeout(timer); timer = setTimeout(() => fn(...args), ms); }; };",
|
||||
"const observable = new Observable(subscriber => { subscriber.next(1); subscriber.complete(); });",
|
||||
"Symbol.iterator",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -84,6 +186,16 @@ impl CodeSyntaxGenerator {
|
||||
"switch val { case 1: return \"one\" default: return \"other\" }",
|
||||
"go func() { ch <- result }()",
|
||||
"defer file.Close()",
|
||||
"type Reader interface { Read(p []byte) (n int, err error) }",
|
||||
"ctx, cancel := context.WithTimeout(context.Background(), time.Second)",
|
||||
"var wg sync.WaitGroup; wg.Add(1); go func() { defer wg.Done() }()",
|
||||
"func (p *Point) Distance() float64 { return math.Sqrt(p.X*p.X + p.Y*p.Y) }",
|
||||
"select { case msg := <-ch: process(msg) case <-time.After(time.Second): timeout() }",
|
||||
"json.NewEncoder(w).Encode(response)",
|
||||
"http.HandleFunc(\"/api\", func(w http.ResponseWriter, r *http.Request) { w.Write([]byte(\"ok\")) })",
|
||||
"func Map[T, U any](s []T, f func(T) U) []U { r := make([]U, len(s)); for i, v := range s { r[i] = f(v) }; return r }",
|
||||
"var once sync.Once; once.Do(func() { initialize() })",
|
||||
"buf := bytes.NewBuffer(nil); buf.WriteString(\"hello\")",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -105,18 +217,88 @@ impl TextGenerator for CodeSyntaxGenerator {
|
||||
_focused: Option<char>,
|
||||
word_count: usize,
|
||||
) -> String {
|
||||
let snippets = self.get_snippets();
|
||||
// Try to fetch from GitHub on first use
|
||||
if self.fetched_snippets.is_empty() {
|
||||
self.try_fetch_code();
|
||||
}
|
||||
|
||||
let embedded = self.get_snippets();
|
||||
let mut result = Vec::new();
|
||||
let target_words = word_count;
|
||||
let mut current_words = 0;
|
||||
|
||||
let total_available = embedded.len() + self.fetched_snippets.len();
|
||||
|
||||
while current_words < target_words {
|
||||
let idx = self.rng.gen_range(0..snippets.len());
|
||||
let snippet = snippets[idx];
|
||||
let idx = self.rng.gen_range(0..total_available.max(1));
|
||||
|
||||
let snippet = if idx < embedded.len() {
|
||||
embedded[idx]
|
||||
} else if !self.fetched_snippets.is_empty() {
|
||||
let f_idx = (idx - embedded.len()) % self.fetched_snippets.len();
|
||||
&self.fetched_snippets[f_idx]
|
||||
} else {
|
||||
embedded[idx % embedded.len()]
|
||||
};
|
||||
|
||||
current_words += snippet.split_whitespace().count();
|
||||
result.push(snippet);
|
||||
result.push(snippet.to_string());
|
||||
}
|
||||
|
||||
result.join(" ")
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract function-length snippets from raw source code.
///
/// Scans for lines that open a function definition (Rust, Python, Go, or
/// JavaScript style), collects up to 30 lines until the opening
/// braces/parens are balanced again, and keeps whitespace-normalized
/// snippets of 20-500 characters. At most 50 snippets are returned.
fn extract_code_snippets(source: &str) -> Vec<String> {
    const MAX_SNIPPET_LINES: usize = 30;
    let all_lines: Vec<&str> = source.lines().collect();
    let mut found = Vec::new();

    let mut idx = 0;
    while idx < all_lines.len() {
        let trimmed = all_lines[idx].trim();
        // Heuristic: lines that begin a function/method definition.
        let starts_function = [
            "fn ",
            "pub fn ",
            "def ",
            "func ",
            "function ",
            "async fn ",
            "pub async fn ",
        ]
        .iter()
        .any(|p| trimmed.starts_with(p));

        if !starts_function {
            idx += 1;
            continue;
        }

        // Collect lines until the braces/parens opened so far close again
        // (a rough balance count; strings/comments are not parsed).
        let mut collected = Vec::new();
        let mut nesting = 0i32;
        let mut end = idx;
        while end < all_lines.len() && collected.len() < MAX_SNIPPET_LINES {
            let current = all_lines[end];
            collected.push(current);
            for ch in current.chars() {
                match ch {
                    '{' | '(' => nesting += 1,
                    '}' | ')' => nesting -= 1,
                    _ => {}
                }
            }
            if nesting <= 0 && end > idx {
                break;
            }
            end += 1;
        }

        // Keep reasonably-sized snippets, flattened to a single line.
        if (3..=MAX_SNIPPET_LINES).contains(&collected.len()) {
            let flattened = collected.join(" ");
            let compact = flattened.split_whitespace().collect::<Vec<_>>().join(" ");
            if (20..=500).contains(&compact.len()) {
                found.push(compact);
            }
        }

        // Resume scanning after the snippet we just examined.
        idx = end + 1;
    }

    found.truncate(50);
    found
}
|
||||
|
||||
45
src/generator/dictionary.rs
Normal file
45
src/generator/dictionary.rs
Normal file
@@ -0,0 +1,45 @@
|
||||
use crate::engine::filter::CharFilter;
|
||||
|
||||
const WORDS_EN: &str = include_str!("../../assets/words-en.json");
|
||||
|
||||
pub struct Dictionary {
|
||||
words: Vec<String>,
|
||||
}
|
||||
|
||||
impl Dictionary {
|
||||
pub fn load() -> Self {
|
||||
let words: Vec<String> = serde_json::from_str(WORDS_EN).unwrap_or_default();
|
||||
|
||||
// Filter to words of length >= 3 (matching keybr)
|
||||
let words = words
|
||||
.into_iter()
|
||||
.filter(|w| w.len() >= 3 && w.chars().all(|c| c.is_ascii_lowercase()))
|
||||
.collect();
|
||||
|
||||
Self { words }
|
||||
}
|
||||
|
||||
pub fn words_list(&self) -> Vec<String> {
|
||||
self.words.clone()
|
||||
}
|
||||
|
||||
pub fn find_matching(
|
||||
&self,
|
||||
filter: &CharFilter,
|
||||
focused: Option<char>,
|
||||
) -> Vec<&str> {
|
||||
let mut matching: Vec<&str> = self
|
||||
.words
|
||||
.iter()
|
||||
.filter(|w| w.chars().all(|c| filter.is_allowed(c)))
|
||||
.map(|s| s.as_str())
|
||||
.collect();
|
||||
|
||||
// If there's a focused letter, prioritize words containing it
|
||||
if let Some(focus) = focused {
|
||||
matching.sort_by_key(|w| if w.contains(focus) { 0 } else { 1 });
|
||||
}
|
||||
|
||||
matching
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,6 @@
|
||||
pub mod cache;
|
||||
pub mod code_syntax;
|
||||
pub mod dictionary;
|
||||
pub mod github_code;
|
||||
pub mod passage;
|
||||
pub mod phonetic;
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::engine::filter::CharFilter;
|
||||
use crate::generator::cache::{DiskCache, fetch_url};
|
||||
use crate::generator::TextGenerator;
|
||||
|
||||
const PASSAGES: &[&str] = &[
|
||||
// Classic literature & speeches
|
||||
"the quick brown fox jumps over the lazy dog and then runs across the field while the sun sets behind the distant hills",
|
||||
"it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness",
|
||||
"in the beginning there was nothing but darkness and then the light appeared slowly spreading across the vast empty space",
|
||||
@@ -17,21 +22,132 @@ const PASSAGES: &[&str] = &[
|
||||
"he picked up the book and began to read turning the pages slowly as the story drew him deeper and deeper into its world",
|
||||
"the stars shone brightly in the clear night sky and the moon cast a silver light over the sleeping town below",
|
||||
"they gathered around the fire telling stories and laughing while the wind howled outside and the snow piled up against the door",
|
||||
// Pride and Prejudice
|
||||
"it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife",
|
||||
"there is a stubbornness about me that never can bear to be frightened at the will of others my courage always rises at every attempt to intimidate me",
|
||||
"i could easily forgive his pride if he had not mortified mine but vanity not love has been my folly",
|
||||
// Alice in Wonderland
|
||||
"alice was beginning to get very tired of sitting by her sister on the bank and of having nothing to do",
|
||||
"who in the world am i that is the great puzzle she said as she looked around the strange room with wonder",
|
||||
"but i dont want to go among mad people alice remarked oh you cant help that said the cat were all mad here",
|
||||
// Great Gatsby
|
||||
"in my younger and more vulnerable years my father gave me some advice that i have been turning over in my mind ever since",
|
||||
"so we beat on boats against the current borne back ceaselessly into the past dreaming of that green light",
|
||||
// Sherlock Holmes
|
||||
"when you have eliminated the impossible whatever remains however improbable must be the truth my dear watson",
|
||||
"the world is full of obvious things which nobody by any chance ever observes but which are perfectly visible",
|
||||
// Moby Dick
|
||||
"call me ishmael some years ago having little or no money in my purse and nothing particular to interest me on shore",
|
||||
"it is not down on any map because true places never are and the voyage was long and the sea was deep",
|
||||
// 1984
|
||||
"it was a bright cold day in april and the clocks were striking thirteen winston smith his chin nuzzled into his breast",
|
||||
"who controls the past controls the future and who controls the present controls the past said the voice from the screen",
|
||||
// Walden
|
||||
"i went to the woods because i wished to live deliberately to front only the essential facts of life",
|
||||
"the mass of men lead lives of quiet desperation and go to the grave with the song still in them",
|
||||
// Science & philosophy
|
||||
"the only way to do great work is to love what you do and if you have not found it yet keep looking and do not settle",
|
||||
"imagination is more important than knowledge for while knowledge defines all we currently know imagination points to what we might discover",
|
||||
"the important thing is not to stop questioning for curiosity has its own reason for existing in this wonderful universe",
|
||||
"we are all in the gutter but some of us are looking at the stars and dreaming of worlds beyond our own",
|
||||
"the greatest glory in living lies not in never falling but in rising every time we fall and trying once more",
|
||||
// Nature & observation
|
||||
"the autumn wind scattered golden leaves across the garden as the last rays of sunlight painted the clouds in shades of orange and pink",
|
||||
"deep in the forest where the ancient trees stood tall and silent a small stream wound its way through moss covered stones",
|
||||
"the ocean stretched endlessly before them its surface catching the light of the setting sun in a thousand shimmering reflections",
|
||||
"morning mist hung low over the meadow as the first birds began their chorus and dew drops sparkled like diamonds on every blade of grass",
|
||||
"the mountain peak stood above the clouds its snow covered summit glowing pink and gold in the light of the early morning sun",
|
||||
// Everyday wisdom
|
||||
"the best time to plant a tree was twenty years ago and the second best time is now so do not wait any longer to begin",
|
||||
"a journey of a thousand miles begins with a single step and every great achievement started with the decision to try",
|
||||
"the more that you read the more things you will know and the more that you learn the more places you will go",
|
||||
"in three words i can sum up everything i have learned about life it goes on and so must we with hope",
|
||||
"happiness is not something ready made it comes from your own actions and your choices shape the life you live",
|
||||
"do not go where the path may lead but go instead where there is no path and leave a trail for others to follow",
|
||||
"success is not final failure is not fatal it is the courage to continue that counts in the end",
|
||||
"be yourself because everyone else is already taken and the world needs what only you can bring to it",
|
||||
"life is what happens when you are busy making other plans so enjoy the journey along the way",
|
||||
"the secret of getting ahead is getting started and the secret of getting started is breaking your tasks into small steps",
|
||||
];
|
||||
|
||||
/// Gutenberg book IDs for popular public domain works
///
/// Each entry pairs a Project Gutenberg ebook ID (used to build the
/// download URL `https://www.gutenberg.org/cache/epub/<id>/pg<id>.txt`)
/// with a short name that serves as the on-disk cache key.
const GUTENBERG_IDS: &[(u32, &str)] = &[
    (1342, "pride_and_prejudice"),
    (11, "alice_in_wonderland"),
    (1661, "sherlock_holmes"),
    (84, "frankenstein"),
    (1952, "yellow_wallpaper"),
    (2701, "moby_dick"),
    (74, "tom_sawyer"),
    (345, "dracula"),
    (1232, "prince"),
    (76, "huckleberry_finn"),
    (5200, "metamorphosis"),
    (2542, "aesop_fables"),
    (174, "dorian_gray"),
    (98, "tale_two_cities"),
    (1080, "modest_proposal"),
    (219, "heart_of_darkness"),
    (4300, "ulysses"),
    (28054, "brothers_karamazov"),
    (2554, "crime_and_punishment"),
    (55, "oz"),
];
|
||||
|
||||
pub struct PassageGenerator {
|
||||
current_idx: usize,
|
||||
fetched_passages: Vec<String>,
|
||||
rng: SmallRng,
|
||||
}
|
||||
|
||||
impl PassageGenerator {
|
||||
pub fn new() -> Self {
|
||||
Self { current_idx: 0 }
|
||||
pub fn new(rng: SmallRng) -> Self {
|
||||
let mut generator = Self {
|
||||
current_idx: 0,
|
||||
fetched_passages: Vec::new(),
|
||||
rng,
|
||||
};
|
||||
generator.load_cached_passages();
|
||||
generator
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for PassageGenerator {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
fn load_cached_passages(&mut self) {
|
||||
if let Some(cache) = DiskCache::new("passages") {
|
||||
for &(_, name) in GUTENBERG_IDS {
|
||||
if let Some(content) = cache.get(name) {
|
||||
let paragraphs = extract_paragraphs(&content);
|
||||
self.fetched_passages.extend(paragraphs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn try_fetch_gutenberg(&mut self) {
|
||||
let cache = match DiskCache::new("passages") {
|
||||
Some(c) => c,
|
||||
None => return,
|
||||
};
|
||||
|
||||
// Pick a random book that we haven't cached yet
|
||||
let uncached: Vec<(u32, &str)> = GUTENBERG_IDS
|
||||
.iter()
|
||||
.filter(|(_, name)| cache.get(name).is_none())
|
||||
.copied()
|
||||
.collect();
|
||||
|
||||
if uncached.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let idx = self.rng.gen_range(0..uncached.len());
|
||||
let (book_id, name) = uncached[idx];
|
||||
let url = format!("https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt");
|
||||
|
||||
if let Some(content) = fetch_url(&url) {
|
||||
cache.put(name, &content);
|
||||
let paragraphs = extract_paragraphs(&content);
|
||||
self.fetched_passages.extend(paragraphs);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -42,8 +158,87 @@ impl TextGenerator for PassageGenerator {
|
||||
_focused: Option<char>,
|
||||
_word_count: usize,
|
||||
) -> String {
|
||||
let passage = PASSAGES[self.current_idx % PASSAGES.len()];
|
||||
// Try to fetch a new Gutenberg book in the background (first few calls)
|
||||
if self.fetched_passages.len() < 50 && self.current_idx < 3 {
|
||||
self.try_fetch_gutenberg();
|
||||
}
|
||||
|
||||
let total_passages = PASSAGES.len() + self.fetched_passages.len();
|
||||
|
||||
if total_passages == 0 {
|
||||
self.current_idx += 1;
|
||||
return PASSAGES[0].to_string();
|
||||
}
|
||||
|
||||
// Mix embedded and fetched passages
|
||||
let idx = self.current_idx % total_passages;
|
||||
self.current_idx += 1;
|
||||
passage.to_string()
|
||||
|
||||
if idx < PASSAGES.len() {
|
||||
PASSAGES[idx].to_string()
|
||||
} else {
|
||||
let fetched_idx = idx - PASSAGES.len();
|
||||
self.fetched_passages[fetched_idx % self.fetched_passages.len()].clone()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract readable paragraphs from Gutenberg text, skipping header/footer.
///
/// Returns lowercase, letters-and-spaces-only passages of 10+ typeable
/// words, drawn from paragraphs of 15-60 cleaned words. At most 100
/// paragraphs are returned per book.
fn extract_paragraphs(text: &str) -> Vec<String> {
    let mut paragraphs = Vec::new();

    // Find the start of actual content (after the Gutenberg header line).
    let start_markers = ["*** START OF", "***START OF"];
    let end_markers = ["*** END OF", "***END OF"];

    let content_start = start_markers
        .iter()
        .filter_map(|marker| text.find(marker))
        .min()
        .map(|pos| {
            // Skip past the remainder of the marker line itself.
            text[pos..].find('\n').map(|nl| pos + nl + 1).unwrap_or(pos)
        })
        .unwrap_or(0);

    let content_end = end_markers
        .iter()
        .filter_map(|marker| text.find(marker))
        .min()
        .unwrap_or(text.len());

    // Normalize line endings so each paragraph is visited exactly once.
    // Splitting the raw text on both "\r\n\r\n" and "\n\n" (as before)
    // emitted every matching paragraph twice.
    let content = text[content_start..content_end].replace("\r\n", "\n");

    // Split into paragraphs (double newline separated).
    for para in content.split("\n\n") {
        let cleaned: String = para
            .lines()
            .map(str::trim)
            .collect::<Vec<_>>()
            .join(" ")
            .chars()
            .filter(|c| c.is_ascii_alphanumeric() || c.is_ascii_whitespace() || c.is_ascii_punctuation())
            .collect::<String>()
            .to_lowercase();

        let word_count = cleaned.split_whitespace().count();
        if (15..=60).contains(&word_count) {
            // Keep only the alpha/space portions for typing.
            let typing_text: String = cleaned
                .chars()
                .filter(|c| c.is_ascii_lowercase() || *c == ' ')
                .collect::<String>()
                .split_whitespace()
                .collect::<Vec<_>>()
                .join(" ");

            if typing_text.split_whitespace().count() >= 10 {
                paragraphs.push(typing_text);
            }
        }
    }

    // Take at most 100 paragraphs per book.
    paragraphs.truncate(100);
    paragraphs
}
|
||||
|
||||
@@ -2,17 +2,27 @@ use rand::rngs::SmallRng;
|
||||
use rand::Rng;
|
||||
|
||||
use crate::engine::filter::CharFilter;
|
||||
use crate::generator::dictionary::Dictionary;
|
||||
use crate::generator::transition_table::TransitionTable;
|
||||
use crate::generator::TextGenerator;
|
||||
|
||||
const MIN_WORD_LEN: usize = 3;
|
||||
const MAX_WORD_LEN: usize = 10;
|
||||
const MIN_REAL_WORDS: usize = 15;
|
||||
|
||||
pub struct PhoneticGenerator {
|
||||
table: TransitionTable,
|
||||
dictionary: Dictionary,
|
||||
rng: SmallRng,
|
||||
}
|
||||
|
||||
impl PhoneticGenerator {
|
||||
pub fn new(table: TransitionTable, rng: SmallRng) -> Self {
|
||||
Self { table, rng }
|
||||
pub fn new(table: TransitionTable, dictionary: Dictionary, rng: SmallRng) -> Self {
|
||||
Self {
|
||||
table,
|
||||
dictionary,
|
||||
rng,
|
||||
}
|
||||
}
|
||||
|
||||
fn pick_weighted_from(
|
||||
@@ -46,28 +56,30 @@ impl PhoneticGenerator {
|
||||
Some(filtered.last().unwrap().0)
|
||||
}
|
||||
|
||||
fn generate_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
|
||||
let min_len = 3;
|
||||
let max_len = 10;
|
||||
let mut word = String::new();
|
||||
fn generate_phonetic_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
|
||||
for _attempt in 0..5 {
|
||||
let word = self.try_generate_word(filter, focused);
|
||||
if word.len() >= MIN_WORD_LEN {
|
||||
return word;
|
||||
}
|
||||
}
|
||||
// Fallback
|
||||
"the".to_string()
|
||||
}
|
||||
|
||||
fn try_generate_word(&mut self, filter: &CharFilter, focused: Option<char>) -> String {
|
||||
let mut word = Vec::new();
|
||||
|
||||
// Start with space prefix
|
||||
let start_char = if let Some(focus) = focused {
|
||||
if self.rng.gen_bool(0.4) {
|
||||
let probs = self.table.get_next_probs(' ', focus).cloned();
|
||||
if let Some(probs) = probs {
|
||||
let filtered: Vec<(char, f64)> = probs
|
||||
.iter()
|
||||
.filter(|(ch, _)| filter.is_allowed(*ch))
|
||||
.copied()
|
||||
.collect();
|
||||
if !filtered.is_empty() {
|
||||
word.push(focus);
|
||||
Self::pick_weighted_from(&mut self.rng, &filtered, filter)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
if self.rng.gen_bool(0.4) && filter.is_allowed(focus) {
|
||||
word.push(focus);
|
||||
// Get next char from transition table
|
||||
let prefix = vec![' ', ' ', focus];
|
||||
if let Some(probs) = self.table.segment(&prefix) {
|
||||
Self::pick_weighted_from(&mut self.rng, probs, filter)
|
||||
} else {
|
||||
Some(focus)
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
@@ -77,25 +89,32 @@ impl PhoneticGenerator {
|
||||
};
|
||||
|
||||
if word.is_empty() {
|
||||
let starters: Vec<(char, f64)> = filter
|
||||
.allowed
|
||||
.iter()
|
||||
.map(|&ch| {
|
||||
(
|
||||
ch,
|
||||
if ch == 'e' || ch == 't' || ch == 'a' {
|
||||
3.0
|
||||
} else {
|
||||
1.0
|
||||
},
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
||||
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, &starters, filter) {
|
||||
word.push(ch);
|
||||
} else {
|
||||
return "the".to_string();
|
||||
// Pick a start from transition table
|
||||
let prefix = vec![' ', ' ', ' '];
|
||||
if let Some(probs) = self.table.segment(&prefix) {
|
||||
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, probs, filter) {
|
||||
word.push(ch);
|
||||
}
|
||||
}
|
||||
// Fallback: weighted random start
|
||||
if word.is_empty() {
|
||||
let starters: Vec<(char, f64)> = filter
|
||||
.allowed
|
||||
.iter()
|
||||
.map(|&ch| {
|
||||
let w = match ch {
|
||||
'e' | 't' | 'a' => 3.0,
|
||||
'o' | 'i' | 'n' | 's' => 2.0,
|
||||
_ => 1.0,
|
||||
};
|
||||
(ch, w)
|
||||
})
|
||||
.collect();
|
||||
if let Some(ch) = Self::pick_weighted_from(&mut self.rng, &starters, filter) {
|
||||
word.push(ch);
|
||||
} else {
|
||||
return "the".to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,33 +122,60 @@ impl PhoneticGenerator {
|
||||
word.push(ch);
|
||||
}
|
||||
|
||||
while word.len() < max_len {
|
||||
let chars: Vec<char> = word.chars().collect();
|
||||
let len = chars.len();
|
||||
|
||||
let (prev, curr) = if len >= 2 {
|
||||
(chars[len - 2], chars[len - 1])
|
||||
while word.len() < MAX_WORD_LEN {
|
||||
// Build prefix from recent chars, padded with spaces
|
||||
let prefix_len = self.table.order - 1;
|
||||
let mut prefix = Vec::new();
|
||||
let start = if word.len() >= prefix_len {
|
||||
word.len() - prefix_len
|
||||
} else {
|
||||
(' ', chars[len - 1])
|
||||
0
|
||||
};
|
||||
|
||||
let space_prob = 1.3f64.powi(word.len() as i32 - min_len as i32);
|
||||
if word.len() >= min_len
|
||||
&& self
|
||||
.rng
|
||||
.gen_bool((space_prob / (space_prob + 5.0)).min(0.8))
|
||||
{
|
||||
break;
|
||||
for _ in 0..(prefix_len.saturating_sub(word.len())) {
|
||||
prefix.push(' ');
|
||||
}
|
||||
for i in start..word.len() {
|
||||
prefix.push(word[i]);
|
||||
}
|
||||
|
||||
let probs = self.table.get_next_probs(prev, curr).cloned();
|
||||
if let Some(probs) = probs {
|
||||
if let Some(next) = Self::pick_weighted_from(&mut self.rng, &probs, filter) {
|
||||
// Check for word ending (space probability increases with length)
|
||||
if word.len() >= MIN_WORD_LEN {
|
||||
if let Some(probs) = self.table.segment(&prefix) {
|
||||
let space_weight: f64 = probs
|
||||
.iter()
|
||||
.filter(|(ch, _)| *ch == ' ')
|
||||
.map(|(_, w)| w)
|
||||
.sum();
|
||||
if space_weight > 0.0 {
|
||||
let boost = 1.3f64.powi(word.len() as i32 - MIN_WORD_LEN as i32);
|
||||
let total: f64 = probs.iter().map(|(_, w)| w).sum();
|
||||
let space_prob = (space_weight * boost) / (total + space_weight * (boost - 1.0));
|
||||
if self.rng.gen_bool(space_prob.min(0.85)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Even without space in table, use length-based ending
|
||||
let end_prob = 1.3f64.powi(word.len() as i32 - MIN_WORD_LEN as i32);
|
||||
if self.rng.gen_bool((end_prob / (end_prob + 5.0)).min(0.8)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Get next character from transition table
|
||||
if let Some(probs) = self.table.segment(&prefix) {
|
||||
let non_space: Vec<(char, f64)> = probs
|
||||
.iter()
|
||||
.filter(|(ch, _)| *ch != ' ')
|
||||
.copied()
|
||||
.collect();
|
||||
if let Some(next) = Self::pick_weighted_from(&mut self.rng, &non_space, filter) {
|
||||
word.push(next);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Fallback to vowel
|
||||
let vowels: Vec<(char, f64)> = ['a', 'e', 'i', 'o', 'u']
|
||||
.iter()
|
||||
.filter(|&&v| filter.is_allowed(v))
|
||||
@@ -143,11 +189,7 @@ impl PhoneticGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
if word.is_empty() {
|
||||
"the".to_string()
|
||||
} else {
|
||||
word
|
||||
}
|
||||
word.iter().collect()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -158,10 +200,42 @@ impl TextGenerator for PhoneticGenerator {
|
||||
focused: Option<char>,
|
||||
word_count: usize,
|
||||
) -> String {
|
||||
// keybr's approach: prefer real words when enough match the filter
|
||||
// Collect matching words into owned Vec to avoid borrow conflict
|
||||
let matching_words: Vec<String> = self
|
||||
.dictionary
|
||||
.find_matching(filter, focused)
|
||||
.iter()
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let use_real_words = matching_words.len() >= MIN_REAL_WORDS;
|
||||
|
||||
let mut words: Vec<String> = Vec::new();
|
||||
let mut last_word = String::new();
|
||||
|
||||
for _ in 0..word_count {
|
||||
words.push(self.generate_word(filter, focused));
|
||||
if use_real_words {
|
||||
// Pick a real word (avoid consecutive duplicates)
|
||||
let mut picked = None;
|
||||
for _ in 0..3 {
|
||||
let idx = self.rng.gen_range(0..matching_words.len());
|
||||
let word = matching_words[idx].clone();
|
||||
if word != last_word {
|
||||
picked = Some(word);
|
||||
break;
|
||||
}
|
||||
}
|
||||
let word = match picked {
|
||||
Some(w) => w,
|
||||
None => self.generate_phonetic_word(filter, focused),
|
||||
};
|
||||
last_word.clone_from(&word);
|
||||
words.push(word);
|
||||
} else {
|
||||
// Fall back to phonetic pseudo-words
|
||||
let word = self.generate_phonetic_word(filter, focused);
|
||||
words.push(word);
|
||||
}
|
||||
}
|
||||
|
||||
words.join(" ")
|
||||
|
||||
@@ -4,29 +4,108 @@ use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||
pub struct TransitionTable {
|
||||
pub transitions: HashMap<(char, char), Vec<(char, f64)>>,
|
||||
pub order: usize,
|
||||
transitions: HashMap<Vec<char>, Vec<(char, f64)>>,
|
||||
}
|
||||
|
||||
impl TransitionTable {
|
||||
pub fn new() -> Self {
|
||||
pub fn new(order: usize) -> Self {
|
||||
Self {
|
||||
order,
|
||||
transitions: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(&mut self, prev: char, curr: char, next: char, weight: f64) {
|
||||
pub fn add(&mut self, prefix: &[char], next: char, weight: f64) {
|
||||
self.transitions
|
||||
.entry((prev, curr))
|
||||
.entry(prefix.to_vec())
|
||||
.or_default()
|
||||
.push((next, weight));
|
||||
}
|
||||
|
||||
pub fn get_next_probs(&self, prev: char, curr: char) -> Option<&Vec<(char, f64)>> {
|
||||
self.transitions.get(&(prev, curr))
|
||||
pub fn segment(&self, prefix: &[char]) -> Option<&Vec<(char, f64)>> {
|
||||
// Try exact prefix match first, then fall back to shorter prefixes
|
||||
let key_len = self.order - 1;
|
||||
let prefix = if prefix.len() >= key_len {
|
||||
&prefix[prefix.len() - key_len..]
|
||||
} else {
|
||||
prefix
|
||||
};
|
||||
|
||||
// Try progressively shorter prefixes for backoff
|
||||
for start in 0..prefix.len() {
|
||||
let key = prefix[start..].to_vec();
|
||||
if let Some(entries) = self.transitions.get(&key) {
|
||||
return Some(entries);
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Build an order-4 transition table from a word frequency list.
|
||||
/// Words earlier in the list are higher frequency and get more weight.
|
||||
pub fn build_from_words(words: &[String]) -> Self {
|
||||
let mut table = Self::new(4);
|
||||
let prefix_len = 3; // order - 1
|
||||
|
||||
for (rank, word) in words.iter().enumerate() {
|
||||
if word.len() < 3 {
|
||||
continue;
|
||||
}
|
||||
if !word.chars().all(|c| c.is_ascii_lowercase()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Weight decreases with rank (frequency-based)
|
||||
let weight = 1.0 / (1.0 + (rank as f64 / 500.0));
|
||||
|
||||
// Add word start transitions (space prefix -> first chars)
|
||||
let chars: Vec<char> = word.chars().collect();
|
||||
|
||||
// Start of word: ' ' prefix
|
||||
for i in 0..chars.len() {
|
||||
let mut prefix = Vec::new();
|
||||
// Build prefix from space + preceding chars
|
||||
let start = if i >= prefix_len { i - prefix_len } else { 0 };
|
||||
if i < prefix_len {
|
||||
// Pad with spaces
|
||||
for _ in 0..(prefix_len - i) {
|
||||
prefix.push(' ');
|
||||
}
|
||||
}
|
||||
for j in start..i {
|
||||
prefix.push(chars[j]);
|
||||
}
|
||||
|
||||
let next = chars[i];
|
||||
table.add(&prefix, next, weight);
|
||||
}
|
||||
|
||||
// End of word: last chars -> space
|
||||
let end_start = if chars.len() >= prefix_len {
|
||||
chars.len() - prefix_len
|
||||
} else {
|
||||
0
|
||||
};
|
||||
let mut end_prefix: Vec<char> = Vec::new();
|
||||
if chars.len() < prefix_len {
|
||||
for _ in 0..(prefix_len - chars.len()) {
|
||||
end_prefix.push(' ');
|
||||
}
|
||||
}
|
||||
for j in end_start..chars.len() {
|
||||
end_prefix.push(chars[j]);
|
||||
}
|
||||
table.add(&end_prefix, ' ', weight);
|
||||
}
|
||||
|
||||
table
|
||||
}
|
||||
|
||||
/// Legacy order-2 table for fallback
|
||||
#[allow(dead_code)]
|
||||
pub fn build_english() -> Self {
|
||||
let mut table = Self::new();
|
||||
let mut table = Self::new(4);
|
||||
|
||||
let common_patterns: &[(&str, f64)] = &[
|
||||
("the", 10.0), ("and", 8.0), ("ing", 7.0), ("tion", 6.0), ("ent", 5.0),
|
||||
@@ -40,25 +119,24 @@ impl TransitionTable {
|
||||
("ght", 2.0), ("whi", 2.0), ("who", 2.0), ("hen", 2.0), ("ter", 2.0),
|
||||
("man", 2.0), ("men", 2.0), ("ner", 2.0), ("per", 2.0), ("pre", 2.0),
|
||||
("ran", 2.0), ("lin", 2.0), ("kin", 2.0), ("din", 2.0), ("sin", 2.0),
|
||||
("out", 2.0), ("ind", 2.0), ("ith", 2.0), ("ber", 2.0), ("der", 2.0),
|
||||
("out", 2.0), ("ind", 2.0), ("ber", 2.0), ("der", 2.0),
|
||||
("end", 2.0), ("hin", 2.0), ("old", 2.0), ("ear", 2.0), ("ain", 2.0),
|
||||
("ant", 2.0), ("urn", 2.0), ("ell", 2.0), ("ill", 2.0), ("ade", 2.0),
|
||||
("igh", 2.0), ("ong", 2.0), ("ung", 2.0), ("ast", 2.0), ("ist", 2.0),
|
||||
("ong", 2.0), ("ung", 2.0), ("ast", 2.0), ("ist", 2.0),
|
||||
("ust", 2.0), ("ost", 2.0), ("ard", 2.0), ("ord", 2.0), ("art", 2.0),
|
||||
("ort", 2.0), ("ect", 2.0), ("act", 2.0), ("ack", 2.0), ("ick", 2.0),
|
||||
("ock", 2.0), ("uck", 2.0), ("ash", 2.0), ("ish", 2.0), ("ush", 2.0),
|
||||
("anc", 1.5), ("enc", 1.5), ("inc", 1.5), ("onc", 1.5), ("unc", 1.5),
|
||||
("unt", 1.5), ("int", 1.5), ("ont", 1.5), ("ent", 1.5), ("ment", 1.5),
|
||||
("ness", 1.5), ("less", 1.5), ("able", 1.5), ("ible", 1.5), ("ting", 1.5),
|
||||
("ring", 1.5), ("sing", 1.5), ("king", 1.5), ("ning", 1.5), ("ling", 1.5),
|
||||
("wing", 1.5), ("ding", 1.5), ("ping", 1.5), ("ging", 1.5), ("ving", 1.5),
|
||||
("bing", 1.5), ("ming", 1.5), ("fing", 1.0), ("hing", 1.0), ("cing", 1.0),
|
||||
];
|
||||
|
||||
for &(pattern, weight) in common_patterns {
|
||||
let chars: Vec<char> = pattern.chars().collect();
|
||||
for window in chars.windows(3) {
|
||||
table.add(window[0], window[1], window[2], weight);
|
||||
let prefix = vec![window[0], window[1]];
|
||||
table.add(&prefix, window[2], weight);
|
||||
}
|
||||
// Also add shorter prefix entries for the start of patterns
|
||||
if chars.len() >= 2 {
|
||||
table.add(&[' ', chars[0]], chars[1], weight * 0.5);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -70,20 +148,14 @@ impl TransitionTable {
|
||||
|
||||
for &c in &consonants {
|
||||
for &v in &vowels {
|
||||
table.add(' ', c, v, 1.0);
|
||||
table.add(v, c, 'e', 0.5);
|
||||
for &v2 in &vowels {
|
||||
table.add(c, v, v2.to_ascii_lowercase(), 0.3);
|
||||
}
|
||||
for &c2 in &consonants {
|
||||
table.add(v, c, c2, 0.2);
|
||||
}
|
||||
table.add(&[' ', c], v, 1.0);
|
||||
table.add(&[v, c], 'e', 0.5);
|
||||
}
|
||||
}
|
||||
|
||||
for &v in &vowels {
|
||||
for &c in &consonants {
|
||||
table.add(' ', v, c, 0.5);
|
||||
table.add(&[' ', v], c, 0.5);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,6 +165,6 @@ impl TransitionTable {
|
||||
|
||||
impl Default for TransitionTable {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
Self::new(4)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user