From 151ff191cc6821f016519f1a6b75d9d24b0cf5c6 Mon Sep 17 00:00:00 2001 From: Tyler Hallada Date: Fri, 23 Jul 2021 23:14:08 -0400 Subject: [PATCH] Parse strings in Windows-1252 encoding According to UESP, this is what strings in plugin files are encoded in: https://en.uesp.net/wiki/Skyrim_Mod:File_Format_Conventions Decoding from this encoding fixes parsing some files with zstrings that contain special characters like umlauts. --- Cargo.lock | 10 ++++++++++ Cargo.toml | 1 + src/parser.rs | 15 ++++++++------- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 874974e..68590a8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -88,6 +88,15 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "encoding_rs" +version = "0.8.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80df024fbc5ac80f87dfef0d9f5209a252f2a497f7f42944cff24d8253cac065" +dependencies = [ + "cfg-if", +] + [[package]] name = "flate2" version = "1.0.20" @@ -237,6 +246,7 @@ dependencies = [ "anyhow", "argh", "bitflags", + "encoding_rs", "flate2", "nom", "serde", diff --git a/Cargo.toml b/Cargo.toml index 96fe616..33b8ee8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,6 +14,7 @@ license = "MIT" anyhow = "1.0" argh = { version = "0.1", optional = true } bitflags = "1.2" +encoding_rs = "0.8" flate2 = "1.0" nom = "6" serde = { version = "1.0", features = ["derive"] } diff --git a/src/parser.rs b/src/parser.rs index 17c62b4..ae199fb 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1,7 +1,9 @@ +use std::borrow::Cow; use std::io::Read; use std::{convert::TryInto, str}; use anyhow::{anyhow, Result}; +use encoding_rs::WINDOWS_1252; use flate2::read::ZlibDecoder; use nom::{ branch::alt, @@ -32,9 +34,9 @@ pub struct PluginHeader<'a> { pub version: f32, pub num_records_and_groups: i32, pub next_object_id: u32, - pub author: Option<&'a str>, - pub description: Option<&'a str>, - pub masters: Vec<&'a str>, + pub author: Option<Cow<'a, str>>, + pub description: 
Option<Cow<'a, str>>, + pub masters: Vec<Cow<'a, str>>, } /// Parsed [CELL records](https://en.uesp.net/wiki/Skyrim_Mod:Mod_File_Format/CELL) @@ -526,10 +528,9 @@ fn parse_4char(input: &[u8]) -> IResult<&[u8], &str> { map_res(take(4usize), |bytes: &[u8]| str::from_utf8(bytes))(input) } -fn parse_zstring(input: &[u8]) -> IResult<&[u8], &str> { - let (input, zstring) = map_res(take_while(|byte| byte != 0), |bytes: &[u8]| { - str::from_utf8(bytes) - })(input)?; +fn parse_zstring(input: &[u8]) -> IResult<&[u8], Cow<str>> { + let (input, bytes) = take_while(|byte| byte != 0)(input)?; + let (zstring, _, _) = WINDOWS_1252.decode(bytes); let (input, _) = take(1usize)(input)?; Ok((input, zstring)) }