Dump plugins by hash instead of id

The exact same plugin is often included in many different files/mods. Grouping by hash in the SQL query shows every file and mod that a plugin appears in. Previously, the last file that was processed would overwrite the plugin's dump file, so only that single file/mod would appear in the dump.

Also adds an index on plugins.hash since I'm now querying directly on it.
This commit is contained in:
Tyler Hallada 2022-03-12 13:05:21 -05:00
parent c9dcd3b7c5
commit 5d55e78283
3 changed files with 24 additions and 34 deletions

View File

@ -0,0 +1 @@
CREATE INDEX ON plugins (hash);

View File

@ -10,7 +10,7 @@ fn format_radix(mut x: u64, radix: u32) -> String {
let mut result = vec![];
loop {
let m = x % radix as u64;
x = x / radix as u64;
x /= radix as u64;
// will panic if you use a bad radix (< 2 or > 36).
result.push(std::char::from_digit(m as u32, radix).unwrap());
@ -23,10 +23,10 @@ fn format_radix(mut x: u64, radix: u32) -> String {
pub async fn dump_plugin_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> Result<()> {
let page_size = 20;
let mut last_id = None;
let mut last_hash = None;
loop {
let plugins =
plugin::batched_get_with_data(&pool, page_size, last_id, "Skyrim.esm", 1).await?;
plugin::batched_get_by_hash_with_mods(pool, page_size, last_hash, "Skyrim.esm", 1).await?;
if plugins.is_empty() {
break;
}
@ -36,8 +36,8 @@ pub async fn dump_plugin_data(pool: &sqlx::Pool<sqlx::Postgres>, dir: &str) -> R
let path = path.join(format!("{}.json", format_radix(plugin.hash as u64, 36)));
let mut file = File::create(path)?;
write!(file, "{}", serde_json::to_string(&plugin)?)?;
last_id = Some(plugin.id);
last_hash = Some(plugin.hash);
}
}
return Ok(());
Ok(())
}

View File

@ -38,23 +38,11 @@ pub struct UnsavedPlugin<'a> {
}
#[derive(Debug, Serialize, Deserialize, FromRow)]
pub struct PluginWithData {
pub id: i32,
pub name: String,
pub struct PluginsByHashWithMods {
pub hash: i64,
pub file_id: i32,
pub mod_id: i32,
pub version: f64,
pub size: i64,
pub author: Option<String>,
pub description: Option<String>,
pub masters: Vec<String>,
pub file_name: String,
pub file_path: String,
pub updated_at: NaiveDateTime,
pub created_at: NaiveDateTime,
pub file: Option<serde_json::Value>,
pub r#mod: Option<serde_json::Value>,
pub plugins: Option<serde_json::Value>,
pub files: Option<serde_json::Value>,
pub mods: Option<serde_json::Value>,
pub cells: Option<serde_json::Value>,
}
@ -90,36 +78,37 @@ pub async fn insert<'a>(
}
#[instrument(level = "debug", skip(pool))]
pub async fn batched_get_with_data(
pub async fn batched_get_by_hash_with_mods(
pool: &sqlx::Pool<sqlx::Postgres>,
page_size: i64,
last_id: Option<i32>,
last_hash: Option<i64>,
master: &str,
world_id: i32,
) -> Result<Vec<PluginWithData>> {
let last_id = last_id.unwrap_or(0);
) -> Result<Vec<PluginsByHashWithMods>> {
let last_hash = last_hash.unwrap_or(-9223372036854775808); // psql bigint min
sqlx::query_as!(
PluginWithData,
PluginsByHashWithMods,
"SELECT
plugins.*,
json_agg(DISTINCT files.*) as file,
json_agg(DISTINCT mods.*) as mod,
plugins.hash,
json_agg(DISTINCT plugins.*) as plugins,
json_agg(DISTINCT files.*) as files,
json_agg(DISTINCT mods.*) as mods,
COALESCE(json_agg(DISTINCT jsonb_build_object('x', cells.x, 'y', cells.y)) FILTER (WHERE cells.x IS NOT NULL AND cells.y IS NOT NULL AND cells.master = $3 AND cells.world_id = $4), '[]') AS cells
FROM plugins
LEFT OUTER JOIN files ON files.id = plugins.file_id
LEFT OUTER JOIN mods ON mods.id = files.mod_id
LEFT OUTER JOIN plugin_cells ON plugin_cells.plugin_id = plugins.id
LEFT OUTER JOIN cells ON cells.id = plugin_cells.cell_id
WHERE plugins.id > $2
GROUP BY plugins.id
ORDER BY plugins.id ASC
WHERE plugins.hash > $2
GROUP BY plugins.hash
ORDER BY plugins.hash ASC
LIMIT $1",
page_size,
last_id,
last_hash,
master,
world_id
)
.fetch_all(pool)
.await
.context("Failed to batch get with data")
.context("Failed to batch get by hash with mods")
}