// bookdata/cli/cluster/author_gender/mod.rs
use std::path::{Path, PathBuf};
use parquet_derive::ParquetRecordWriter;
use serde::{Deserialize, Serialize};
use crate::arrow::*;
use crate::ids::codes::*;
use crate::prelude::*;
mod authors;
mod clusters;
/// Extract author gender labels for book clusters.
///
/// Reads a cluster-author file, resolves each cluster's author genders, and
/// writes one labelled row per cluster to the output file.
#[derive(Args, Debug)]
#[command(name = "extract-author-genders")]
pub struct AuthorGender {
    /// Output file for the per-cluster gender table.
    #[arg(short = 'o', long = "output")]
    output: PathBuf,

    /// Cluster-author file to resolve genders from.
    #[arg(short = 'A', long = "cluster-authors")]
    author_file: PathBuf,
}
/// One output row: a cluster and the gender label assigned to it.
#[derive(Serialize, Deserialize, Clone, ParquetRecordWriter)]
struct ClusterGenderInfo {
/// Cluster identifier, taken from the cluster list given to `save_genders`.
cluster: i32,
/// Gender label: one of the placeholders `no-book`, `no-book-author`,
/// `no-author-rec`, `no-gender`, or the string form of the resolved gender.
gender: String,
}
/// Write one gender label per cluster to a table at `outf`.
///
/// Every entry of `clusters` produces exactly one row. The label comes from
/// the cluster's entry in `genders` when there is one; otherwise a placeholder
/// is chosen based on whether the cluster code belongs to `NS_ISBN`.
///
/// # Errors
///
/// Propagates any error from opening, writing to, or finishing the writer.
///
/// # Panics
///
/// Panics if a cluster with no book authors or no author records nevertheless
/// has a non-empty gender set.
fn save_genders(clusters: Vec<i32>, genders: clusters::ClusterTable, outf: &Path) -> Result<()> {
    info!("writing cluster genders to {}", outf.display());
    let mut out = TableWriter::open(outf)?;

    for cluster in clusters {
        // Label used only when the cluster has no entry in `genders`.
        let fallback = if NS_ISBN.from_code(cluster).is_some() {
            "no-book"
        } else {
            "no-book-author"
        };

        let gender = match genders.get(&cluster) {
            None => fallback.to_owned(),
            Some(stats) if stats.n_book_authors == 0 => {
                assert!(stats.genders.is_empty());
                "no-book-author".to_owned()
            }
            Some(stats) if stats.n_author_recs == 0 => {
                assert!(stats.genders.is_empty());
                "no-author-rec".to_owned()
            }
            Some(stats) if stats.genders.is_empty() => "no-gender".to_owned(),
            Some(stats) => stats.genders.to_gender().to_string(),
        };

        out.write_object(ClusterGenderInfo { cluster, gender })?;
    }

    out.finish()?;
    Ok(())
}
impl Command for AuthorGender {
    /// Run the extraction: load the cluster list and the author gender table,
    /// resolve genders for the clusters in `author_file`, and write the result
    /// to `output`.
    ///
    /// # Errors
    ///
    /// Returns the first error raised by any loading step or by the writer.
    fn exec(&self) -> Result<()> {
        // The cluster list drives the output, one row per entry.
        let all_ids = clusters::all_clusters("book-links/cluster-stats.parquet")?;
        let author_genders = authors::viaf_author_table()?;
        let resolved = clusters::read_resolve(&self.author_file, &author_genders)?;
        save_genders(all_ids, resolved, self.output.as_path())
    }
}