bookdata/cli/cluster/author_gender/
mod.rs1use std::path::{Path, PathBuf};
9
10use parquet_derive::ParquetRecordWriter;
11use serde::{Deserialize, Serialize};
12
13use crate::arrow::*;
14use crate::ids::codes::*;
15use crate::prelude::*;
16
17mod authors;
18mod clusters;
19
20#[derive(Args, Debug)]
27#[command(name = "extract-author-genders")]
28pub struct AuthorGender {
30 #[arg(short = 'o', long = "output")]
32 output: PathBuf,
33
34 #[arg(short = 'A', long = "cluster-authors")]
36 author_file: PathBuf,
37}
38
39#[derive(Serialize, Deserialize, Clone, ParquetRecordWriter)]
41struct ClusterGenderInfo {
42 cluster: i32,
43 gender: String,
44}
45
46fn save_genders(clusters: Vec<i32>, genders: clusters::ClusterTable, outf: &Path) -> Result<()> {
47 info!("writing cluster genders to {}", outf.display());
48 let mut out = TableWriter::open(outf)?;
49
50 for cluster in clusters {
51 let mut gender = "no-book-author".to_owned();
52 if NS_ISBN.from_code(cluster).is_some() {
53 gender = "no-book".to_owned();
54 }
55 if let Some(stats) = genders.get(&cluster) {
56 if stats.n_book_authors == 0 {
57 assert!(stats.genders.is_empty());
58 gender = "no-book-author".to_owned() } else if stats.n_author_recs == 0 {
60 assert!(stats.genders.is_empty());
61 gender = "no-author-rec".to_owned()
62 } else if stats.genders.is_empty() {
63 gender = "no-gender".to_owned()
64 } else {
65 gender = stats.genders.to_gender().to_string()
66 };
67 }
68 out.write_object(ClusterGenderInfo { cluster, gender })?;
69 }
70
71 out.finish()?;
72
73 Ok(())
74}
75
76impl Command for AuthorGender {
77 fn exec(&self) -> Result<()> {
78 let clusters = clusters::all_clusters("book-links/cluster-stats.parquet")?;
79 let name_genders = authors::viaf_author_table()?;
80 let cluster_genders = clusters::read_resolve(&self.author_file, &name_genders)?;
81 save_genders(clusters, cluster_genders, self.output.as_ref())?;
82
83 Ok(())
84 }
85}