bookdata/cli/cluster/author_gender/
mod.rs

1//! Summarize author gender information for clusters.
2//!
3//! This script reads the cluster author information and author gender
4//! information, in order to aggregate author genders for each cluster.
5//!
6//! We use a lot of left joins so that we can compute statistics across
7//! the integration pipeline.
8use std::path::{Path, PathBuf};
9
10use parquet_derive::ParquetRecordWriter;
11use serde::{Deserialize, Serialize};
12
13use crate::arrow::*;
14use crate::ids::codes::*;
15use crate::prelude::*;
16
17mod authors;
18mod clusters;
19
20// #[derive(Display, FromStr, Debug)]
21// #[display(style="lowercase")]
22// enum GenderSource {
23//   VIAF,
24// }
25
// Command-line options for the `extract-author-genders` command.
//
// NOTE(review): `Args`, `#[command]` and `#[arg]` look like clap derive macros
// (presumably re-exported by the prelude — confirm). If so, the `///` comments
// below are also the user-visible help text, so edit them with the CLI output
// in mind.
#[derive(Args, Debug)]
#[command(name = "extract-author-genders")]
/// Extract cluster author gender data from extracted book data.
pub struct AuthorGender {
    /// Specify output file
    // Destination path; `exec` hands it to `save_genders`, which writes the
    // per-cluster gender records there.
    #[arg(short = 'o', long = "output")]
    output: PathBuf,

    /// Specify the cluster-author file.
    // Input path; `exec` hands it to `clusters::read_resolve` together with
    // the author gender table.
    #[arg(short = 'A', long = "cluster-authors")]
    author_file: PathBuf,
}
38
/// Record format for saving gender information.
///
/// One record is written per cluster by `save_genders`.
#[derive(Serialize, Deserialize, Clone, ParquetRecordWriter)]
struct ClusterGenderInfo {
    /// Cluster identifier, as supplied in the cluster list.
    cluster: i32,
    /// Gender label for the cluster: either the string form of the resolved
    /// gender, or one of the status codes `no-book`, `no-book-author`,
    /// `no-author-rec`, or `no-gender`.
    gender: String,
}
45
46fn save_genders(clusters: Vec<i32>, genders: clusters::ClusterTable, outf: &Path) -> Result<()> {
47    info!("writing cluster genders to {}", outf.display());
48    let mut out = TableWriter::open(outf)?;
49
50    for cluster in clusters {
51        let mut gender = "no-book-author".to_owned();
52        if NS_ISBN.from_code(cluster).is_some() {
53            gender = "no-book".to_owned();
54        }
55        if let Some(stats) = genders.get(&cluster) {
56            if stats.n_book_authors == 0 {
57                assert!(stats.genders.is_empty());
58                gender = "no-book-author".to_owned() // shouldn't happen but 🤷‍♀️
59            } else if stats.n_author_recs == 0 {
60                assert!(stats.genders.is_empty());
61                gender = "no-author-rec".to_owned()
62            } else if stats.genders.is_empty() {
63                gender = "no-gender".to_owned()
64            } else {
65                gender = stats.genders.to_gender().to_string()
66            };
67        }
68        out.write_object(ClusterGenderInfo { cluster, gender })?;
69    }
70
71    out.finish()?;
72
73    Ok(())
74}
75
76impl Command for AuthorGender {
77    fn exec(&self) -> Result<()> {
78        let clusters = clusters::all_clusters("book-links/cluster-stats.parquet")?;
79        let name_genders = authors::viaf_author_table()?;
80        let cluster_genders = clusters::read_resolve(&self.author_file, &name_genders)?;
81        save_genders(clusters, cluster_genders, self.output.as_ref())?;
82
83        Ok(())
84    }
85}