// bookdata/cli/goodreads/work_gender.rs
use crate::{ids::codes::NS_GR_WORK, prelude::*};
use polars::prelude::*;

/// Link cluster gender records to GoodReads book links and save the result.
///
/// After calling `require_working_dir("goodreads")`, this scans
/// `../book-links/cluster-genders.parquet` and `gr-book-link.parquet`,
/// inner-joins them on the `cluster` column, removes duplicate rows, and adds
/// a `gr_item` column before writing everything to `gr-work-gender.parquet`.
///
/// # Errors
///
/// Propagates any error from the working-directory call, from scanning the
/// input Parquet files, from collecting the lazy query, or from saving the
/// output file.
pub fn link_work_genders() -> Result<()> {
    require_working_dir("goodreads")?;

    let cluster_genders =
        LazyFrame::scan_parquet("../book-links/cluster-genders.parquet", default())?;
    let book_links = LazyFrame::scan_parquet("gr-book-link.parquet", default())?;

    // `gr_item` takes the offset work ID when it is non-null and otherwise
    // falls back to the offset book ID.
    let work_item = col("work_id") + lit(NS_GR_WORK.base());
    // NOTE(review): the book fallback is offset by the *work* namespace base;
    // a book-specific namespace may have been intended — confirm.
    let book_item = col("book_id") + lit(NS_GR_WORK.base());
    let gr_item = coalesce(&[work_item, book_item]).alias("gr_item");

    // Build the whole lazy plan up front; nothing is read until `collect`.
    let query = cluster_genders
        .join(
            book_links,
            &[col("cluster")],
            &[col("cluster")],
            JoinType::Inner.into(),
        )
        .unique(None, UniqueKeepStrategy::First)
        .select([col("*"), gr_item]);

    info!("computing results");
    let frame = query.collect()?;

    info!("saving {} work-gender records", frame.height());
    save_df_parquet(frame, "gr-work-gender.parquet")?;

    Ok(())
}