bookdata/goodreads/
ids.rs

1//! GoodReads book identifier and linking support.
2use std::{collections::HashMap, fs::File};
3
4use anyhow::Result;
5use log::*;
6use polars::prelude::*;
7use serde::{Deserialize, Serialize};
8
9use crate::{
10    ids::codes::{NS_GR_BOOK, NS_GR_WORK},
11    prelude::BDPath,
12};
13
/// Map of book-link records keyed by GoodReads book ID (as built by `load_id_links`).
14pub type BookLinkMap = HashMap<i32, BookLinkRecord>;
15
/// Location of the Parquet file of book-link records that `load_id_links` reads.
16const GR_LINK_FILE: BDPath<'static> = BDPath::new("goodreads/gr-book-link.parquet");
17
18/// Book-link record.
19#[derive(Debug, Serialize, Deserialize)]
20pub struct BookLinkRecord {
21    pub book_id: i32,
22    pub work_id: Option<i32>,
23    pub cluster: i32,
24}
25
26impl BookLinkRecord {
27    /// Get the GoodReads item ID for the book (work id, with fallback to book, in numberspace).
28    pub fn item_id(&self) -> i32 {
29        if let Some(w) = &self.work_id {
30            NS_GR_WORK.base() + w
31        } else {
32            NS_GR_BOOK.base() + self.book_id
33        }
34    }
35}
36
37/// Read a map of book IDs to linking identifiers.
38pub fn load_id_links() -> Result<BookLinkMap> {
39    let path = GR_LINK_FILE.resolve()?;
40    let file = File::open(path)?;
41    let pqf = ParquetReader::new(file);
42    let df = pqf.finish()?;
43
44    let mut map = HashMap::with_capacity(df.height());
45
46    let c_book = df.column("book_id")?.i32()?;
47    let c_work = df.column("work_id")?.i32()?;
48    let c_cluster = df.column("cluster")?.i32()?;
49
50    for i in 0..df.height() {
51        let rec: BookLinkRecord = BookLinkRecord {
52            book_id: c_book.get(i).unwrap(),
53            work_id: c_work.get(i),
54            cluster: c_cluster.get(i).unwrap(),
55        };
56        map.insert(rec.book_id, rec);
57    }
58
59    info!("read {} book links from {}", map.len(), GR_LINK_FILE);
60    Ok(map)
61}