// bookdata/graph/sources.rs

use std::fmt::Debug;

use anyhow::Result;

use polars::prelude::*;

use crate::ids::codes::*;
use crate::util::default;

/// A data source that can supply graph edges.
///
/// The implementations in this file return a lazy frame with two columns,
/// `src` and `dst`, each holding a namespaced identifier code.
pub trait EdgeRead: Debug {
    /// Build the lazy frame of edges for this source.
    ///
    /// # Errors
    ///
    /// Implementations in this file fail if the underlying Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame>;
}

/// A data source that can supply graph node identifiers.
///
/// The implementations in this file return a lazy frame with a `code` column
/// (a namespaced identifier) and, for some sources, a `label` column.
pub trait NodeRead: Debug {
    /// Build the lazy frame of node identifiers for this source.
    ///
    /// # Errors
    ///
    /// Implementations in this file fail if the underlying Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame>;
}

/// Marker type for the ISBN node source (reads `book-links/all-isbns.parquet`).
#[derive(Debug)]
pub struct ISBN;
/// Marker type for the node and edge source backed by the `loc-mds/` Parquet files
/// (presumably Library of Congress records; confirm against the data documentation).
#[derive(Debug)]
pub struct LOC;
/// Marker type for the edition node and ISBN-to-edition edge source backed by the
/// `openlibrary/` Parquet files.
#[derive(Debug)]
pub struct OLEditions;
/// Marker type for the work node and edition-to-work edge source backed by the
/// `openlibrary/` Parquet files.
#[derive(Debug)]
pub struct OLWorks;
/// Marker type for the book node and ISBN-to-book edge source backed by the
/// `goodreads/` Parquet files.
#[derive(Debug)]
pub struct GRBooks;
/// Marker type for the work node and book-to-work edge source backed by
/// `goodreads/gr-book-ids.parquet`.
#[derive(Debug)]
pub struct GRWorks;

/// Build an expression that reads an identifier column and shifts it into a namespace.
///
/// The resulting expression is the column `name` plus the literal value
/// returned by `ns.base()`.
fn id_col(name: &str, ns: NS<'_>) -> Expr {
    let raw_id = col(name);
    let offset = lit(ns.base());
    raw_id + offset
}

impl NodeRead for ISBN {
    /// Scan `book-links/all-isbns.parquet` and project it to node columns.
    ///
    /// Output columns: `code` (the `isbn_id` column shifted by `NS_ISBN`) and
    /// `label` (the `isbn` column).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        let isbns = LazyFrame::scan_parquet("book-links/all-isbns.parquet", default())?;
        let code = id_col("isbn_id", NS_ISBN).alias("code");
        let label = col("isbn").alias("label");
        Ok(isbns.select([code, label]))
    }
}

impl NodeRead for LOC {
    /// Scan `loc-mds/book-ids.parquet` and project it to a single `code` column
    /// (the `rec_id` column shifted by `NS_LOC_REC`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        let records = LazyFrame::scan_parquet("loc-mds/book-ids.parquet", default())?;
        let code = id_col("rec_id", NS_LOC_REC).alias("code");
        Ok(records.select([code]))
    }
}

impl EdgeRead for LOC {
    /// Scan `loc-mds/book-isbn-ids.parquet` and project it to edge columns.
    ///
    /// Output columns: `src` (the `isbn_id` column shifted by `NS_ISBN`) and
    /// `dst` (the `rec_id` column shifted by `NS_LOC_REC`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame> {
        let links = LazyFrame::scan_parquet("loc-mds/book-isbn-ids.parquet", default())?;
        let src = id_col("isbn_id", NS_ISBN).alias("src");
        let dst = id_col("rec_id", NS_LOC_REC).alias("dst");
        Ok(links.select([src, dst]))
    }
}

impl NodeRead for OLEditions {
    /// Scan `openlibrary/editions.parquet` and project it to a single `code`
    /// column (the `id` column shifted by `NS_EDITION`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        let editions = LazyFrame::scan_parquet("openlibrary/editions.parquet", default())?;
        let code = id_col("id", NS_EDITION).alias("code");
        Ok(editions.select([code]))
    }
}

impl EdgeRead for OLEditions {
    /// Scan `openlibrary/edition-isbn-ids.parquet` and project it to edge columns.
    ///
    /// Output columns: `src` (the `isbn_id` column shifted by `NS_ISBN`) and
    /// `dst` (the `edition` column shifted by `NS_EDITION`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame> {
        let links = LazyFrame::scan_parquet("openlibrary/edition-isbn-ids.parquet", default())?;
        let src = id_col("isbn_id", NS_ISBN).alias("src");
        let dst = id_col("edition", NS_EDITION).alias("dst");
        Ok(links.select([src, dst]))
    }
}

impl NodeRead for OLWorks {
    /// Combine work codes from two Parquet files into one node frame.
    ///
    /// `openlibrary/works.parquet` supplies `code` (the `id` column shifted by
    /// `NS_WORK`) and `label` (the `key` column). `openlibrary/edition-works.parquet`
    /// supplies the distinct `code` values derived from its `work` column. The two
    /// are combined with a full outer join on `code`, coalesced into a single
    /// `code` column, so codes present on either side are kept.
    ///
    /// # Errors
    ///
    /// Fails if either Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        // Work records: these carry the key used as the label.
        let works = LazyFrame::scan_parquet("openlibrary/works.parquet", default())?;
        let works = works.select([
            id_col("id", NS_WORK).alias("code"),
            col("key").alias("label"),
        ]);

        // Work codes referenced by the edition-work links, reduced to distinct rows.
        let linked = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?;
        let linked = linked
            .select([id_col("work", NS_WORK).alias("code")])
            .unique(None, UniqueKeepStrategy::Any);

        let how = JoinArgs::new(JoinType::Outer { coalesce: true });
        Ok(works.join(linked, [col("code")], [col("code")], how))
    }
}

impl EdgeRead for OLWorks {
    /// Scan `openlibrary/edition-works.parquet` and project it to edge columns.
    ///
    /// Output columns: `src` (the `edition` column shifted by `NS_EDITION`) and
    /// `dst` (the `work` column shifted by `NS_WORK`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame> {
        let links = LazyFrame::scan_parquet("openlibrary/edition-works.parquet", default())?;
        let src = id_col("edition", NS_EDITION).alias("src");
        let dst = id_col("work", NS_WORK).alias("dst");
        Ok(links.select([src, dst]))
    }
}

impl NodeRead for GRBooks {
    /// Scan `goodreads/gr-book-ids.parquet` and project it to a single `code`
    /// column (the `book_id` column shifted by `NS_GR_BOOK`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        let books = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?;
        let code = id_col("book_id", NS_GR_BOOK).alias("code");
        Ok(books.select([code]))
    }
}

impl EdgeRead for GRBooks {
    /// Scan `goodreads/book-isbn-ids.parquet` and project it to edge columns.
    ///
    /// Output columns: `src` (the `isbn_id` column shifted by `NS_ISBN`) and
    /// `dst` (the `book_id` column shifted by `NS_GR_BOOK`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame> {
        let links = LazyFrame::scan_parquet("goodreads/book-isbn-ids.parquet", default())?;
        let src = id_col("isbn_id", NS_ISBN).alias("src");
        let dst = id_col("book_id", NS_GR_BOOK).alias("dst");
        Ok(links.select([src, dst]))
    }
}

impl NodeRead for GRWorks {
    /// Scan `goodreads/gr-book-ids.parquet`, keep rows whose `work_id` is not
    /// null, and project them to a single `code` column (the `work_id` column
    /// shifted by `NS_GR_WORK`).
    ///
    /// NOTE(review): no `unique` step is applied here, so a `work_id` that occurs
    /// on several rows yields repeated codes. Confirm that callers tolerate that.
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_node_ids(&self) -> Result<LazyFrame> {
        let works = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?
            // The filter must precede the projection, which replaces `work_id` with `code`.
            .filter(col("work_id").is_not_null())
            .select([id_col("work_id", NS_GR_WORK).alias("code")]);
        Ok(works)
    }
}

impl EdgeRead for GRWorks {
    /// Scan `goodreads/gr-book-ids.parquet`, keep rows whose `work_id` is not
    /// null, and project them to edge columns.
    ///
    /// Output columns: `src` (the `book_id` column shifted by `NS_GR_BOOK`) and
    /// `dst` (the `work_id` column shifted by `NS_GR_WORK`).
    ///
    /// # Errors
    ///
    /// Fails if the Parquet scan cannot be set up.
    fn read_edges(&self) -> Result<LazyFrame> {
        let src = id_col("book_id", NS_GR_BOOK).alias("src");
        let dst = id_col("work_id", NS_GR_WORK).alias("dst");
        let edges = LazyFrame::scan_parquet("goodreads/gr-book-ids.parquet", default())?
            .filter(col("work_id").is_not_null())
            .select([src, dst]);
        Ok(edges)
    }
}