bookdata/graph/
load.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
use std::collections::HashMap;

use anyhow::{anyhow, Result};
use log::*;

use crate::layout::Config;

use super::sources::*;
use super::{BookID, IdGraph, IdNode};
use polars::prelude::*;

/// Lookup table from an integer node code to the handle of the graph node
/// created for that code.
type NodeMap = HashMap<i32, IdNode>;

/// Accumulates the identifier graph while vertices and edges are loaded.
struct GraphBuilder {
    /// The graph under construction.
    graph: IdGraph,
    /// Node code -> graph node handle, for every vertex added so far.
    nodes: NodeMap,
}

impl GraphBuilder {
    /// Add one vertex for every not-yet-seen `code` in the node table of `src`.
    ///
    /// The node table must have a `code` column castable to `Int32`; a `label`
    /// string column is optional.  A code that is already registered (from this
    /// or a previously loaded source) is left untouched, so the first label
    /// seen for a code is the one that is kept.
    ///
    /// # Errors
    ///
    /// Fails if the table cannot be read or collected, if `code` is missing or
    /// cannot be cast to `Int32`, if `label` exists but is not a string column,
    /// or if any row has a null code.
    fn add_vertices<R: NodeRead>(&mut self, src: R) -> Result<()> {
        info!("scanning vertices from {:?}", src);
        let node_df = src.read_node_ids()?;
        debug!("node schema: {:?}", node_df.schema());
        let mut node_df = node_df.collect()?;
        let ninit = self.nodes.len();

        // pull out the column to reduce memory
        let code_s = node_df.drop_in_place("code")?;
        let code_s = code_s.cast(&DataType::Int32)?;
        let codes = code_s.i32()?;
        // A missing label column is fine; a label column of the wrong type is an error.
        let labels = node_df.column("label").ok().map(|c| c.str()).transpose()?;
        for (i, code) in codes.into_iter().enumerate() {
            // Report a null code as an error rather than panicking.
            let code =
                code.ok_or_else(|| anyhow!("null node code at row {} in {:?}", i, src))?;
            let label = labels.and_then(|c| c.get(i));
            // Only allocate the owned label when the node is actually created.
            self.nodes.entry(code).or_insert_with(|| {
                self.graph.add_node(BookID {
                    code,
                    label: label.map(|s| s.to_string()),
                    cluster: 0,
                })
            });
        }

        info!(
            "loaded {} new vertices from {:?}",
            self.nodes.len() - ninit,
            src
        );

        Ok(())
    }

    /// Add an edge for every row of the edge table of `src`.
    ///
    /// The edge table must have `src` and `dst` columns castable to `Int32`.
    /// Rows in which either endpoint is null are skipped and not counted.
    ///
    /// # Errors
    ///
    /// Fails if the table cannot be read or collected, if either column is
    /// missing or cannot be cast, or if an endpoint code has not previously
    /// been registered through [`GraphBuilder::add_vertices`].
    fn add_edges<R: EdgeRead>(&mut self, src: R) -> Result<()> {
        info!("scanning edges from {:?}", src);
        let edge_df = src.read_edges()?;
        debug!("edge schema: {:?}", edge_df.schema());
        let edge_df = edge_df.collect()?;
        let src_s = edge_df.column("src")?.cast(&DataType::Int32)?;
        let srcs = src_s.i32()?;
        let dst_s = edge_df.column("dst")?.cast(&DataType::Int32)?;
        let dsts = dst_s.i32()?;

        let iter = srcs.into_iter().zip(dsts.into_iter());
        let mut n: usize = 0;

        for pair in iter {
            if let (Some(sn), Some(dn)) = pair {
                let sid = self
                    .nodes
                    .get(&sn)
                    .ok_or_else(|| anyhow!("unknown source node {}", sn))?;
                // Report the destination code here, not the source code.
                let did = self
                    .nodes
                    .get(&dn)
                    .ok_or_else(|| anyhow!("unknown destination node {}", dn))?;
                self.graph.add_edge(*sid, *did, ());
                n += 1;
            }
        }

        info!("added {} edges from {:?}", n, src);

        Ok(())
    }
}

/// Build the undirected identifier graph from the available data sources.
///
/// All vertices are loaded first, then all edges.  The GoodReads sources are
/// only read when `cfg.goodreads.enabled` is set.
///
/// # Errors
///
/// Returns the first error raised while loading any vertex or edge source.
pub fn construct_graph(cfg: &Config) -> Result<IdGraph> {
    let with_goodreads = cfg.goodreads.enabled;
    let mut builder = GraphBuilder {
        graph: IdGraph::new_undirected(),
        nodes: NodeMap::new(),
    };

    // Vertices go in before edges so that edge endpoints can be resolved.
    info!("loading nodes");
    builder.add_vertices(ISBN)?;
    builder.add_vertices(LOC)?;
    builder.add_vertices(OLEditions)?;
    builder.add_vertices(OLWorks)?;
    if with_goodreads {
        builder.add_vertices(GRBooks)?;
        builder.add_vertices(GRWorks)?;
    }

    info!("loading edges");
    builder.add_edges(LOC)?;
    builder.add_edges(OLEditions)?;
    builder.add_edges(OLWorks)?;
    if with_goodreads {
        builder.add_edges(GRBooks)?;
        builder.add_edges(GRWorks)?;
    }

    // Only the graph is handed back; the code lookup table is no longer needed.
    let GraphBuilder { graph, .. } = builder;
    let (n_nodes, n_edges) = (graph.node_count(), graph.edge_count());
    info!("graph has {} nodes, {} edges", n_nodes, n_edges);
    Ok(graph)
}