bookdata/graph/
model.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
use std::fs::File;

use anyhow::{anyhow, Result};
use log::*;
use parquet_derive::{ParquetRecordReader, ParquetRecordWriter};
use serde::Serialize;

use super::{BookID, IdGraph, IdNode};
use crate::arrow::TableWriter;
use crate::ids::codes::{ns_of_book_code, NS_ISBN};
use crate::io::object::ObjectWriter;
use crate::util::logging::item_progress;

// Output locations used by `save_graph_cluster_data`.
// NOTE(review): the paths are relative — presumably resolved against the
// process working directory; confirm with the callers.

// Table of `ISBNClusterRec` rows (one per ISBN node).
const ISBN_CLUSTER_PATH: &str = "book-links/isbn-clusters.parquet";
// Table of `ClusterCode` rows (one per graph node in a cluster).
const GRAPH_NODE_PATH: &str = "book-links/cluster-graph-nodes.parquet";
// Table of `GraphEdge` rows (one per graph edge).
const GRAPH_EDGE_PATH: &str = "book-links/cluster-graph-edges.parquet";
// Table of `ClusterStat` rows (one per cluster).
const CLUSTER_STATS_PATH: &str = "book-links/cluster-stats.parquet";
// JSON file holding the serialized `ClusteringStatistics` summary.
const CLUSTER_METRICS_PATH: &str = "book-links/cluster-metrics.json";

/// Row linking a single ISBN to the cluster that contains it.
///
/// `save_graph_cluster_data` writes one of these to `ISBN_CLUSTER_PATH` for
/// every cluster node whose book code belongs to the ISBN namespace.
#[derive(ParquetRecordWriter, ParquetRecordReader, Debug)]
pub struct ISBNClusterRec {
    /// The ISBN text, taken from the graph node's label.
    pub isbn: String,
    /// The ISBN identifier recovered from the node's book code via `NS_ISBN`.
    pub isbn_id: i32,
    /// The cluster identifier (the smallest book code in the cluster).
    pub cluster: i32,
}

/// Row describing one graph node and the cluster it was assigned to.
///
/// `save_graph_cluster_data` writes one of these to `GRAPH_NODE_PATH` for
/// every node of every cluster.
#[derive(ParquetRecordWriter, Debug)]
pub struct ClusterCode {
    /// The node's book code.
    pub book_code: i32,
    /// The cluster identifier (the smallest book code in the cluster).
    pub cluster: i32,
    /// Name of the namespace that the book code belongs to.
    pub node_type: String,
    /// The node's label, if it has one.
    pub label: Option<String>,
}

/// Row describing one edge of the identifier graph.
///
/// `save_graph_cluster_data` writes one of these to `GRAPH_EDGE_PATH` for
/// every edge in the graph.
#[derive(ParquetRecordWriter, Debug)]
pub struct GraphEdge {
    /// Book code of the node at the first endpoint of the edge.
    pub src: i32,
    /// Book code of the node at the second endpoint of the edge.
    pub dst: i32,
}

/// Per-cluster node counts, broken down by the namespace of each node's book
/// code.
///
/// Built by `ClusterStat::create`; `save_graph_cluster_data` writes one row
/// per cluster to `CLUSTER_STATS_PATH`.
#[derive(ParquetRecordWriter, Debug, Default)]
pub struct ClusterStat {
    /// The cluster identifier.
    pub cluster: i32,
    /// Total number of nodes in the cluster.
    pub n_nodes: u32,
    /// Number of nodes in the `ISBN` namespace.
    pub n_isbns: u32,
    /// Number of nodes in the `LOC` namespace.
    pub n_loc_recs: u32,
    /// Number of nodes in the `OL-E` namespace.
    pub n_ol_editions: u32,
    /// Number of nodes in the `OL-W` namespace.
    pub n_ol_works: u32,
    /// Number of nodes in the `GR-B` namespace.
    pub n_gr_books: u32,
    /// Number of nodes in the `GR-W` namespace.
    pub n_gr_works: u32,
}

/// Summary of a whole clustering run.
///
/// `save_graph_cluster_data` serializes this as JSON to
/// `CLUSTER_METRICS_PATH`.
#[derive(Serialize, Debug)]
struct ClusteringStatistics {
    /// Number of clusters.
    clusters: usize,
    /// Node count of the largest cluster.
    largest: usize,
    /// Largest number of ISBN nodes found in any single cluster.
    max_isbns: usize,
}

impl ClusterStat {
    /// Create a cluster statistics object from a list of books codes.
    pub fn create(cluster: i32, nodes: &Vec<&BookID>) -> ClusterStat {
        let mut cs = ClusterStat::default();
        cs.cluster = cluster;
        cs.n_nodes = nodes.len() as u32;
        for node in nodes {
            if let Some(ns) = ns_of_book_code(node.code) {
                match ns.name {
                    "ISBN" => cs.n_isbns += 1,
                    "LOC" => cs.n_loc_recs += 1,
                    "OL-W" => cs.n_ol_works += 1,
                    "OL-E" => cs.n_ol_editions += 1,
                    "GR-W" => cs.n_gr_works += 1,
                    "GR-B" => cs.n_gr_books += 1,
                    _ => (),
                }
            }
        }

        cs
    }
}

/// Save the clustered identifier graph and its summary statistics.
///
/// For every cluster in `clusters` this writes:
///
/// - one `ClusterStat` row to `CLUSTER_STATS_PATH`;
/// - one `ClusterCode` row per node to `GRAPH_NODE_PATH`;
/// - one `ISBNClusterRec` row per ISBN node to `ISBN_CLUSTER_PATH`.
///
/// It then writes every edge of `graph` to `GRAPH_EDGE_PATH` and a JSON
/// `ClusteringStatistics` summary to `CLUSTER_METRICS_PATH`.
///
/// A cluster is identified by the smallest book code among its nodes.
///
/// # Errors
///
/// Returns an error if any output cannot be opened or written, if a cluster
/// is empty, if a cluster vertex or edge endpoint cannot be found in `graph`,
/// if a node's book code has no known namespace, or if an ISBN node has no
/// label.
pub fn save_graph_cluster_data(graph: &IdGraph, clusters: Vec<Vec<IdNode>>) -> Result<()> {
    let mut ic_w = TableWriter::open(ISBN_CLUSTER_PATH)?;
    let mut n_w = TableWriter::open(GRAPH_NODE_PATH)?;
    let mut cs_w = TableWriter::open(CLUSTER_STATS_PATH)?;

    // Running maxima across all clusters: size and id of the largest cluster,
    // and the largest per-cluster ISBN count.
    let mut m_size: usize = 0;
    let mut m_id: i32 = 0;
    let mut m_isbns: usize = 0;

    info!("writing graph nodes");
    let pb = item_progress(clusters.len(), "clusters");
    for verts in &clusters {
        // Resolve the cluster's vertex handles to their node payloads.
        let vids = verts
            .iter()
            .map(|v| {
                graph
                    .node_weight(*v)
                    .ok_or_else(|| anyhow!("cluster vertex is not in the graph"))
            })
            .collect::<Result<Vec<_>>>()?;

        // The cluster id is the smallest book code it contains.
        let cluster = vids
            .iter()
            .map(|b| b.code)
            .min()
            .ok_or_else(|| anyhow!("cannot save an empty cluster"))?;
        if vids.len() > m_size {
            m_size = vids.len();
            m_id = cluster;
        }

        cs_w.write_object(ClusterStat::create(cluster, &vids))?;

        let mut n_isbns: usize = 0;
        for v in &vids {
            let ns = ns_of_book_code(v.code)
                .ok_or_else(|| anyhow!("book code {} has no known namespace", v.code))?;
            n_w.write_object(ClusterCode {
                cluster,
                book_code: v.code,
                node_type: ns.name.to_string(),
                label: v.label.clone(),
            })?;

            // ISBN nodes additionally go into the ISBN → cluster table.
            if let Some(id) = NS_ISBN.from_code(v.code) {
                ic_w.write_object(ISBNClusterRec {
                    cluster,
                    isbn_id: id,
                    isbn: v
                        .label
                        .clone()
                        .ok_or_else(|| anyhow!("graph node missing ISBN label"))?,
                })?;
                n_isbns += 1;
            }
        }
        m_isbns = m_isbns.max(n_isbns);
        pb.inc(1);
    }

    ic_w.finish()?;
    n_w.finish()?;
    cs_w.finish()?;
    pb.finish_and_clear();

    info!("largest cluster {} has {} nodes", m_id, m_size);

    info!("writing graph edges");
    let mut e_w = TableWriter::open(GRAPH_EDGE_PATH)?;
    for e in graph.edge_indices() {
        let (s, d) = graph
            .edge_endpoints(e)
            .ok_or_else(|| anyhow!("graph edge has no endpoints"))?;
        let src = graph
            .node_weight(s)
            .ok_or_else(|| anyhow!("graph edge source is not in the graph"))?
            .code;
        let dst = graph
            .node_weight(d)
            .ok_or_else(|| anyhow!("graph edge destination is not in the graph"))?
            .code;
        e_w.write_object(GraphEdge { src, dst })?;
    }
    e_w.finish()?;

    info!("saving statistics");
    let stats = ClusteringStatistics {
        clusters: clusters.len(),
        largest: m_size,
        max_isbns: m_isbns,
    };
    let statf = File::create(CLUSTER_METRICS_PATH)?;
    serde_json::to_writer(statf, &stats)?;

    Ok(())
}