bookdata/openlib/
edition.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
//! OpenLibrary edition schemas.
use friendly::scalar;
use parquet_derive::ParquetRecordWriter;

use crate::arrow::*;
use crate::cleaning::isbns::clean_asin_chars;
use crate::cleaning::isbns::clean_isbn_chars;
use crate::prelude::*;

use super::key::parse_ol_key;
use super::key::KS_WORK;
pub use super::source::OLEditionRecord;
use super::source::Row;
use super::subject::SubjectEntry;

/// An edition row in the extracted Parquet.
///
/// The [`EditionProcessor`] writes one of these for every input row it
/// receives.
#[derive(ParquetRecordWriter)]
pub struct EditionRec {
    /// Numeric edition identifier, taken from the processor's running counter.
    pub id: u32,
    /// Key of the source row, copied as-is.
    pub key: String,
    /// Title from the source record, if the record has one.
    pub title: Option<String>,
}

/// Link between edition and work.
///
/// One row is written per entry in an edition record's `works` list.
#[derive(ParquetRecordWriter)]
pub struct LinkRec {
    /// Numeric identifier of the edition (see [`EditionRec::id`]).
    pub edition: u32,
    /// Numeric work identifier parsed from the work's key with `parse_ol_key`.
    pub work: u32,
}

/// Edition ISBN record.
///
/// Despite the name, rows come from three source fields: `isbn_10`,
/// `isbn_13`, and `asin`.  Only cleaned values at least 8 bytes long are
/// written.
#[derive(ParquetRecordWriter)]
pub struct ISBNrec {
    /// Numeric identifier of the edition (see [`EditionRec::id`]).
    pub edition: u32,
    /// The identifier string after character cleaning.
    pub isbn: String,
}

/// Edition author record.
///
/// Written only for author entries that resolve to an author identifier;
/// entries without one are skipped, so `pos` values may have gaps.
#[derive(ParquetRecordWriter)]
pub struct EditionAuthorRec {
    /// Numeric identifier of the edition (see [`EditionRec::id`]).
    pub edition: u32,
    /// Zero-based position of the author in the edition's author list.
    pub pos: i16,
    /// Numeric identifier of the author.
    pub author: u32,
}

/// Edition-subject record in extracted Parquet.
///
/// Built from a [`SubjectEntry`] through the `From` conversion.
#[derive(ParquetRecordWriter)]
pub struct EditionSubjectRec {
    /// Identifier of the entity the subject belongs to (the entry's `entity`
    /// field); presumably the edition id here — confirm against `SubjectEntry`.
    pub id: u32,
    /// Subject type, converted to a `u8` code from the entry's `subj_type`.
    pub subj_type: u8,
    /// The subject text.
    pub subject: String,
}

impl From<SubjectEntry> for EditionSubjectRec {
    fn from(value: SubjectEntry) -> Self {
        EditionSubjectRec {
            id: value.entity,
            subj_type: value.subj_type.into(),
            subject: value.subject,
        }
    }
}

/// Process edition records into Parquet.
///
/// This must be run **after** the author and work processors.
///
/// Each incoming edition is assigned the next value of an internal counter
/// as its numeric id and is fanned out into five output tables.
pub struct EditionProcessor {
    /// Identifier given to the most recently written edition; starts at 0, so
    /// the first edition receives id 1.
    last_id: u32,
    /// Output for the edition rows themselves.
    rec_writer: TableWriter<EditionRec>,
    /// Output for edition-to-work links.
    link_writer: TableWriter<LinkRec>,
    /// Output for edition ISBN / ASIN rows.
    isbn_writer: TableWriter<ISBNrec>,
    /// Output for edition-author rows.
    author_writer: TableWriter<EditionAuthorRec>,
    /// Output for edition-subject rows.
    subject_writer: TableWriter<EditionSubjectRec>,
}

impl EditionProcessor {
    /// Open all five output tables and return a processor with its id
    /// counter at zero.
    ///
    /// # Errors
    ///
    /// Fails if any of the output tables cannot be opened.
    pub fn new() -> Result<EditionProcessor> {
        let rec_writer = TableWriter::open("editions.parquet")?;
        let link_writer = TableWriter::open("edition-works.parquet")?;
        let isbn_writer = TableWriter::open("edition-isbns.parquet")?;
        let author_writer = TableWriter::open("edition-authors.parquet")?;
        let subject_writer = TableWriter::open("edition-subjects.parquet")?;

        Ok(EditionProcessor {
            last_id: 0,
            rec_writer,
            link_writer,
            isbn_writer,
            author_writer,
            subject_writer,
        })
    }

    /// Clean a batch of raw identifier strings and write the plausible ones
    /// to the ISBN table for `edition`.
    ///
    /// `clean` is applied to every raw string; results shorter than 8 bytes
    /// are discarded.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first write failure.
    fn save_isbns(
        &mut self,
        edition: u32,
        isbns: Vec<String>,
        clean: fn(&str) -> String,
    ) -> Result<()> {
        let usable = isbns
            .into_iter()
            .map(|raw| clean(&raw))
            // lenient length filter: tolerate some malformed values
            .filter(|code| code.len() >= 8);

        for isbn in usable {
            self.isbn_writer.write_object(ISBNrec { edition, isbn })?;
        }

        Ok(())
    }
}

impl ObjectWriter<Row<OLEditionRecord>> for EditionProcessor {
    /// Write one edition and everything hanging off it.
    ///
    /// The edition gets the next sequential id, then rows are emitted for the
    /// edition itself, its ISBN-10 / ISBN-13 / ASIN identifiers, its work
    /// links, its authors (only those that resolve to an author id), and its
    /// subjects.
    ///
    /// # Errors
    ///
    /// Returns the first error from any table write, from parsing a work key,
    /// from resolving an author id, or if an author position does not fit in
    /// the `i16` position column.
    fn write_object(&mut self, row: Row<OLEditionRecord>) -> Result<()> {
        self.last_id += 1;
        let id = self.last_id;

        // `row` is owned and these fields are not used again, so move them
        // instead of cloning.
        self.rec_writer.write_object(EditionRec {
            id,
            key: row.key,
            title: row.record.title,
        })?;

        self.save_isbns(id, row.record.isbn_10, clean_isbn_chars)?;
        self.save_isbns(id, row.record.isbn_13, clean_isbn_chars)?;
        self.save_isbns(id, row.record.asin, clean_asin_chars)?;

        for work in row.record.works {
            let work = parse_ol_key(&work.key, KS_WORK)?;
            self.link_writer
                .write_object(LinkRec { edition: id, work })?;
        }

        for (idx, author) in row.record.authors.iter().enumerate() {
            if let Some(aid) = author.id()? {
                // Checked narrowing: a plain `as i16` would silently wrap for
                // positions above `i16::MAX`.
                let pos = <i16 as std::convert::TryFrom<usize>>::try_from(idx)?;
                self.author_writer.write_object(EditionAuthorRec {
                    edition: id,
                    pos,
                    author: aid,
                })?;
            }
        }

        for sr in row.record.subjects.subject_records(id) {
            self.subject_writer.write_object(sr.into())?;
        }

        Ok(())
    }

    /// Finish all output tables, logging how many rows each received.
    ///
    /// Returns the number of editions processed (the last id assigned).
    ///
    /// # Errors
    ///
    /// Returns the first error from finishing a table; tables after the
    /// failing one are not finished.
    fn finish(self) -> Result<usize> {
        let n = self.rec_writer.finish()?;
        info!("wrote {} edition records", scalar(n));
        let n = self.author_writer.finish()?;
        info!("wrote {} edition-author records", scalar(n));
        let n = self.link_writer.finish()?;
        info!("wrote {} edition-work records", scalar(n));
        let n = self.isbn_writer.finish()?;
        info!("wrote {} edition-isbn records", scalar(n));
        let n = self.subject_writer.finish()?;
        info!("wrote {} edition-subject records", scalar(n));
        Ok(self.last_id as usize)
    }
}

// NOTE(review): this inherent impl is empty and adds no methods; it can
// likely be removed.
impl EditionProcessor {}