bookdata/openlib/
edition.rs

1//! OpenLibrary edition schemas.
2use friendly::scalar;
3use parquet_derive::ParquetRecordWriter;
4
5use crate::arrow::*;
6use crate::cleaning::isbns::clean_asin_chars;
7use crate::cleaning::isbns::clean_isbn_chars;
8use crate::prelude::*;
9
10use super::key::parse_ol_key;
11use super::key::KS_WORK;
12pub use super::source::OLEditionRecord;
13use super::source::Row;
14use super::subject::SubjectEntry;
15
/// An edition row in the extracted Parquet.
///
/// One row is written for every input edition handled by
/// `EditionProcessor::write_object`.
#[derive(ParquetRecordWriter)]
pub struct EditionRec {
    /// Sequential edition number assigned by the processor (the first edition gets 1).
    pub id: u32,
    /// The key string carried by the input row.
    pub key: String,
    /// The edition title, when the source record has one.
    pub title: Option<String>,
}
23
/// Link between edition and work.
///
/// One row is written for each work listed on an edition record.
#[derive(ParquetRecordWriter)]
pub struct LinkRec {
    /// Edition number (matches `EditionRec::id`).
    pub edition: u32,
    /// Work number obtained by parsing the work's key with `parse_ol_key` and `KS_WORK`.
    pub work: u32,
}
30
/// Edition ISBN record.
///
/// Despite the name, the processor also stores cleaned ASINs in this table
/// (see the `asin` call in `EditionProcessor::write_object`).
#[derive(ParquetRecordWriter)]
pub struct ISBNrec {
    /// Edition number (matches `EditionRec::id`).
    pub edition: u32,
    /// Cleaned identifier string; only values at least 8 bytes long are written.
    pub isbn: String,
}
37
/// Edition author record.
///
/// Author entries that do not resolve to an id are skipped by the processor,
/// so the `pos` values for one edition may contain gaps.
#[derive(ParquetRecordWriter)]
pub struct EditionAuthorRec {
    /// Edition number (matches `EditionRec::id`).
    pub edition: u32,
    /// Zero-based index of the author within the edition's author list.
    pub pos: i16,
    /// Author number as returned by the author entry's `id()`.
    pub author: u32,
}
45
/// Edition-subject record in extracted Parquet.
///
/// Built from a `SubjectEntry` through the `From` implementation in this module.
#[derive(ParquetRecordWriter)]
pub struct EditionSubjectRec {
    /// Copied from `SubjectEntry::entity`.
    // NOTE(review): presumably the edition number, since the processor passes the
    // edition id to `subject_records` — confirm against `SubjectEntry`.
    pub id: u32,
    /// Subject type, converted to `u8` from the entry's `subj_type` via `Into`.
    pub subj_type: u8,
    /// The subject text.
    pub subject: String,
}
53
54impl From<SubjectEntry> for EditionSubjectRec {
55    fn from(value: SubjectEntry) -> Self {
56        EditionSubjectRec {
57            id: value.entity,
58            subj_type: value.subj_type.into(),
59            subject: value.subject,
60        }
61    }
62}
63
/// Process edition records into Parquet.
///
/// This must be run **after** the author and work processors.
///
/// Each input edition is split across five output tables; the processor owns
/// one writer per table.
pub struct EditionProcessor {
    /// Last edition number handed out; incremented before each edition is written.
    last_id: u32,
    /// Writer for the edition rows (`editions.parquet`).
    rec_writer: TableWriter<EditionRec>,
    /// Writer for the edition-work links (`edition-works.parquet`).
    link_writer: TableWriter<LinkRec>,
    /// Writer for the edition ISBN/ASIN rows (`edition-isbns.parquet`).
    isbn_writer: TableWriter<ISBNrec>,
    /// Writer for the edition-author rows (`edition-authors.parquet`).
    author_writer: TableWriter<EditionAuthorRec>,
    /// Writer for the edition-subject rows (`edition-subjects.parquet`).
    subject_writer: TableWriter<EditionSubjectRec>,
}
75
76impl EditionProcessor {
77    pub fn new() -> Result<EditionProcessor> {
78        Ok(EditionProcessor {
79            last_id: 0,
80            rec_writer: TableWriter::open("editions.parquet")?,
81            link_writer: TableWriter::open("edition-works.parquet")?,
82            isbn_writer: TableWriter::open("edition-isbns.parquet")?,
83            author_writer: TableWriter::open("edition-authors.parquet")?,
84            subject_writer: TableWriter::open("edition-subjects.parquet")?,
85        })
86    }
87
88    fn save_isbns(
89        &mut self,
90        edition: u32,
91        isbns: Vec<String>,
92        clean: fn(&str) -> String,
93    ) -> Result<()> {
94        for isbn in isbns {
95            let isbn = clean(&isbn);
96            // filter but with a reasonable threshold of error
97            if isbn.len() >= 8 {
98                self.isbn_writer.write_object(ISBNrec { edition, isbn })?;
99            }
100        }
101
102        Ok(())
103    }
104}
105
106impl ObjectWriter<Row<OLEditionRecord>> for EditionProcessor {
107    fn write_object(&mut self, row: Row<OLEditionRecord>) -> Result<()> {
108        self.last_id += 1;
109        let id = self.last_id;
110
111        self.rec_writer.write_object(EditionRec {
112            id,
113            key: row.key.clone(),
114            title: row.record.title.clone(),
115        })?;
116
117        self.save_isbns(id, row.record.isbn_10, clean_isbn_chars)?;
118        self.save_isbns(id, row.record.isbn_13, clean_isbn_chars)?;
119        self.save_isbns(id, row.record.asin, clean_asin_chars)?;
120
121        for work in row.record.works {
122            let key = work.key;
123            let work = parse_ol_key(&key, KS_WORK)?;
124            self.link_writer
125                .write_object(LinkRec { edition: id, work })?;
126        }
127
128        for pos in 0..row.record.authors.len() {
129            let author = row.record.authors[pos].id()?;
130            if let Some(aid) = author {
131                let pos = pos as i16;
132                self.author_writer.write_object(EditionAuthorRec {
133                    edition: id,
134                    pos,
135                    author: aid,
136                })?;
137            }
138        }
139
140        for sr in row.record.subjects.subject_records(id) {
141            self.subject_writer.write_object(sr.into())?;
142        }
143
144        Ok(())
145    }
146
147    fn finish(self) -> Result<usize> {
148        let n = self.rec_writer.finish()?;
149        info!("wrote {} edition records", scalar(n));
150        let n = self.author_writer.finish()?;
151        info!("wrote {} edition-author records", scalar(n));
152        let n = self.link_writer.finish()?;
153        info!("wrote {} edition-work records", scalar(n));
154        let n = self.isbn_writer.finish()?;
155        info!("wrote {} edition-isbn records", scalar(n));
156        let n = self.subject_writer.finish()?;
157        info!("wrote {} edition-subject records", scalar(n));
158        Ok(self.last_id as usize)
159    }
160}
161
// NOTE(review): this inherent impl block is empty and defines no items; it looks
// like a leftover and can probably be removed.
impl EditionProcessor {}