bookdata/openlib/
source.rs

1//! Parse OpenLibrary JSON.
2use std::str::FromStr;
3
4use log::*;
5use serde::de;
6use serde::Deserialize;
7use thiserror::Error;
8
9use super::key::parse_ol_key;
10use super::key::OLKeyError;
11use super::key::KS_AUTHOR;
12use crate::tsv::split_first;
13
/// One row of the OpenLibrary dump file.
///
/// A row pairs the record's key with the record itself, which is deserialized
/// from the row's JSON payload into the type `T`.
///
/// `Debug`, `Clone`, and `PartialEq` are derived (each conditional on `T`) so
/// that parse results can be logged, compared, and used with
/// `Result::unwrap_err` / `Result::expect_err`.
#[derive(Debug, Clone, PartialEq)]
pub struct Row<T> {
    /// Key column of the row.
    pub key: String,
    /// Record deserialized from the row's JSON payload.
    pub record: T,
}
22
23/// Error type for parsing an OpenLibrary JSON row.
24#[derive(Error, Debug)]
25pub enum RowError {
26    #[error("line has insufficient fields, failed splitting {0}")]
27    FieldError(i32),
28    #[error("JSON parsing error: {0}")]
29    ParseError(#[from] serde_json::Error),
30}
31
32impl<T: de::DeserializeOwned> FromStr for Row<T> {
33    type Err = RowError;
34
35    fn from_str(s: &str) -> Result<Row<T>, RowError> {
36        // split row into columns
37        let (_, rest) = split_first(s).ok_or(RowError::FieldError(1))?;
38        let (key, rest) = split_first(rest).ok_or(RowError::FieldError(2))?;
39        let (_, rest) = split_first(rest).ok_or(RowError::FieldError(3))?;
40        let (_, data) = split_first(rest).ok_or(RowError::FieldError(4))?;
41        let record = serde_json::from_str(data).map_err(|e| {
42            error!("invalid JSON in record {}: {:?}", key, e);
43            let jsv: serde_json::Value = serde_json::from_str(data).expect("invalid JSON");
44            let jsp = serde_json::to_string_pretty(&jsv).expect("uhh");
45            info!("offending JSON: {}", jsp);
46            e
47        })?;
48        Ok(Row {
49            key: key.to_owned(),
50            record,
51        })
52    }
53}
54
55/// Struct representing an author link in OL.
56///
57/// There are several different formats in which we can find author references.
58/// This enum encapsulates them, and Serde automatically deserializes it into the
59/// appropraite variant.  We then use the [Author::key] function to extract the key itself,
60/// no matter the variant.
61#[derive(Deserialize, Debug)]
62#[serde(untagged)]
63pub enum Author {
64    Object { key: String },
65    Nested { author: Keyed },
66    Key(String),
67    Empty {},
68}
69
70/// Keyed object reference
71#[derive(Deserialize, Debug)]
72pub struct Keyed {
73    pub key: String,
74}
75
76impl Author {
77    /// Get the key out of an author reference.
78    pub fn key<'a>(&'a self) -> Option<&'a str> {
79        match self {
80            Author::Object { key } => Some(key.as_ref()),
81            Author::Nested { author } => Some(author.key.as_ref()),
82            Author::Key(ref ks) => Some(ks.as_ref()),
83            Author::Empty {} => None,
84        }
85    }
86
87    /// Get the numeric ID for this author.
88    pub fn id(&self) -> Result<Option<u32>, OLKeyError> {
89        self.key().map(|k| parse_ol_key(k, KS_AUTHOR)).transpose()
90    }
91}
92
93/// An author record parsed from OpenLibrary JSON.
94#[derive(Deserialize)]
95pub struct OLAuthorSource {
96    #[serde(default)]
97    pub name: Option<String>,
98    #[serde(default)]
99    pub personal_name: Option<String>,
100    #[serde(default)]
101    pub alternate_names: Vec<String>,
102}
103
104/// An edition record parsed from OpenLibrary JSON.
105#[derive(Deserialize)]
106pub struct OLEditionRecord {
107    #[serde(default)]
108    pub isbn_10: Vec<String>,
109    #[serde(default)]
110    pub isbn_13: Vec<String>,
111    #[serde(default)]
112    pub asin: Vec<String>,
113
114    #[serde(default)]
115    pub title: Option<String>,
116
117    #[serde(default)]
118    pub works: Vec<Keyed>,
119    #[serde(default)]
120    pub authors: Vec<Author>,
121
122    #[serde(flatten)]
123    pub subjects: OLSubjects,
124}
125
126/// An author record parsed from OpenLibrary JSON.
127#[derive(Deserialize)]
128pub struct OLWorkRecord {
129    #[serde(default)]
130    pub authors: Vec<Author>,
131    #[serde(default)]
132    pub title: Option<String>,
133    #[serde(flatten)]
134    pub subjects: OLSubjects,
135}
136
137/// Text entries
138#[derive(Deserialize, Clone)]
139#[serde(untagged)]
140pub enum Text {
141    String(String),
142    Object { value: String },
143}
144
145impl Into<String> for Text {
146    fn into(self) -> String {
147        match self {
148            Text::String(s) => s,
149            Text::Object { value } => value,
150        }
151    }
152}
153
154/// Information about a work or edition's subjects.
155#[derive(Deserialize)]
156pub struct OLSubjects {
157    #[serde(default)]
158    pub subjects: Vec<Text>,
159    #[serde(default)]
160    pub subject_people: Vec<Text>,
161    #[serde(default)]
162    pub subject_places: Vec<Text>,
163    #[serde(default)]
164    pub subject_times: Vec<Text>,
165}