bookdata/goodreads/
review.rs

1//! GoodReads review data model.
2use parquet_derive::ParquetRecordWriter;
3pub use serde::Deserialize;
4
5use crate::arrow::*;
6use crate::ids::index::IdIndex;
7use crate::parsing::dates::*;
8use crate::parsing::*;
9use crate::prelude::*;
10
11use super::ids::load_id_links;
12use super::ids::BookLinkMap;
13use super::users::load_user_index;
14
/// Name of the Parquet file the review table is written to.
const OUT_FILE: &str = "gr-reviews.parquet";
16
17/// Review records we read from JSON.
18#[derive(Deserialize)]
19pub struct RawReview {
20    pub user_id: String,
21    pub book_id: String,
22    pub review_id: String,
23    pub rating: f32,
24    pub review_text: String,
25    pub n_votes: i32,
26    pub date_added: String,
27    pub date_updated: String,
28    pub read_at: String,
29    pub started_at: String,
30}
31
32/// Review records to write to the Parquet table.
33#[derive(ParquetRecordWriter)]
34pub struct ReviewRecord {
35    /// Internal auto-genereated record identifier.
36    pub rec_id: u32,
37    /// Review identifier (derived from input).
38    pub review_id: i64,
39    /// User identifier.
40    pub user_id: i32,
41    /// GoodReads book identifier.
42    pub book_id: i32,
43    /// GoodReads work identifier.
44    pub work_id: Option<i32>,
45    /// Cluster identifier (from [integration clustering][clust]).
46    ///
47    /// [clust]: https://bookdata.piret.info/data/cluster.html
48    pub cluster: i32,
49    /// GoodReads “item” identifier
50    pub item_id: i32,
51    /// Rating associated with this review (if provided).
52    pub rating: Option<f32>,
53    /// Review text.
54    pub review: String,
55    /// Number of votes this review has received.
56    pub n_votes: i32,
57    /// Date review was added.
58    pub added: f32,
59    /// Date review was updated.
60    pub updated: f32,
61}
62
63// Object writer to transform and write GoodReads reviews
64pub struct ReviewWriter {
65    writer: TableWriter<ReviewRecord>,
66    users: IdIndex<String>,
67    books: BookLinkMap,
68    n_recs: u32,
69}
70
71impl ReviewWriter {
72    // Open a new output
73    pub fn open() -> Result<ReviewWriter> {
74        let writer = TableWriter::open(OUT_FILE)?;
75        let users = load_user_index()?.freeze();
76        let books = load_id_links()?;
77        Ok(ReviewWriter {
78            writer,
79            users,
80            books,
81            n_recs: 0,
82        })
83    }
84}
85
86impl DataSink for ReviewWriter {
87    fn output_files(&self) -> Vec<PathBuf> {
88        path_list(&[OUT_FILE])
89    }
90}
91
92impl ObjectWriter<RawReview> for ReviewWriter {
93    // Write a single interaction to the output
94    fn write_object(&mut self, row: RawReview) -> Result<()> {
95        self.n_recs += 1;
96        let rec_id = self.n_recs;
97        let user_id = self.users.intern_owned(row.user_id)?;
98        let book_id: i32 = row.book_id.parse()?;
99        let (rev_hi, rev_lo) = decode_hex_i64_pair(&row.review_id)?;
100        // review ids were checked for dupluicates in interaction scan, don't repeat here
101        let review_id = rev_hi ^ rev_lo;
102        let link = self
103            .books
104            .get(&book_id)
105            .ok_or_else(|| anyhow!("unknown book ID"))?;
106
107        self.writer.write_object(ReviewRecord {
108            rec_id,
109            review_id,
110            user_id,
111            book_id,
112            work_id: link.work_id,
113            item_id: link.item_id(),
114            cluster: link.cluster,
115            review: row.review_text,
116            rating: if row.rating > 0.0 {
117                Some(row.rating)
118            } else {
119                None
120            },
121            n_votes: row.n_votes,
122            added: parse_gr_date(&row.date_added).map(check_ts("added", 2000))?,
123            updated: parse_gr_date(&row.date_updated).map(check_ts("updated", 2000))?,
124        })?;
125
126        Ok(())
127    }
128
129    // Clean up and finalize output
130    fn finish(self) -> Result<usize> {
131        info!(
132            "wrote {} records for {} users, closing output",
133            self.n_recs,
134            self.users.len()
135        );
136        self.writer.finish()
137    }
138}