//! Scan GoodReads work records into a Parquet table (bookdata/goodreads/work.rs).
use parquet_derive::ParquetRecordWriter;
use serde::Deserialize;
use crate::arrow::*;
use crate::parsing::*;
use crate::prelude::*;
/// Name of the Parquet file this writer produces.
// `&str` suffices here: const references already have 'static lifetime
// (clippy::redundant_static_lifetimes).
const OUT_FILE: &str = "gr-work-info.parquet";
/// Raw work record as deserialized from the GoodReads source data.
///
/// All fields arrive as strings; the date parts default to empty strings
/// when absent (`#[serde(default)]`) and are parsed later in
/// `WorkWriter::write_object`.
#[derive(Deserialize)]
pub struct RawWork {
// GoodReads work identifier, numeric but transported as a string.
pub work_id: String,
#[serde(default)]
pub original_title: String,
#[serde(default)]
pub original_publication_year: String,
#[serde(default)]
pub original_publication_month: String,
// NOTE(review): parsed from the input but not written to `WorkRecord`
// in this file — confirm the day is intentionally dropped.
#[serde(default)]
pub original_publication_day: String,
}
/// Row schema for the `gr-work-info.parquet` output table.
///
/// Field names and types define the Parquet columns via
/// `ParquetRecordWriter`; optional columns are nullable.
#[derive(ParquetRecordWriter)]
pub struct WorkRecord {
// Work identifier parsed to an integer.
pub work_id: i32,
// Original title, trimmed; None when blank (see `trim_owned` call site).
pub title: Option<String>,
// Original publication year, when parseable.
pub pub_year: Option<i16>,
// Original publication month, when parseable.
pub pub_month: Option<u8>,
}
/// Writer that converts [`RawWork`] rows into the work-info Parquet table.
pub struct WorkWriter {
// Underlying Parquet table writer.
writer: TableWriter<WorkRecord>,
// Count of records written so far, returned by `finish`.
n_recs: usize,
}
impl WorkWriter {
    /// Open the work-info output table for writing.
    ///
    /// # Errors
    /// Propagates any failure from opening the underlying Parquet writer.
    pub fn open() -> Result<WorkWriter> {
        Ok(WorkWriter {
            writer: TableWriter::open(OUT_FILE)?,
            n_recs: 0,
        })
    }
}
impl DataSink for WorkWriter {
/// List the file(s) this sink produces, for pipeline bookkeeping.
fn output_files(&self) -> Vec<PathBuf> {
path_list(&[OUT_FILE])
}
}
impl ObjectWriter<RawWork> for WorkWriter {
    /// Convert one raw work row to a [`WorkRecord`] and append it to the table.
    ///
    /// # Errors
    /// Fails if `work_id` is not a valid integer, if a non-empty year/month
    /// string does not parse, or if the underlying writer fails.
    fn write_object(&mut self, row: RawWork) -> Result<()> {
        // Parse in the same order as before so error precedence is unchanged.
        let work_id: i32 = row.work_id.parse()?;
        let pub_year = parse_opt(&row.original_publication_year)?;
        let pub_month = parse_opt(&row.original_publication_month)?;
        // NOTE(review): `original_publication_day` is not emitted — confirm
        // the day column is intentionally omitted from the schema.
        let record = WorkRecord {
            work_id,
            title: trim_owned(&row.original_title),
            pub_year,
            pub_month,
        };
        self.writer.write_object(record)?;
        self.n_recs += 1;
        Ok(())
    }

    /// Finish the Parquet table and return the number of records written.
    fn finish(self) -> Result<usize> {
        self.writer.finish()?;
        Ok(self.n_recs)
    }
}