1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
//! OpenLibrary author schemas.
use crate::arrow::*;
use crate::cleaning::names::clean_name;
use crate::cleaning::strings::norm_unicode;
use crate::prelude::*;

pub use super::source::OLAuthorSource;
use super::source::Row;

/// An author record in the extracted Parquet.
///
/// [`AuthorProcessor`] writes one of these for every author row it receives.
#[derive(TableRow)]
pub struct AuthorRec {
    /// Sequential identifier assigned by [`AuthorProcessor`]; the first row written gets 1.
    pub id: i32,
    /// The key carried by the source row, copied through unchanged.
    pub key: String,
    /// The source record's `name` after `norm_unicode`, or `None` when the source has no name.
    pub name: Option<String>,
}

/// An author-name record in the extracted Parquet.
///
/// Built by [`author_name_records`]; a single author can yield several of
/// these, one for each name present in its source record.
#[derive(TableRow)]
pub struct AuthorNameRec {
    /// Identifier of the author this name belongs to (the `id` given to
    /// [`author_name_records`]).
    pub id: i32,
    /// ASCII tag for the source field the name came from: `b'n'` for `name`,
    /// `b'p'` for `personal_name`, `b'a'` for an entry of `alternate_names`.
    pub source: u8,
    /// The name after being passed through `clean_name`.
    pub name: String,
}

/// Collect every name attached to an author as cleaned name records.
///
/// Records are produced in a fixed order: the `name` field (tagged `b'n'`),
/// then `personal_name` (tagged `b'p'`), then each entry of `alternate_names`
/// (tagged `b'a'`) in source order. Absent optional fields contribute nothing.
/// Each name is passed through `clean_name`, and every record carries the
/// supplied `id`.
pub fn author_name_records(src: &OLAuthorSource, id: i32) -> Vec<AuthorNameRec> {
    // `Option::iter` yields zero or one item, so the optional fields slot into
    // the same pipeline as the list of alternates.
    let primary = src.name.iter().map(|raw| AuthorNameRec {
        id,
        source: b'n',
        name: clean_name(raw),
    });
    let personal = src.personal_name.iter().map(|raw| AuthorNameRec {
        id,
        source: b'p',
        name: clean_name(raw),
    });
    let alternates = src.alternate_names.iter().map(|raw| AuthorNameRec {
        id,
        source: b'a',
        name: clean_name(raw),
    });

    primary.chain(personal).chain(alternates).collect()
}

/// Process author records into Parquet.
///
/// Each incoming author row produces one [`AuthorRec`] plus zero or more
/// [`AuthorNameRec`]s, written to two separate tables.
pub struct AuthorProcessor {
    /// Most recently assigned author id; starts at 0 and is incremented before each write.
    last_id: i32,
    /// Writer for the per-author table (`authors.parquet`).
    rec_writer: TableWriter<AuthorRec>,
    /// Writer for the per-name table (`author-names.parquet`).
    name_writer: TableWriter<AuthorNameRec>,
}

impl AuthorProcessor {
    pub fn new() -> Result<AuthorProcessor> {
        Ok(AuthorProcessor {
            last_id: 0,
            rec_writer: TableWriter::open("authors.parquet")?,
            name_writer: TableWriter::open("author-names.parquet")?,
        })
    }
}

impl ObjectWriter<Row<OLAuthorSource>> for AuthorProcessor {
    /// Write one author row: a single author record followed by its name records.
    ///
    /// The author receives the next sequential id, which is shared by the
    /// author record and all of its name records.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned by either underlying writer.
    fn write_object(&mut self, row: Row<OLAuthorSource>) -> Result<()> {
        let id = self.last_id + 1;
        self.last_id = id;

        // The author table stores the Unicode-normalized name, if there is one.
        let display_name = row
            .record
            .name
            .as_ref()
            .map(|raw| norm_unicode(raw).into_owned());

        let author = AuthorRec {
            id,
            key: row.key,
            name: display_name,
        };
        self.rec_writer.write_object(author)?;

        let name_rows = author_name_records(&row.record, id);
        for name_row in name_rows {
            self.name_writer.write_object(name_row)?;
        }

        Ok(())
    }

    /// Finish both output tables and report how many authors were written.
    ///
    /// The author table is finished first, then the name table. The returned
    /// count is the last id assigned, i.e. the number of `write_object` calls.
    ///
    /// # Errors
    ///
    /// Propagates the first error returned while finishing either writer.
    fn finish(self) -> Result<usize> {
        // `last_id` starts at 0 and is only ever incremented, so the cast is
        // value-preserving unless the counter has overflowed `i32`.
        let written = self.last_id as usize;

        self.rec_writer.finish()?;
        self.name_writer.finish()?;

        Ok(written)
    }
}