bookdata/cleaning/names/
mod.rs

1//! Extract and normalize author names.
2//!
3//! Names in the book data — both in author records and in their references in
4//! book records — come in a variety of formats.  This module is responsible for
5//! expanding and normalizing those name formats to improve data linkability.
6//! Some records also include a year or date range for the author's lifetime. We
7//! normalize names as follows:
8//!
9//! - If the name is “Last, First”, we emit both “Last, First” and “First Last”
10//!   variants.
11//! - If the name has a year, we emit each variant both with and without the
12//!   year.
13//! - Leading and trailing junk is cleaned
14//!
15//! This maximizes our ability to match records across sources recording names
16//! in different formats.
17//!
18//! [`name_variants`] is the primary entry point for using this module.  The
19//! [`clean_name`] function provides cleanup utilities without parsing, for
20//! emitting names from book records.
21
22use anyhow::Result;
23
24use super::strings::norm_unicode;
25
26mod parse;
27mod types;
28
29#[cfg(test)]
30mod test_cleaning;
31#[cfg(test)]
32mod test_variants;
33
34pub use types::NameError;
35use types::NameFmt;
36
37pub use parse::parse_name_entry;
38
/// Pre-clean a string without copying.
///
/// Many strings don't need advanced cleaning.  This function trims the input
/// and then scans it once to decide whether the trimmed slice is already in
/// cleaned form.
///
/// Returns `Some(trimmed)` when the trimmed string can be used as-is, and
/// `None` when it needs the full character-by-character cleaning pass, i.e.
/// when it contains any of:
///
/// - a period,
/// - any ASCII whitespace other than a plain space (tab, newline, …), since
///   the full cleaning pass would rewrite it as a single space,
/// - two or more consecutive spaces,
/// - a comma directly preceded by a space.
fn preclean(name: &str) -> Option<&str> {
    let name = name.trim();

    // true while the previously scanned byte was a single plain space
    let mut after_space = false;
    for c in name.bytes() {
        match c {
            // periods are always removed by the full cleaner
            b'.' => return None,
            // a run of spaces has to be collapsed
            b' ' if after_space => return None,
            b' ' => after_space = true,
            // tabs, newlines, etc. must be rewritten as a plain space
            _ if c.is_ascii_whitespace() => return None,
            // need cleaning of space and ,
            b',' if after_space => return None,
            _ => after_space = false,
        }
    }

    Some(name)
}
65
66/// Clean up a name from unnecessary special characters.
67pub fn clean_name<'a>(name: &'a str) -> String {
68    let name = norm_unicode(name);
69
70    // fast path for pretty clean strings
71    // directly copying a string is faster than our character-by-character copying,
72    // probably due to simd, so it's worth scanning for a fast path.
73    if let Some(pc) = preclean(&name) {
74        return pc.to_string();
75    }
76
77    // we use a manually-coded state machine instead of REs for performance
78    let mut res = Vec::with_capacity(name.len());
79    let mut in_seq = false;
80    for c in name.bytes() {
81        if in_seq {
82            if c.is_ascii_whitespace() || c == b'.' {
83                // no-op
84            } else if c == b',' || res.is_empty() {
85                // emit the comma and proceed
86                res.push(c);
87                in_seq = false;
88            } else {
89                // collapse whitespace sequence and proceed
90                res.push(b' ');
91                res.push(c);
92                in_seq = false;
93            }
94        } else {
95            if c.is_ascii_whitespace() || c == b'.' {
96                in_seq = true;
97            } else {
98                res.push(c);
99            }
100        }
101    }
102    unsafe {
103        // since we have copied bytes, except for ASCII manipulations, this is safe
104        String::from_utf8_unchecked(res)
105    }
106}
107
108/// Extract all variants from a name.
109///
110/// See the [module documentation][self] for details on this parsing process.
111pub fn name_variants(name: &str) -> Result<Vec<String>, NameError> {
112    let parse = parse_name_entry(name)?;
113    let mut variants = Vec::new();
114    match parse.name.simplify() {
115        NameFmt::Empty => (),
116        NameFmt::Single(n) => variants.push(n),
117        NameFmt::TwoPart(last, first) => {
118            variants.push(format!("{} {}", first, last));
119            variants.push(format!("{}, {}", last, first));
120        }
121    };
122
123    // create a version with the year
124    if let Some(y) = parse.year {
125        for i in 0..variants.len() {
126            variants.push(format!("{}, {}", variants[i], y));
127        }
128    }
129
130    let mut variants: Vec<String> = variants.iter().map(|s| clean_name(s)).collect();
131    variants.sort();
132    variants.dedup();
133
134    Ok(variants)
135}