bookdata/cleaning/names/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135
//! Extract and normalize author names.
//!
//! Names in the book data — both in author records and in their references in
//! book records — come in a variety of formats. This module is responsible for
//! expanding and normalizing those name formats to improve data linkability.
//! Some records also include a year or date range for the author's lifetime. We
//! normalize names as follows:
//!
//! - If the name is “Last, First”, we emit both “Last, First” and “First Last”
//! variants.
//! - If the name has a year, we emit each variant both with and without the
//! year.
//! - Leading and trailing junk (whitespace and periods) is removed.
//!
//! This maximizes our ability to match records across sources recording names
//! in different formats.
//!
//! [`name_variants`] is the primary entry point for using this module. The
//! [`clean_name`] function provides cleanup utilities without parsing, for
//! emitting names from book records.
use anyhow::Result;
use super::strings::norm_unicode;
mod parse;
mod types;
#[cfg(test)]
mod test_cleaning;
#[cfg(test)]
mod test_variants;
pub use types::NameError;
use types::NameFmt;
pub use parse::parse_name_entry;
/// Pre-clean a string without copying.
///
/// Many strings don't need advanced cleaning. This function checks whether
/// trimming the surrounding whitespace is *all* the cleaning `name` needs. If
/// so, it returns the trimmed slice of the input; otherwise it returns `None`
/// and the caller must fall back to full, copying cleanup.
///
/// After trimming, a string is rejected (`None`) if it contains any of:
///
/// - a period (`.`);
/// - ASCII whitespace other than a single plain space, i.e. a tab, newline,
///   carriage return or form feed, or a run of two or more whitespace bytes
///   (full cleanup rewrites every such run as one space, so returning the
///   input unchanged would give a different result);
/// - a comma directly preceded by a space.
fn preclean(name: &str) -> Option<&str> {
    let name = name.trim();
    // true when the previous byte was a (single, plain) space
    let mut after_space = false;
    for c in name.bytes() {
        match c {
            b'.' => return None,
            // a first plain space is fine; remember it for the next byte
            b' ' if !after_space => after_space = true,
            // a second consecutive whitespace byte, or a non-space whitespace
            // byte (tab, newline, ...) that full cleanup would turn into ' '
            _ if c.is_ascii_whitespace() => return None,
            // need cleaning of space and ,
            b',' if after_space => return None,
            _ => after_space = false,
        }
    }
    Some(name)
}
/// Clean up a name from unnecessary special characters.
///
/// The input is first passed through `norm_unicode`. If [`preclean`] accepts
/// the result, its trimmed form is returned directly. Otherwise the name is
/// rebuilt byte by byte, treating every run of ASCII whitespace and periods
/// as a separator:
///
/// - a separator run at the start or end of the name is dropped;
/// - a separator run immediately before a comma is dropped;
/// - any other separator run becomes a single space.
pub fn clean_name<'a>(name: &'a str) -> String {
    let name = norm_unicode(name);

    // Fast path: a plain string copy beats the byte-at-a-time loop below
    // (probably thanks to SIMD), so it pays to scan for already-clean input.
    if let Some(clean) = preclean(&name) {
        return clean.to_string();
    }

    // Slow path: a hand-written state machine, used instead of regular
    // expressions for performance.
    let mut out = Vec::with_capacity(name.len());
    // true while we are inside a separator run that has not been emitted yet
    let mut pending_sep = false;
    for byte in name.bytes() {
        let is_sep = byte == b'.' || byte.is_ascii_whitespace();
        match (pending_sep, is_sep) {
            // a separator run begins
            (false, true) => pending_sep = true,
            // the separator run continues; nothing to emit
            (true, true) => {}
            // ordinary byte outside any run: copy it through
            (false, false) => out.push(byte),
            // the separator run just ended on `byte`
            (true, false) => {
                // collapse the run to one space, except at the very start of
                // the output or directly in front of a comma
                if byte != b',' && !out.is_empty() {
                    out.push(b' ');
                }
                out.push(byte);
                pending_sep = false;
            }
        }
    }

    // SAFETY: `out` holds the bytes of a valid UTF-8 string from which only
    // single-byte ASCII characters were removed, and into which only ASCII
    // spaces were inserted, each right after a removed ASCII byte (i.e. on a
    // character boundary). The result is therefore still valid UTF-8.
    unsafe { String::from_utf8_unchecked(out) }
}
/// Extract all variants from a name.
///
/// The name is parsed with [`parse_name_entry`]; a parse failure is returned
/// as the error. A single-part name yields itself, and a two-part name yields
/// both “First Last” and “Last, First”. If the parsed entry carries a year,
/// every variant is additionally emitted with `, <year>` appended. All
/// variants are then passed through [`clean_name`], sorted, and de-duplicated.
///
/// See the [module documentation][self] for details on this parsing process.
pub fn name_variants(name: &str) -> Result<Vec<String>, NameError> {
    let entry = parse_name_entry(name)?;

    // base forms of the name, without any year
    let mut forms = match entry.name.simplify() {
        NameFmt::Empty => Vec::new(),
        NameFmt::Single(n) => vec![n],
        NameFmt::TwoPart(last, first) => vec![
            format!("{} {}", first, last),
            format!("{}, {}", last, first),
        ],
    };

    // add a copy of every base form with the year appended
    if let Some(year) = entry.year {
        let dated: Vec<String> = forms
            .iter()
            .map(|form| format!("{}, {}", form, year))
            .collect();
        forms.extend(dated);
    }

    // clean each form, then sort so that `dedup` removes all duplicates
    let mut cleaned: Vec<String> = forms.iter().map(|form| clean_name(form)).collect();
    cleaned.sort();
    cleaned.dedup();
    Ok(cleaned)
}