diff --git a/Cargo.toml b/Cargo.toml index 07da6ea6..8b41e5ab 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.2.0" +version = "0.2.1" edition = "2021" description = "A library for processing addresses of Japan" repository = "https://github.com/YuukiToriyama/japanese-address-parser" diff --git a/core/src/adapter/orthographical_variant_adapter.rs b/core/src/adapter/orthographical_variant_adapter.rs index 41ef50e6..dacae8bf 100644 --- a/core/src/adapter/orthographical_variant_adapter.rs +++ b/core/src/adapter/orthographical_variant_adapter.rs @@ -37,58 +37,61 @@ pub enum OrthographicalVariant { 櫟, 冨, 諫, + 驒, } impl OrthographicalVariant { fn value(&self) -> &[char] { + use OrthographicalVariant::*; match self { - OrthographicalVariant::の => &['の', 'ノ', '之'], - OrthographicalVariant::ツ => &['ツ', 'ッ'], - OrthographicalVariant::ケ => &['ケ', 'ヶ', 'が', 'ガ'], - OrthographicalVariant::薮 => &['薮', '藪', '籔'], - OrthographicalVariant::崎 => &['崎', '﨑'], - OrthographicalVariant::檜 => &['桧', '檜'], - OrthographicalVariant::龍 => &['龍', '竜'], - OrthographicalVariant::竈 => &['竈', '竃', '釜'], - OrthographicalVariant::嶋 => &['嶋', '島'], - OrthographicalVariant::舘 => &['舘', '館'], - OrthographicalVariant::鰺 => &['鰺', '鯵'], - OrthographicalVariant::脊 => &['脊', '背'], - OrthographicalVariant::渕 => &['渕', '淵'], - OrthographicalVariant::己 => &['己', '巳'], - OrthographicalVariant::槇 => &['槇', '槙'], - OrthographicalVariant::治 => &['治', '冶'], - OrthographicalVariant::佛 => &['佛', '仏'], - OrthographicalVariant::澤 => &['澤', '沢'], - OrthographicalVariant::塚 => &['塚', '塚'], - OrthographicalVariant::恵 => &['恵', '惠'], - OrthographicalVariant::穂 => &['穂', '穗'], - OrthographicalVariant::梼 => &['梼', '檮'], - OrthographicalVariant::蛍 => &['蛍', '螢'], - OrthographicalVariant::與 => &['與', '与'], - OrthographicalVariant::瀧 => &['瀧', '滝'], - OrthographicalVariant::籠 => &['籠', '篭'], - OrthographicalVariant::濱 => &['濱', '浜'], - OrthographicalVariant::祗 => &['祗', '祇'], - OrthographicalVariant::曾 => &['曾', '曽'], - OrthographicalVariant::國 => &['國', '国'], - OrthographicalVariant::鉋 => &['鉋', '飽'], - OrthographicalVariant::鷆 => &['鷆', '鷏'], - OrthographicalVariant::斑 => &['斑', '班'], - OrthographicalVariant::櫻 => &['櫻', '桜'], - OrthographicalVariant::櫟 => &['櫟', '擽'], - OrthographicalVariant::冨 => &['冨', '富'], - OrthographicalVariant::諫 => &['諫', '諌'], + の => &['の', 'ノ', '之'], + ツ => &['ツ', 'ッ'], + ケ => &['ケ', 'ヶ', 'が', 'ガ'], + 薮 => &['薮', '藪', '籔'], + 崎 => &['崎', '﨑'], + 檜 => &['桧', '檜'], + 龍 => &['龍', '竜'], + 竈 => &['竈', '竃', '釜'], + 嶋 => &['嶋', '島'], + 舘 => &['舘', '館'], + 鰺 => &['鰺', '鯵'], + 脊 => &['脊', '背'], + 渕 => &['渕', '淵'], + 己 => &['己', '巳'], + 槇 => &['槇', '槙'], + 治 => &['治', '冶'], + 佛 => &['佛', '仏'], + 澤 => &['澤', '沢'], + 塚 => &['塚', '塚'], + 恵 => &['恵', '惠'], + 穂 => &['穂', '穗'], + 梼 => &['梼', '檮'], + 蛍 => &['蛍', '螢'], + 與 => &['與', '与'], + 瀧 => &['瀧', '滝'], + 籠 => &['籠', '篭'], + 濱 => &['濱', '浜'], + 祗 => &['祗', '祇'], + 曾 => &['曾', '曽'], + 國 => &['國', '国'], + 鉋 => &['鉋', '飽'], + 鷆 => &['鷆', '鷏'], + 斑 => &['斑', '班'], + 櫻 => &['櫻', '桜'], + 櫟 => &['櫟', '擽'], + 冨 => &['冨', '富'], + 諫 => &['諫', '諌'], + 驒 => &['驒', '騨'], } } fn permutations(&self) -> Vec<(char, char)> { let characters = self.value(); - let mut permutations: Vec<(char, char)> = vec![]; - for n in 0..characters.len() { - for m in 0..characters.len() { - if n != m { - permutations.push((characters[n], characters[m])); + let mut permutations = Vec::with_capacity(characters.len() * (characters.len() - 1)); + for &a in characters { + for &b in characters { + if a != b { + permutations.push((a, b)); } } } @@ -102,21 +105,32 @@ pub struct OrthographicalVariantAdapter { impl OrthographicalVariantAdapter { pub fn apply(self, input: &str, region_name: &str) -> Option<(String, String)> { + let variants = self.filter_variants(input); + if variants.is_empty() { + return None; + } + self.match_with_variants(input, region_name, variants) + } + + fn filter_variants(&self, input: &str) -> Vec<&OrthographicalVariant> { // 必要なパターンのみを選別する - let variant_list: Vec<&OrthographicalVariant> = self - .variant_list + self.variant_list .iter() .filter(|v| v.value().iter().any(|&c| input.contains(c))) - .collect(); - if variant_list.is_empty() { - return None; - } + .collect() + } + fn match_with_variants( + &self, + input: &str, + target: &str, + variants: Vec<&OrthographicalVariant>, + ) -> Option<(String, String)> { // マッチ候補を容れておくためのVector - let mut candidates: Vec = vec![region_name.to_string()]; + let mut candidates = vec![target.to_string()]; // パターンを一つづつ検証していく - for variant in variant_list { - let mut semi_candidates: Vec = vec![]; + for variant in variants { + let mut semi_candidates = vec![]; // variantから順列を作成 // ["ケ", "ヶ", "が"] -> (ケ, ヶ), (ケ, が), (ヶ, ケ), (ヶ, が), (が, ケ), (が, ヶ) for (a, b) in variant.permutations() { @@ -125,7 +139,7 @@ impl OrthographicalVariantAdapter { if input.starts_with(&modified_candidate) { // マッチすれば早期リターン return Some(( - region_name.to_string(), + target.to_string(), input .chars() .skip(modified_candidate.chars().count()) @@ -138,7 +152,7 @@ impl OrthographicalVariantAdapter { } } candidates = semi_candidates; - candidates.push(region_name.to_string()); + candidates.push(target.to_string()); } None } diff --git a/core/src/formatter/chome_with_arabic_numerals.rs b/core/src/formatter/chome_with_arabic_numerals.rs index b8610ea9..f27ebb20 100644 --- a/core/src/formatter/chome_with_arabic_numerals.rs +++ b/core/src/formatter/chome_with_arabic_numerals.rs @@ -1,21 +1,24 @@ use crate::util::converter::JapaneseNumber; pub(crate) fn format_chome_with_arabic_numerals(target: &str) -> Option { - let chome = if cfg!(target_arch = "wasm32") { + let chome = extract_chome(target)?; + let chome_int = chome.parse::().ok()?; + Some(target.replacen(&chome, chome_int.to_japanese_form()?.as_str(), 1)) +} + +fn extract_chome(target: &str) -> Option { + if cfg!(target_arch = "wasm32") { js_sys::RegExp::new(r"\D+(\d+)丁目", "") .exec(target)? .get(1) - .as_string()? + .as_string() } else { regex::Regex::new(r"\D+(?\d+)丁目") .unwrap() .captures(target)? - .name("chome")? - .as_str() - .to_string() - }; - let chome_int = chome.parse::().ok()?; - Some(target.replacen(&chome, chome_int.to_japanese_form()?.as_str(), 1)) + .name("chome") + .map(|m| m.as_str().to_string()) + } } #[cfg(all(test, not(target_arch = "wasm32")))] diff --git a/core/src/formatter/fullwidth_character.rs b/core/src/formatter/fullwidth_character.rs index 4a116d52..edef0e83 100644 --- a/core/src/formatter/fullwidth_character.rs +++ b/core/src/formatter/fullwidth_character.rs @@ -1,5 +1,5 @@ /// 文字列中の全角数字を半角数字に修正します -pub(crate) fn format_fullwidth_number(target: &str) -> String { +pub(crate) fn format_fullwidth_numerals(target: &str) -> String { target .chars() .map(|c| match c { @@ -20,11 +20,14 @@ pub(crate) fn format_fullwidth_number(target: &str) -> String { #[cfg(test)] mod tests { - use crate::formatter::fullwidth_character::format_fullwidth_number; + use crate::formatter::fullwidth_character::format_fullwidth_numerals; #[test] fn 全角文字を含む() { - assert_eq!(format_fullwidth_number("京橋1丁目"), "京橋1丁目"); - assert_eq!(format_fullwidth_number("京橋3丁目1の1"), "京橋3丁目1の1"); + assert_eq!(format_fullwidth_numerals("京橋1丁目"), "京橋1丁目"); + assert_eq!( + format_fullwidth_numerals("京橋3丁目1の1"), + "京橋3丁目1の1" + ); } } diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index 048e1a7b..8c6872d0 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -18,52 +18,34 @@ impl Tokenizer { found.to_string(), Tokenizer { tokens: append_token(&self.tokens, Token::City(found.to_string())), - rest: self - .rest - .chars() - .skip(found.chars().count()) - .collect::(), + rest: self.rest.chars().skip(found.chars().count()).collect(), _state: PhantomData::, }, )); } // ここまでで市区町村名が読み取れない場合は、表記ゆれを含む可能性を検討する - let mut variant_list = vec![OrthographicalVariant::ケ]; - match self.get_prefecture_name() { - Some("青森県") => { - variant_list.push(OrthographicalVariant::舘); - variant_list.push(OrthographicalVariant::鰺); - } - Some("宮城県") => { - variant_list.push(OrthographicalVariant::竈); - } - Some("茨城県") => { - variant_list.push(OrthographicalVariant::龍); - variant_list.push(OrthographicalVariant::嶋); - } - Some("東京都") => { - variant_list.push(OrthographicalVariant::檜); - } - Some("兵庫県") => { - variant_list.push(OrthographicalVariant::塚); - } - Some("高知県") => { - variant_list.push(OrthographicalVariant::梼); - } - Some("福岡県") => { - variant_list.push(OrthographicalVariant::恵); - } - Some("長崎県") => { - variant_list.push(OrthographicalVariant::諫); - } - _ => {} + use OrthographicalVariant::*; + let mut variant_list = vec![ケ]; + if let Some(pref_name) = self.get_prefecture_name() { + variant_list.extend(match pref_name { + "青森県" => vec![舘, 鰺], + "宮城県" => vec![竈], + "茨城県" => vec![龍, 嶋], + "東京都" => vec![檜], + "岐阜県" => vec![驒], + "兵庫県" => vec![塚], + "高知県" => vec![梼], + "福岡県" => vec![恵], + "長崎県" => vec![諫], + _ => vec![], + }); } for candidate in candidates { let adapter = OrthographicalVariantAdapter { variant_list: variant_list.clone(), }; - if let Some((city_name, rest)) = adapter.apply(self.rest.as_str(), candidate) { + if let Some((city_name, rest)) = adapter.apply(&self.rest, candidate) { return Ok(( city_name.clone(), Tokenizer { diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index cab113fa..2bac9611 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -3,7 +3,7 @@ use crate::adapter::orthographical_variant_adapter::{ }; use crate::domain::common::token::{append_token, Token}; use crate::formatter::chome_with_arabic_numerals::format_chome_with_arabic_numerals; -use crate::formatter::fullwidth_character::format_fullwidth_number; +use crate::formatter::fullwidth_character::format_fullwidth_numerals; use crate::formatter::house_number::format_house_number; use crate::formatter::informal_town_name_notation::format_informal_town_name_notation; use crate::tokenizer::{CityNameFound, End, Tokenizer, TownNameFound}; @@ -14,28 +14,24 @@ impl Tokenizer { &self, candidates: Vec, ) -> Result<(String, Tokenizer), Tokenizer> { - let mut rest = format_fullwidth_number(&self.rest); + let mut rest = format_fullwidth_numerals(&self.rest); if rest.contains("丁目") { rest = format_chome_with_arabic_numerals(&rest).unwrap_or(rest); } - let (town_name, rest) = match find_town(&rest, &candidates) { - Some(found) => found, - None => { + let (town_name, rest) = find_town(&rest, &candidates) + .or_else(|| { // 「〇〇町L丁目M番N」ではなく「〇〇町L-M-N」と表記されているような場合 - rest = format_informal_town_name_notation(&rest).unwrap_or(rest); - match find_town(&rest, &candidates) { - Some(found) => found, - None => { - // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する - rest = format!("大字{}", rest); - match find_town(&rest, &candidates) { - Some(found) => found, - None => return Err(self.finish()), - } - } + if let Some(it) = format_informal_town_name_notation(&rest) { + rest = it } - } - }; + find_town(&rest, &candidates) + }) + .or_else(|| { + // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する + rest = format!("大字{}", rest); + find_town(&rest, &candidates) + }) + .ok_or_else(|| self.finish())?; Ok(( town_name.clone(), Tokenizer { @@ -57,47 +53,14 @@ fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> if input.starts_with(candidate) { return Some(( candidate.to_string(), - input - .chars() - .skip(candidate.chars().count()) - .collect::(), + input.chars().skip(candidate.chars().count()).collect(), )); } + use OrthographicalVariant::*; let adapter = OrthographicalVariantAdapter { variant_list: vec![ - OrthographicalVariant::の, - OrthographicalVariant::ツ, - OrthographicalVariant::ケ, - OrthographicalVariant::薮, - OrthographicalVariant::崎, - OrthographicalVariant::檜, - OrthographicalVariant::竈, - OrthographicalVariant::舘, - OrthographicalVariant::鰺, - OrthographicalVariant::脊, - OrthographicalVariant::渕, - OrthographicalVariant::己, - OrthographicalVariant::槇, - OrthographicalVariant::治, - OrthographicalVariant::佛, - OrthographicalVariant::澤, - OrthographicalVariant::恵, - OrthographicalVariant::穂, - OrthographicalVariant::梼, - OrthographicalVariant::蛍, - OrthographicalVariant::與, - OrthographicalVariant::瀧, - OrthographicalVariant::籠, - OrthographicalVariant::濱, - OrthographicalVariant::祗, - OrthographicalVariant::曾, - OrthographicalVariant::國, - OrthographicalVariant::鉋, - OrthographicalVariant::鷆, - OrthographicalVariant::斑, - OrthographicalVariant::櫻, - OrthographicalVariant::櫟, - OrthographicalVariant::冨, + の, ツ, ケ, 薮, 崎, 檜, 竈, 舘, 鰺, 脊, 渕, 己, 槇, 治, 佛, 澤, 恵, 穂, 梼, 蛍, 與, + 瀧, 籠, 濱, 祗, 曾, 國, 鉋, 鷆, 斑, 櫻, 櫟, 冨, ], }; if let Some(result) = adapter.apply(input, candidate) { diff --git a/core/src/util/sequence_matcher.rs b/core/src/util/sequence_matcher.rs index 8eaeffb1..c84df353 100644 --- a/core/src/util/sequence_matcher.rs +++ b/core/src/util/sequence_matcher.rs @@ -40,7 +40,7 @@ impl SequenceMatcher { } // 類似度で並び替える candidates.sort_by(|a, b| b.similarity.partial_cmp(&a.similarity).unwrap()); - let highest_similarity = candidates.first().unwrap().similarity; + let highest_similarity = candidates[0].similarity; // 類似度が一位のものだけを抽出する let highest_matches: Vec = candidates .iter() @@ -48,7 +48,7 @@ impl SequenceMatcher { .map(|candidate| candidate.text.clone()) .collect(); match &highest_matches.len() { - 1 => Ok(highest_matches.first().unwrap().clone()), + 1 => Ok(highest_matches[0].clone()), _ => Err(Error::MoreThanOneCandidateExist(highest_matches)), } } @@ -58,8 +58,8 @@ impl SequenceMatcher { possibilities: &[String], threshold: Option, ) -> Vec { - let mut highest_similarity: f64 = 0.0; - let mut highest_matches: Vec = vec![]; + let mut highest_similarity = 0.0; + let mut highest_matches = Vec::with_capacity(possibilities.len()); let length_of_longest_possibility = Self::get_length_of_longest_one(possibilities).unwrap(); let input = Self::cut_text(input, length_of_longest_possibility); for possibility in possibilities { @@ -68,7 +68,7 @@ impl SequenceMatcher { if similarity > highest_similarity { highest_matches.clear(); } - if threshold.is_none() || similarity > threshold.unwrap() { + if similarity > threshold.unwrap_or(0.0) { highest_matches.push(Candidate { similarity, text: possibility.clone(), @@ -85,11 +85,7 @@ impl SequenceMatcher { } fn cut_text(input: &str, length: usize) -> String { - if input.chars().count() > length { - input.chars().take(length).collect::() - } else { - input.to_string() - } + input.chars().take(length).collect() } fn evaluate_match_ratio(left: &str, right: &str) -> f64 { diff --git "a/tests/test_data/\345\270\202\345\214\272\347\224\272\346\235\221\345\220\215\343\203\254\343\203\231\343\203\253\343\201\247\343\201\256\350\241\250\350\250\230\343\202\206\343\202\214.csv" "b/tests/test_data/\345\270\202\345\214\272\347\224\272\346\235\221\345\220\215\343\203\254\343\203\231\343\203\253\343\201\247\343\201\256\350\241\250\350\250\230\343\202\206\343\202\214.csv" index 2c3c81d4..747775c2 100644 --- "a/tests/test_data/\345\270\202\345\214\272\347\224\272\346\235\221\345\220\215\343\203\254\343\203\231\343\203\253\343\201\247\343\201\256\350\241\250\350\250\230\343\202\206\343\202\214.csv" +++ "b/tests/test_data/\345\270\202\345\214\272\347\224\272\346\235\221\345\220\215\343\203\254\343\203\231\343\203\253\343\201\247\343\201\256\350\241\250\350\250\230\343\202\206\343\202\214.csv" @@ -8,6 +8,9 @@ address,prefecture,city,town,rest # 茨城県 茨城県鹿嶋市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1 茨城県鹿島市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1 +# 岐阜県 +岐阜県飛騨市宮川町塩屋104,岐阜県,飛騨市,宮川町塩屋,104 +岐阜県飛驒市宮川町塩屋104,岐阜県,飛騨市,宮川町塩屋,104 # 兵庫県 兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19 兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19