Skip to content

Commit

Permalink
Merge pull request #569 from YuukiToriyama/release/v0.2.1
Browse files Browse the repository at this point in the history
release/v0.2.1をmainブランチにマージ
  • Loading branch information
YuukiToriyama authored Jan 5, 2025
2 parents 723009e + 86ee44a commit 1a37b5e
Show file tree
Hide file tree
Showing 8 changed files with 130 additions and 166 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.2.0"
version = "0.2.1"
edition = "2021"
description = "A library for processing addresses of Japan"
repository = "https://github.com/YuukiToriyama/japanese-address-parser"
Expand Down
120 changes: 67 additions & 53 deletions core/src/adapter/orthographical_variant_adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,58 +37,61 @@ pub enum OrthographicalVariant {
,
,
,
,
}

impl OrthographicalVariant {
fn value(&self) -> &[char] {
use OrthographicalVariant::*;
match self {
OrthographicalVariant::の => &['の', 'ノ', '之'],
OrthographicalVariant::ツ => &['ツ', 'ッ'],
OrthographicalVariant::ケ => &['ケ', 'ヶ', 'が', 'ガ'],
OrthographicalVariant::薮 => &['薮', '藪', '籔'],
OrthographicalVariant::崎 => &['崎', '﨑'],
OrthographicalVariant::檜 => &['桧', '檜'],
OrthographicalVariant::龍 => &['龍', '竜'],
OrthographicalVariant::竈 => &['竈', '竃', '釜'],
OrthographicalVariant::嶋 => &['嶋', '島'],
OrthographicalVariant::舘 => &['舘', '館'],
OrthographicalVariant::鰺 => &['鰺', '鯵'],
OrthographicalVariant::脊 => &['脊', '背'],
OrthographicalVariant::渕 => &['渕', '淵'],
OrthographicalVariant::己 => &['己', '巳'],
OrthographicalVariant::槇 => &['槇', '槙'],
OrthographicalVariant::治 => &['治', '冶'],
OrthographicalVariant::佛 => &['佛', '仏'],
OrthographicalVariant::澤 => &['澤', '沢'],
OrthographicalVariant::塚 => &['塚', '塚'],
OrthographicalVariant::恵 => &['恵', '惠'],
OrthographicalVariant::穂 => &['穂', '穗'],
OrthographicalVariant::梼 => &['梼', '檮'],
OrthographicalVariant::蛍 => &['蛍', '螢'],
OrthographicalVariant::與 => &['與', '与'],
OrthographicalVariant::瀧 => &['瀧', '滝'],
OrthographicalVariant::籠 => &['籠', '篭'],
OrthographicalVariant::濱 => &['濱', '浜'],
OrthographicalVariant::祗 => &['祗', '祇'],
OrthographicalVariant::曾 => &['曾', '曽'],
OrthographicalVariant::國 => &['國', '国'],
OrthographicalVariant::鉋 => &['鉋', '飽'],
OrthographicalVariant::鷆 => &['鷆', '鷏'],
OrthographicalVariant::斑 => &['斑', '班'],
OrthographicalVariant::櫻 => &['櫻', '桜'],
OrthographicalVariant::櫟 => &['櫟', '擽'],
OrthographicalVariant::冨 => &['冨', '富'],
OrthographicalVariant::諫 => &['諫', '諌'],
の => &['の', 'ノ', '之'],
ツ => &['ツ', 'ッ'],
ケ => &['ケ', 'ヶ', 'が', 'ガ'],
薮 => &['薮', '藪', '籔'],
崎 => &['崎', '﨑'],
檜 => &['桧', '檜'],
龍 => &['龍', '竜'],
竈 => &['竈', '竃', '釜'],
嶋 => &['嶋', '島'],
舘 => &['舘', '館'],
鰺 => &['鰺', '鯵'],
脊 => &['脊', '背'],
渕 => &['渕', '淵'],
己 => &['己', '巳'],
槇 => &['槇', '槙'],
治 => &['治', '冶'],
佛 => &['佛', '仏'],
澤 => &['澤', '沢'],
塚 => &['塚', '塚'],
恵 => &['恵', '惠'],
穂 => &['穂', '穗'],
梼 => &['梼', '檮'],
蛍 => &['蛍', '螢'],
與 => &['與', '与'],
瀧 => &['瀧', '滝'],
籠 => &['籠', '篭'],
濱 => &['濱', '浜'],
祗 => &['祗', '祇'],
曾 => &['曾', '曽'],
國 => &['國', '国'],
鉋 => &['鉋', '飽'],
鷆 => &['鷆', '鷏'],
斑 => &['斑', '班'],
櫻 => &['櫻', '桜'],
櫟 => &['櫟', '擽'],
冨 => &['冨', '富'],
諫 => &['諫', '諌'],
驒 => &['驒', '騨'],
}
}

fn permutations(&self) -> Vec<(char, char)> {
let characters = self.value();
let mut permutations: Vec<(char, char)> = vec![];
for n in 0..characters.len() {
for m in 0..characters.len() {
if n != m {
permutations.push((characters[n], characters[m]));
let mut permutations = Vec::with_capacity(characters.len() * (characters.len() - 1));
for &a in characters {
for &b in characters {
if a != b {
permutations.push((a, b));
}
}
}
Expand All @@ -102,21 +105,32 @@ pub struct OrthographicalVariantAdapter {

impl OrthographicalVariantAdapter {
pub fn apply(self, input: &str, region_name: &str) -> Option<(String, String)> {
let variants = self.filter_variants(input);
if variants.is_empty() {
return None;
}
self.match_with_variants(input, region_name, variants)
}

fn filter_variants(&self, input: &str) -> Vec<&OrthographicalVariant> {
// 必要なパターンのみを選別する
let variant_list: Vec<&OrthographicalVariant> = self
.variant_list
self.variant_list
.iter()
.filter(|v| v.value().iter().any(|&c| input.contains(c)))
.collect();
if variant_list.is_empty() {
return None;
}
.collect()
}

fn match_with_variants(
&self,
input: &str,
target: &str,
variants: Vec<&OrthographicalVariant>,
) -> Option<(String, String)> {
// マッチ候補を容れておくためのVector
let mut candidates: Vec<String> = vec![region_name.to_string()];
let mut candidates = vec![target.to_string()];
// パターンを一つづつ検証していく
for variant in variant_list {
let mut semi_candidates: Vec<String> = vec![];
for variant in variants {
let mut semi_candidates = vec![];
// variantから順列を作成
// ["ケ", "ヶ", "が"] -> (ケ, ヶ), (ケ, が), (ヶ, ケ), (ヶ, が), (が, ケ), (が, ヶ)
for (a, b) in variant.permutations() {
Expand All @@ -125,7 +139,7 @@ impl OrthographicalVariantAdapter {
if input.starts_with(&modified_candidate) {
// マッチすれば早期リターン
return Some((
region_name.to_string(),
target.to_string(),
input
.chars()
.skip(modified_candidate.chars().count())
Expand All @@ -138,7 +152,7 @@ impl OrthographicalVariantAdapter {
}
}
candidates = semi_candidates;
candidates.push(region_name.to_string());
candidates.push(target.to_string());
}
None
}
Expand Down
19 changes: 11 additions & 8 deletions core/src/formatter/chome_with_arabic_numerals.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
use crate::util::converter::JapaneseNumber;

pub(crate) fn format_chome_with_arabic_numerals(target: &str) -> Option<String> {
let chome = if cfg!(target_arch = "wasm32") {
let chome = extract_chome(target)?;
let chome_int = chome.parse::<i8>().ok()?;
Some(target.replacen(&chome, chome_int.to_japanese_form()?.as_str(), 1))
}

fn extract_chome(target: &str) -> Option<String> {
if cfg!(target_arch = "wasm32") {
js_sys::RegExp::new(r"\D+(\d+)丁目", "")
.exec(target)?
.get(1)
.as_string()?
.as_string()
} else {
regex::Regex::new(r"\D+(?<chome>\d+)丁目")
.unwrap()
.captures(target)?
.name("chome")?
.as_str()
.to_string()
};
let chome_int = chome.parse::<i8>().ok()?;
Some(target.replacen(&chome, chome_int.to_japanese_form()?.as_str(), 1))
.name("chome")
.map(|m| m.as_str().to_string())
}
}

#[cfg(all(test, not(target_arch = "wasm32")))]
Expand Down
11 changes: 7 additions & 4 deletions core/src/formatter/fullwidth_character.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/// 文字列中の全角数字を半角数字に修正します
pub(crate) fn format_fullwidth_number(target: &str) -> String {
pub(crate) fn format_fullwidth_numerals(target: &str) -> String {
target
.chars()
.map(|c| match c {
Expand All @@ -20,11 +20,14 @@ pub(crate) fn format_fullwidth_number(target: &str) -> String {

#[cfg(test)]
mod tests {
use crate::formatter::fullwidth_character::format_fullwidth_number;
use crate::formatter::fullwidth_character::format_fullwidth_numerals;

#[test]
fn 全角文字を含む() {
assert_eq!(format_fullwidth_number("京橋1丁目"), "京橋1丁目");
assert_eq!(format_fullwidth_number("京橋3丁目1の1"), "京橋3丁目1の1");
assert_eq!(format_fullwidth_numerals("京橋1丁目"), "京橋1丁目");
assert_eq!(
format_fullwidth_numerals("京橋3丁目1の1"),
"京橋3丁目1の1"
);
}
}
52 changes: 17 additions & 35 deletions core/src/tokenizer/read_city.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,52 +18,34 @@ impl Tokenizer<PrefectureNameFound> {
found.to_string(),
Tokenizer {
tokens: append_token(&self.tokens, Token::City(found.to_string())),
rest: self
.rest
.chars()
.skip(found.chars().count())
.collect::<String>(),
rest: self.rest.chars().skip(found.chars().count()).collect(),
_state: PhantomData::<CityNameFound>,
},
));
}

// ここまでで市区町村名が読み取れない場合は、表記ゆれを含む可能性を検討する
let mut variant_list = vec![OrthographicalVariant::];
match self.get_prefecture_name() {
Some("青森県") => {
variant_list.push(OrthographicalVariant::);
variant_list.push(OrthographicalVariant::);
}
Some("宮城県") => {
variant_list.push(OrthographicalVariant::);
}
Some("茨城県") => {
variant_list.push(OrthographicalVariant::);
variant_list.push(OrthographicalVariant::);
}
Some("東京都") => {
variant_list.push(OrthographicalVariant::);
}
Some("兵庫県") => {
variant_list.push(OrthographicalVariant::);
}
Some("高知県") => {
variant_list.push(OrthographicalVariant::);
}
Some("福岡県") => {
variant_list.push(OrthographicalVariant::);
}
Some("長崎県") => {
variant_list.push(OrthographicalVariant::);
}
_ => {}
use OrthographicalVariant::*;
let mut variant_list = vec![];
if let Some(pref_name) = self.get_prefecture_name() {
variant_list.extend(match pref_name {
"青森県" => vec![,],
"宮城県" => vec![],
"茨城県" => vec![,],
"東京都" => vec![],
"岐阜県" => vec![],
"兵庫県" => vec![],
"高知県" => vec![],
"福岡県" => vec![],
"長崎県" => vec![],
_ => vec![],
});
}
for candidate in candidates {
let adapter = OrthographicalVariantAdapter {
variant_list: variant_list.clone(),
};
if let Some((city_name, rest)) = adapter.apply(self.rest.as_str(), candidate) {
if let Some((city_name, rest)) = adapter.apply(&self.rest, candidate) {
return Ok((
city_name.clone(),
Tokenizer {
Expand Down
Loading

0 comments on commit 1a37b5e

Please sign in to comment.