From af730a644378459840db8eae2fa7e08007a03d34 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Tue, 9 Jul 2024 23:17:32 +0900 Subject: [PATCH 01/12] =?UTF-8?q?update:=20#339:=20=E3=80=8C=E8=9B=8D?= =?UTF-8?q?=E6=B1=A0=E3=80=8D=E3=81=A8=E3=80=8C=E8=9E=A2=E6=B1=A0=E3=80=8D?= =?UTF-8?q?=E3=81=AE=E8=A1=A8=E8=A8=98=E3=82=86=E3=82=8C=E3=81=AE=E3=82=B1?= =?UTF-8?q?=E3=83=BC=E3=82=B9=E3=82=92`=E7=95=B0=E5=AD=97=E4=BD=93?= =?UTF-8?q?=E6=97=A7=E5=AD=97=E4=BD=93=E3=81=B8=E3=81=AE=E5=AF=BE=E5=BF=9C?= =?UTF-8?q?.csv`=E3=81=AB=E8=BF=BD=E5=8A=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ...75\223\343\201\270\343\201\256\345\257\276\345\277\234.csv" | 3 +++ 1 file changed, 3 insertions(+) diff --git "a/tests/test_data/\347\225\260\345\255\227\344\275\223\346\227\247\345\255\227\344\275\223\343\201\270\343\201\256\345\257\276\345\277\234.csv" "b/tests/test_data/\347\225\260\345\255\227\344\275\223\346\227\247\345\255\227\344\275\223\343\201\270\343\201\256\345\257\276\345\277\234.csv" index 3b3dbb97..1b80d292 100644 --- "a/tests/test_data/\347\225\260\345\255\227\344\275\223\346\227\247\345\255\227\344\275\223\343\201\270\343\201\256\345\257\276\345\277\234.csv" +++ "b/tests/test_data/\347\225\260\345\255\227\344\275\223\346\227\247\345\255\227\344\275\223\343\201\270\343\201\256\345\257\276\345\277\234.csv" @@ -60,3 +60,6 @@ address,prefecture,city,town,rest # 「道穂」と「道穗」の表記ゆれへの対応 奈良県葛城市南道穗171-3,奈良県,葛城市,南道穗,171-3 奈良県葛城市南道穂171-3,奈良県,葛城市,南道穗,171-3 +# 「螢池」と「蛍池」の表記ゆれへの対応 +大阪府豊中市螢池東町一丁目5番1号,大阪府,豊中市,螢池東町一丁目,5番1号 +大阪府豊中市蛍池東町一丁目5番1号,大阪府,豊中市,螢池東町一丁目,5番1号 From 12362cc056a749c67b2ae2a98108bdca613a5805 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Tue, 9 Jul 2024 23:22:57 +0900 Subject: [PATCH 02/12] =?UTF-8?q?update:=20#339:=20=E3=80=8C=E8=9B=8D?= =?UTF-8?q?=E6=B1=A0=E3=80=8D=E3=81=A8=E3=80=8C=E8=9E=A2=E6=B1=A0=E3=80=8D?= =?UTF-8?q?=E3=81=AE=E8=A1=A8=E8=A8=98=E3=82=86=E3=82=8C=E3=81=AB=E5=AF=BE?= =?UTF-8?q?=E5=BF=9C=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/parser/adapter/orthographical_variant_adapter.rs | 2 ++ core/src/parser/read_town.rs | 1 + 2 files changed, 3 insertions(+) diff --git a/core/src/parser/adapter/orthographical_variant_adapter.rs b/core/src/parser/adapter/orthographical_variant_adapter.rs index 0d69f484..ceed8b3f 100644 --- a/core/src/parser/adapter/orthographical_variant_adapter.rs +++ b/core/src/parser/adapter/orthographical_variant_adapter.rs @@ -28,6 +28,7 @@ pub trait OrthographicalVariants { const 穂: Variant; const 梼: Variant; const 葛: Variant; + const 蛍: Variant; } impl OrthographicalVariants for Variant { @@ -53,6 +54,7 @@ impl OrthographicalVariants for Variant { const 穂: Variant = &["穂", "穗"]; const 梼: Variant = &["梼", "檮"]; const 葛: Variant = &["葛󠄀", "葛"]; + const 蛍: Variant = &["蛍", "螢"]; } pub struct OrthographicalVariantAdapter { diff --git a/core/src/parser/read_town.rs b/core/src/parser/read_town.rs index 3496447d..425d7dd7 100644 --- a/core/src/parser/read_town.rs +++ b/core/src/parser/read_town.rs @@ -57,6 +57,7 @@ fn find_town(input: &String, city: &City) -> Option<(String, String)> { Variant::恵, Variant::穂, Variant::梼, + Variant::蛍, ], }; if let Some(result) = adapter.apply(input, &town.name) { From bb5321d161e485d51ba48bb0e569fd0ebe64a0f0 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Tue, 9 Jul 2024 23:27:14 +0900 Subject: [PATCH 03/12] update-version: 0.1.4 -> 0.1.5 --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d4b4f463..f74ebbde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.1.4" +version = "0.1.5" edition = "2021" description = "A Rust Library to parse japanese addresses." repository = "https://github.com/YuukiToriyama/japanese-address-parser" From a56f462357006bb8c169616a20d37e5d16d0627f Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 12 Jul 2024 21:34:43 +0900 Subject: [PATCH 04/12] =?UTF-8?q?add:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`Tokenizer`=E3=82=92=E5=AE=9A=E7=BE=A9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/lib.rs | 1 + core/src/tokenizer.rs | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 core/src/tokenizer.rs diff --git a/core/src/lib.rs b/core/src/lib.rs index 65733fb8..0f20c837 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -7,4 +7,5 @@ pub mod api; pub mod entity; mod err; pub mod parser; +mod tokenizer; mod util; diff --git a/core/src/tokenizer.rs b/core/src/tokenizer.rs new file mode 100644 index 00000000..65a13e9b --- /dev/null +++ b/core/src/tokenizer.rs @@ -0,0 +1,22 @@ +use std::marker::PhantomData; + +#[derive(Debug)] +pub(crate) struct Init; +#[derive(Debug)] +pub(crate) struct PrefectureNameFound; +#[derive(Debug)] +pub(crate) struct CityNameFound; +#[derive(Debug)] +pub(crate) struct TownNameFound; +#[derive(Debug)] +pub(crate) struct End; + +#[derive(Debug)] +pub struct Tokenizer { + input: String, + pub(crate) prefecture_name: Option, + pub(crate) city_name: Option, + pub(crate) town_name: Option, + pub(crate) rest: String, + _state: PhantomData, +} From 90a5735ae8f7a9484550afa33c37a90ee331b662 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 12 Jul 2024 21:36:31 +0900 Subject: [PATCH 05/12] =?UTF-8?q?add:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`Tokenizer`=E3=82=92=E5=AE=9F=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `core/src/parser/read_prefecture.rs`に対応する --- core/src/tokenizer.rs | 2 + core/src/tokenizer/read_prefecture.rs | 134 ++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) create mode 100644 core/src/tokenizer/read_prefecture.rs diff --git a/core/src/tokenizer.rs b/core/src/tokenizer.rs index 65a13e9b..94b55b95 100644 --- a/core/src/tokenizer.rs +++ b/core/src/tokenizer.rs @@ -1,3 +1,5 @@ +pub(crate) mod read_prefecture; + use std::marker::PhantomData; #[derive(Debug)] diff --git a/core/src/tokenizer/read_prefecture.rs b/core/src/tokenizer/read_prefecture.rs new file mode 100644 index 00000000..dc87548f --- /dev/null +++ b/core/src/tokenizer/read_prefecture.rs @@ -0,0 +1,134 @@ +use std::marker::PhantomData; + +use crate::tokenizer::{End, Init, PrefectureNameFound, Tokenizer}; + +const PREFECTURE_NAME_LIST: [&str; 47] = [ + "北海道", + "青森県", + "岩手県", + "宮城県", + "秋田県", + "山形県", + "福島県", + "茨城県", + "栃木県", + "群馬県", + "埼玉県", + "千葉県", + "東京都", + "神奈川県", + "新潟県", + "富山県", + "石川県", + "福井県", + "山梨県", + "長野県", + "岐阜県", + "静岡県", + "愛知県", + "三重県", + "滋賀県", + "京都府", + "大阪府", + "兵庫県", + "奈良県", + "和歌山県", + "鳥取県", + "島根県", + "岡山県", + "広島県", + "山口県", + "徳島県", + "香川県", + "愛媛県", + "高知県", + "福岡県", + "佐賀県", + "長崎県", + "熊本県", + "大分県", + "宮崎県", + "鹿児島県", + "沖縄県", +]; + +impl Tokenizer { 
+ pub(crate) fn new(input: &str) -> Self { + Self { + input: input.to_string(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: "".to_string(), + _state: PhantomData, + } + } + + pub(crate) fn read_prefecture(self) -> Result, Tokenizer> { + for prefecture_name in PREFECTURE_NAME_LIST { + if self.input.starts_with(prefecture_name) { + return Ok(Tokenizer { + input: self.input.clone(), + prefecture_name: Some(prefecture_name.to_string()), + city_name: None, + town_name: None, + rest: self + .input + .chars() + .skip(prefecture_name.chars().count()) + .collect::(), + _state: PhantomData::, + }); + } + } + Err(Tokenizer { + input: self.input.clone(), + prefecture_name: None, + city_name: None, + town_name: None, + rest: self.input.clone(), + _state: PhantomData::, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::tokenizer::Tokenizer; + + #[test] + fn new() { + let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8"); + assert_eq!(tokenizer.input, "東京都港区芝公園4丁目2-8"); + assert_eq!(tokenizer.prefecture_name, None); + assert_eq!(tokenizer.city_name, None); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, ""); + } + + #[test] + fn read_prefecture_成功() { + let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8"); + let result = tokenizer.read_prefecture(); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "東京都港区芝公園4丁目2-8"); + assert_eq!(tokenizer.prefecture_name, Some("東京都".to_string())); + assert_eq!(tokenizer.city_name, None); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "港区芝公園4丁目2-8"); + } + + #[test] + fn read_prefecture_失敗() { + let tokenizer = Tokenizer::new("東今日都港区芝公園4丁目2-8"); + let result = tokenizer.read_prefecture(); + assert!(result.is_err()); + let tokenizer = result.unwrap_err(); + assert_eq!(tokenizer.input, "東今日都港区芝公園4丁目2-8"); + assert_eq!(tokenizer.prefecture_name, None); + assert_eq!(tokenizer.city_name, None); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "東今日都港区芝公園4丁目2-8".to_string()); + } +} From 883cd8ff84a67abc59b57e6c7e327149646add90 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 12 Jul 2024 22:33:27 +0900 Subject: [PATCH 06/12] =?UTF-8?q?add:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`Tokenizer`=E3=82=92=E5=AE=9F?= =?UTF-8?q?=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `core/src/parser/read_city.rs`に対応する --- core/src/parser.rs | 2 +- core/src/tokenizer.rs | 1 + core/src/tokenizer/read_city.rs | 197 ++++++++++++++++++++++++++++++++ 3 files changed, 199 insertions(+), 1 deletion(-) create mode 100644 core/src/tokenizer/read_city.rs diff --git a/core/src/parser.rs b/core/src/parser.rs index aa28917a..61c9db5c 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -9,7 +9,7 @@ use crate::parser::read_city::read_city; use crate::parser::read_prefecture::read_prefecture; use crate::parser::read_town::read_town; -mod adapter; +pub(crate) mod adapter; mod filter; mod read_city; mod read_house_number; diff --git a/core/src/tokenizer.rs b/core/src/tokenizer.rs index 94b55b95..2220ebfb 100644 --- a/core/src/tokenizer.rs +++ b/core/src/tokenizer.rs @@ -1,3 +1,4 @@ +pub(crate) mod read_city; pub(crate) mod read_prefecture; use std::marker::PhantomData; diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs new file mode 100644 index 00000000..03f79d94 --- /dev/null +++ 
b/core/src/tokenizer/read_city.rs @@ -0,0 +1,197 @@ +use std::marker::PhantomData; + +use crate::parser::adapter::orthographical_variant_adapter::{ + OrthographicalVariantAdapter, OrthographicalVariants, Variant, +}; +use crate::parser::adapter::vague_expression_adapter::VagueExpressionAdapter; +use crate::tokenizer::{CityNameFound, End, PrefectureNameFound, Tokenizer}; + +impl Tokenizer { + pub(crate) fn read_city( + self, + candidates: Vec, + ) -> Result, Tokenizer> { + for candidate in &candidates { + if self.rest.starts_with(candidate) { + return Ok(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: Some(candidate.clone()), + town_name: None, + rest: self + .rest + .chars() + .skip(candidate.chars().count()) + .collect::(), + _state: PhantomData::, + }); + } + let mut variant_list = vec![Variant::ケ]; + match self.prefecture_name.clone().unwrap().as_str() { + "青森県" => { + variant_list.push(Variant::舘); + } + "宮城県" => { + variant_list.push(Variant::竈); + } + "茨城県" => { + variant_list.push(Variant::龍); + variant_list.push(Variant::嶋); + } + "東京都" => { + variant_list.push(Variant::檜); + variant_list.push(Variant::葛); + } + "兵庫県" => { + variant_list.push(Variant::塚); + } + "奈良県" => { + variant_list.push(Variant::葛); + } + "高知県" => { + variant_list.push(Variant::梼); + } + "福岡県" => { + variant_list.push(Variant::恵); + } + _ => {} + } + let adapter = OrthographicalVariantAdapter { variant_list }; + if let Some(result) = adapter.apply(self.rest.as_str(), candidate) { + return Ok(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: Some(result.1), + town_name: None, + rest: result.0, + _state: PhantomData::, + }); + } + } + + // ここまでで市町村名の特定ができない場合はVagueExpressionAdapterを使用して市町村名を推測する + let vague_expression_adapter = VagueExpressionAdapter {}; + if let Some(result) = vague_expression_adapter.apply(self.rest.as_str(), &candidates) { + return Ok(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: Some(result.1), + town_name: None, + rest: result.0, + _state: PhantomData::, + }); + } + + Err(Tokenizer { + input: self.input.clone(), + prefecture_name: self.prefecture_name, + city_name: None, + town_name: None, + rest: self.rest, + _state: PhantomData::, + }) + } +} + +#[cfg(test)] +mod tests { + use crate::tokenizer::{PrefectureNameFound, Tokenizer}; + use std::marker::PhantomData; + + #[test] + fn read_city_成功() { + let tokenizer = Tokenizer { + input: "神奈川県横浜市保土ケ谷区川辺町2番地9".to_string(), + prefecture_name: Some("神奈川県".to_string()), + city_name: None, + town_name: None, + rest: "横浜市保土ケ谷区川辺町2番地9".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city(vec![ + "横浜市保土ケ谷区".to_string(), + "横浜市鶴見区".to_string(), + "横浜市西区".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "神奈川県横浜市保土ケ谷区川辺町2番地9"); + assert_eq!(tokenizer.prefecture_name, Some("神奈川県".to_string())); + assert_eq!(tokenizer.city_name, Some("横浜市保土ケ谷区".to_string())); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "川辺町2番地9"); + } + + #[test] + fn read_city_orthographical_variant_adapterで成功() { + let tokenizer = Tokenizer { + input: "神奈川県横浜市保土ヶ谷区川辺町2番地9".to_string(), // 「ヶ」と「ケ」の表記ゆれ + prefecture_name: Some("神奈川県".to_string()), + city_name: None, + town_name: None, + rest: "横浜市保土ヶ谷区川辺町2番地9".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city(vec![ + "横浜市保土ケ谷区".to_string(), 
+ "横浜市鶴見区".to_string(), + "横浜市西区".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "神奈川県横浜市保土ヶ谷区川辺町2番地9"); + assert_eq!(tokenizer.prefecture_name, Some("神奈川県".to_string())); + assert_eq!(tokenizer.city_name, Some("横浜市保土ケ谷区".to_string())); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "川辺町2番地9"); + } + + #[test] + fn read_city_vague_expression_adapterで成功() { + let tokenizer = Tokenizer { + input: "埼玉県東秩父村大字御堂634番地".to_string(), // 「秩父郡」が省略されている + prefecture_name: Some("埼玉県".to_string()), + city_name: None, + town_name: None, + rest: "東秩父村大字御堂634番地".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city(vec![ + "秩父郡皆野町".to_string(), + "秩父郡長瀞町".to_string(), + "秩父郡小鹿野町".to_string(), + "秩父郡東秩父村".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "埼玉県東秩父村大字御堂634番地"); + assert_eq!(tokenizer.prefecture_name, Some("埼玉県".to_string())); + assert_eq!(tokenizer.city_name, Some("秩父郡東秩父村".to_string())); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "大字御堂634番地"); + } + + #[test] + fn read_city_失敗() { + let tokenizer = Tokenizer { + input: "神奈川県京都市上京区川辺町2番地9".to_string(), + prefecture_name: Some("神奈川県".to_string()), + city_name: None, + town_name: None, + rest: "京都市上京区川辺町2番地9".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_city(vec![ + "横浜市保土ケ谷区".to_string(), + "横浜市鶴見区".to_string(), + "横浜市西区".to_string(), + ]); + assert!(result.is_err()); + let tokenizer = result.unwrap_err(); + assert_eq!(tokenizer.input, "神奈川県京都市上京区川辺町2番地9"); + assert_eq!(tokenizer.prefecture_name, Some("神奈川県".to_string())); + assert_eq!(tokenizer.city_name, None); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, "京都市上京区川辺町2番地9"); + } +} From c71abbd04c68e0596430cf8b61a7feb9df5f227c Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Fri, 12 Jul 2024 23:30:14 +0900 Subject: [PATCH 07/12] =?UTF-8?q?add:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`Tokenizer`=E3=82=92=E5=AE=9F?= =?UTF-8?q?=E8=A3=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `core/src/parser/read_town.rs`に対応 --- core/src/parser.rs | 2 +- core/src/tokenizer.rs | 1 + core/src/tokenizer/read_town.rs | 233 ++++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+), 1 deletion(-) create mode 100644 core/src/tokenizer/read_town.rs diff --git a/core/src/parser.rs b/core/src/parser.rs index 61c9db5c..9e0c97cf 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -10,7 +10,7 @@ use crate::parser::read_prefecture::read_prefecture; use crate::parser::read_town::read_town; pub(crate) mod adapter; -mod filter; +pub(crate) mod filter; mod read_city; mod read_house_number; mod read_prefecture; diff --git a/core/src/tokenizer.rs b/core/src/tokenizer.rs index 2220ebfb..0a4c9af7 100644 --- a/core/src/tokenizer.rs +++ b/core/src/tokenizer.rs @@ -1,5 +1,6 @@ pub(crate) mod read_city; pub(crate) mod read_prefecture; +pub(crate) mod read_town; use std::marker::PhantomData; diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs new file mode 100644 index 00000000..0b79febc --- /dev/null +++ b/core/src/tokenizer/read_town.rs @@ -0,0 +1,233 @@ +use std::marker::PhantomData; + +use crate::parser::adapter::orthographical_variant_adapter::{ + OrthographicalVariantAdapter, 
OrthographicalVariants, Variant, +}; +use crate::parser::filter::fullwidth_character::FullwidthCharacterFilter; +use crate::parser::filter::invalid_town_name_format::InvalidTownNameFormatFilter; +use crate::parser::filter::non_kanji_block_number::NonKanjiBlockNumberFilter; +use crate::parser::filter::Filter; +use crate::tokenizer::{CityNameFound, End, Tokenizer, TownNameFound}; + +impl Tokenizer { + pub(crate) fn read_town( + self, + candidates: Vec, + ) -> Result, Tokenizer> { + let mut rest = FullwidthCharacterFilter {}.apply(self.rest.clone()); + if rest.contains("丁目") { + rest = NonKanjiBlockNumberFilter {}.apply(rest); + } + if let Some(result) = find_town(&rest, &candidates) { + return Ok(Tokenizer { + input: self.input, + prefecture_name: self.prefecture_name, + city_name: self.city_name, + town_name: Some(result.1), + rest: result.0, + _state: PhantomData::, + }); + } + // 「〇〇町L丁目M番N」ではなく「〇〇町L-M-N」と表記されているような場合 + rest = InvalidTownNameFormatFilter {}.apply(rest); + if let Some(result) = find_town(&rest, &candidates) { + return Ok(Tokenizer { + input: self.input, + prefecture_name: self.prefecture_name, + city_name: self.city_name, + town_name: Some(result.1), + rest: result.0, + _state: PhantomData::, + }); + } + // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する + if let Some(result) = find_town(&format!("大字{}", rest), &candidates) { + return Ok(Tokenizer { + input: self.input, + prefecture_name: self.prefecture_name, + city_name: self.city_name, + town_name: Some(result.1), + rest: result.0, + _state: PhantomData::, + }); + } + Err(Tokenizer { + input: self.input, + prefecture_name: self.prefecture_name, + city_name: self.city_name, + town_name: None, + rest: self.rest, + _state: PhantomData::, + }) + } +} + +fn find_town(input: &String, candidates: &Vec) -> Option<(String, String)> { + for candidate in candidates { + if input.starts_with(candidate) { + return Some(( + input + .chars() + .skip(candidate.chars().count()) + .collect::(), + candidate.to_string(), + )); + } + let adapter = OrthographicalVariantAdapter { + variant_list: vec![ + Variant::の, + Variant::ツ, + Variant::ケ, + Variant::薮, + Variant::崎, + Variant::檜, + Variant::舘, + Variant::脊, + Variant::渕, + Variant::己, + Variant::槇, + Variant::治, + Variant::佛, + Variant::澤, + Variant::恵, + Variant::穂, + Variant::梼, + ], + }; + if let Some(result) = adapter.apply(input, &candidate) { + return Some(result); + }; + } + None +} + +#[cfg(test)] +mod tests { + use crate::tokenizer::{CityNameFound, Tokenizer}; + use std::marker::PhantomData; + + #[test] + fn read_town_成功() { + let tokenizer = Tokenizer { + input: "静岡県静岡市清水区旭町6番8号".to_string(), + prefecture_name: Some("静岡県".to_string()), + city_name: Some("静岡市清水区".to_string()), + town_name: None, + rest: "旭町6番8号".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_town(vec![ + "下野緑町".to_string(), + "承元寺町".to_string(), + "旭町".to_string(), + "新丹谷".to_string(), + "三保松原町".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "静岡県静岡市清水区旭町6番8号"); + assert_eq!(tokenizer.prefecture_name.unwrap(), "静岡県"); + assert_eq!(tokenizer.city_name.unwrap(), "静岡市清水区"); + assert_eq!(tokenizer.town_name.unwrap(), "旭町"); + assert_eq!(tokenizer.rest, "6番8号"); + } + + #[test] + fn read_town_orthographical_variant_adapterで成功() { + let tokenizer = Tokenizer { + input: "東京都千代田区一ッ橋二丁目1番".to_string(), // 「ッ」と「ツ」の表記ゆれ + prefecture_name: Some("東京都".to_string()), + city_name: Some("千代田区".to_string()), + town_name: None, + rest: 
"一ッ橋二丁目1番".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_town(vec![ + "神田錦町一丁目".to_string(), + "神田錦町二丁目".to_string(), + "神田錦町三丁目".to_string(), + "一ツ橋一丁目".to_string(), + "一ツ橋二丁目".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "東京都千代田区一ッ橋二丁目1番"); + assert_eq!(tokenizer.prefecture_name.unwrap(), "東京都"); + assert_eq!(tokenizer.city_name.unwrap(), "千代田区"); + assert_eq!(tokenizer.town_name.unwrap(), "一ツ橋二丁目"); + assert_eq!(tokenizer.rest, "1番"); + } + + #[test] + fn read_town_invalid_town_name_format_filterで成功() { + let tokenizer = Tokenizer { + input: "京都府京都市東山区本町22丁目489番".to_string(), + prefecture_name: Some("京都府".to_string()), + city_name: Some("京都市東山区".to_string()), + town_name: None, + rest: "本町22丁目489番".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_town(vec![ + "本町十九丁目".to_string(), + "本町二十丁目".to_string(), + "本町二十一丁目".to_string(), + "本町二十二丁目".to_string(), + "本町新五丁目".to_string(), + "本町新六丁目".to_string(), + ]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "京都府京都市東山区本町22丁目489番"); + assert_eq!(tokenizer.prefecture_name.unwrap(), "京都府"); + assert_eq!(tokenizer.city_name.unwrap(), "京都市東山区"); + assert_eq!(tokenizer.town_name.unwrap(), "本町二十二丁目"); + assert_eq!(tokenizer.rest, "489番"); + } + + #[test] + fn read_town_大字が省略されている場合_成功() { + let tokenizer = Tokenizer { + input: "東京都西多摩郡日の出町平井2780番地".to_string(), // 「大字」が省略されている + prefecture_name: Some("東京都".to_string()), + city_name: Some("西多摩郡日の出町".to_string()), + town_name: None, + rest: "平井2780番地".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_town(vec!["大字大久野".to_string(), "大字平井".to_string()]); + assert!(result.is_ok()); + let tokenizer = result.unwrap(); + assert_eq!(tokenizer.input, "東京都西多摩郡日の出町平井2780番地"); + assert_eq!(tokenizer.prefecture_name.unwrap(), "東京都"); + assert_eq!(tokenizer.city_name.unwrap(), "西多摩郡日の出町"); + assert_eq!(tokenizer.town_name.unwrap(), "大字平井"); + assert_eq!(tokenizer.rest, "2780番地"); + } + + #[test] + fn read_town_失敗() { + let tokenizer = Tokenizer { + input: "静岡県静岡市清水区".to_string(), + prefecture_name: Some("静岡県".to_string()), + city_name: Some("静岡市清水区".to_string()), + town_name: None, + rest: "".to_string(), + _state: PhantomData::, + }; + let result = tokenizer.read_town(vec![ + "下野緑町".to_string(), + "承元寺町".to_string(), + "旭町".to_string(), + "新丹谷".to_string(), + "三保松原町".to_string(), + ]); + assert!(result.is_err()); + let tokenizer = result.unwrap_err(); + assert_eq!(tokenizer.input, "静岡県静岡市清水区"); + assert_eq!(tokenizer.prefecture_name.unwrap(), "静岡県"); + assert_eq!(tokenizer.city_name.unwrap(), "静岡市清水区"); + assert_eq!(tokenizer.town_name, None); + assert_eq!(tokenizer.rest, ""); + } +} From 6ff6b00c082db97983329fb771aac4480adba24f Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 13 Jul 2024 00:37:08 +0900 Subject: [PATCH 08/12] =?UTF-8?q?update:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`core/src/parser.rs`=E3=81=AE=E5=AE=9F=E8=A3=85?= =?UTF-8?q?=E3=82=92`Tokenizer`=E3=83=99=E3=83=BC=E3=82=B9=E3=81=AB?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/parser.rs | 122 +++++++++++++++++++++++++++------------------ 1 file changed, 74 insertions(+), 48 deletions(-) diff --git a/core/src/parser.rs b/core/src/parser.rs index 9e0c97cf..63e36246 100644 --- 
a/core/src/parser.rs +++ b/core/src/parser.rs @@ -5,9 +5,7 @@ use crate::api::AsyncApi; use crate::api::BlockingApi; use crate::entity::{Address, ParseResult}; use crate::err::{Error, ParseErrorKind}; -use crate::parser::read_city::read_city; -use crate::parser::read_prefecture::read_prefecture; -use crate::parser::read_town::read_town; +use crate::tokenizer::Tokenizer; pub(crate) mod adapter; pub(crate) mod filter; @@ -16,6 +14,17 @@ mod read_house_number; mod read_prefecture; mod read_town; +impl From> for Address { + fn from(value: Tokenizer) -> Self { + Self { + prefecture: value.prefecture_name.unwrap_or("".to_string()), + city: value.city_name.unwrap_or("".to_string()), + town: value.town_name.unwrap_or("".to_string()), + rest: value.rest, + } + } +} + /// An asynchronous `Parser` to process addresses. /// /// # Example @@ -69,56 +78,69 @@ impl Parser { /// /// publicにしていますが、直接の使用は推奨されません。[Parser]の利用を検討してください。 pub async fn parse(api: Arc, input: &str) -> ParseResult { + let tokenizer = Tokenizer::new(input); // 都道府県を特定 - let (rest, prefecture_name) = if let Some(result) = read_prefecture(input) { - result - } else { - return ParseResult { - address: Address::new("", "", "", input), - error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), - }; + let tokenizer = match tokenizer.read_prefecture() { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), + } + } }; // その都道府県の市町村名リストを取得 - let prefecture = match api.get_prefecture_master(prefecture_name).await { + let prefecture = match api + .get_prefecture_master(tokenizer.prefecture_name.as_ref().unwrap()) + .await + { Err(error) => { return ParseResult { - address: Address::new(prefecture_name, "", "", rest), + address: Address::from(tokenizer), error: Some(error), }; } Ok(result) => result, }; // 市町村名を特定 - let (rest, city_name) = if let Some(result) = read_city(rest, prefecture) { - result - } else { - return ParseResult { - address: Address::new(prefecture_name, "", "", rest), - error: Some(Error::new_parse_error(ParseErrorKind::City)), - }; + let tokenizer = match tokenizer.read_city(prefecture.cities) { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::City)), + } + } }; // その市町村の町名リストを取得 - let city = match api.get_city_master(prefecture_name, &city_name).await { + let city = match api + .get_city_master( + tokenizer.prefecture_name.as_ref().unwrap(), + tokenizer.city_name.as_ref().unwrap(), + ) + .await + { Err(error) => { return ParseResult { - address: Address::new(prefecture_name, &city_name, "", &rest), + address: Address::from(tokenizer), error: Some(error), }; } Ok(result) => result, }; // 町名を特定 - let (rest, town_name) = if let Some(result) = read_town(&rest, &city) { - result - } else { - return ParseResult { - address: Address::new(prefecture_name, &city_name, "", &rest), - error: Some(Error::new_parse_error(ParseErrorKind::Town)), - }; + let tokenizer = match tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::Town)), + }; + } }; ParseResult { - address: Address::new(prefecture_name, &city_name, &town_name, &rest), + address: Address::from(tokenizer), error: None, } } @@ -226,54 +248,58 @@ mod 
tests { /// publicにしていますが、直接の使用は推奨されません。[Parser]の利用を検討してください。 #[cfg(feature = "blocking")] pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { - let (rest, prefecture_name) = match read_prefecture(input) { - None => { + let tokenizer = Tokenizer::new(input); + let tokenizer = match tokenizer.read_prefecture() { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { return ParseResult { - address: Address::new("", "", "", input), + address: Address::from(tokenizer), error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), - }; + } } - Some(result) => result, }; - let prefecture = match api.get_prefecture_master(prefecture_name) { + let prefecture = match api.get_prefecture_master(tokenizer.prefecture_name.as_ref().unwrap()) { Err(error) => { return ParseResult { - address: Address::new(prefecture_name, "", "", rest), + address: Address::from(tokenizer), error: Some(error), }; } Ok(result) => result, }; - let (rest, city_name) = match read_city(rest, prefecture) { - None => { + let tokenizer = match tokenizer.read_city(prefecture.cities) { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { return ParseResult { - address: Address::new(prefecture_name, "", "", rest), + address: Address::from(tokenizer), error: Some(Error::new_parse_error(ParseErrorKind::City)), - }; + } } - Some(result) => result, }; - let city = match api.get_city_master(prefecture_name, &city_name) { + let city = match api.get_city_master( + tokenizer.prefecture_name.as_ref().unwrap(), + tokenizer.city_name.as_ref().unwrap(), + ) { Err(error) => { return ParseResult { - address: Address::new(prefecture_name, &city_name, "", &rest), + address: Address::from(tokenizer), error: Some(error), }; } Ok(result) => result, }; - let (rest, town_name) = match read_town(&rest, &city) { - None => { + let tokenizer = match tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) { + Ok(tokenizer) => tokenizer, + Err(tokenizer) => { return ParseResult { - address: Address::new(prefecture_name, &city_name, "", &rest), + address: Address::from(tokenizer), error: Some(Error::new_parse_error(ParseErrorKind::Town)), }; } - Some(result) => result, }; ParseResult { - address: Address::new(prefecture_name, &city_name, &town_name, &rest), + address: Address::from(tokenizer), error: None, } } From e07415513b0297b007b9f4e47ea38eb72c610c81 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 13 Jul 2024 00:48:57 +0900 Subject: [PATCH 09/12] =?UTF-8?q?delete:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`core/src/parser`=E3=81=AE`read=5Fprefecture.rs`,?= =?UTF-8?q?=20`read=5Fcity.rs`,=20`read=5Ftown.rs`=E3=82=92=E5=89=8A?= =?UTF-8?q?=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Tokenizer`への置き換えに伴い、使用箇所がなくなったため --- core/src/parser.rs | 3 - core/src/parser/read_city.rs | 86 ------------- core/src/parser/read_prefecture.rs | 79 ------------ core/src/parser/read_town.rs | 196 ----------------------------- 4 files changed, 364 deletions(-) delete mode 100644 core/src/parser/read_city.rs delete mode 100644 core/src/parser/read_prefecture.rs delete mode 100644 core/src/parser/read_town.rs diff --git a/core/src/parser.rs b/core/src/parser.rs index 63e36246..134ecbaa 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -9,10 +9,7 @@ use crate::tokenizer::Tokenizer; pub(crate) mod adapter; pub(crate) mod filter; -mod read_city; mod read_house_number; -mod read_prefecture; 
-mod read_town; impl From> for Address { fn from(value: Tokenizer) -> Self { diff --git a/core/src/parser/read_city.rs b/core/src/parser/read_city.rs deleted file mode 100644 index 48a5d280..00000000 --- a/core/src/parser/read_city.rs +++ /dev/null @@ -1,86 +0,0 @@ -use crate::entity::Prefecture; -use crate::parser::adapter::orthographical_variant_adapter::{ - OrthographicalVariantAdapter, OrthographicalVariants, Variant, -}; -use crate::parser::adapter::vague_expression_adapter::VagueExpressionAdapter; -use nom::bytes::complete::tag; -use nom::error::VerboseError; -use nom::Parser; - -pub fn read_city(input: &str, prefecture: Prefecture) -> Option<(String, String)> { - for city_name in &prefecture.cities { - if let Ok((rest, city_name)) = tag::<&str, &str, VerboseError<&str>>(city_name).parse(input) - { - return Some((rest.to_string(), city_name.to_string())); - } - let mut variant_list = vec![Variant::ケ]; - match prefecture.name.as_str() { - "青森県" => { - variant_list.push(Variant::舘); - } - "宮城県" => { - variant_list.push(Variant::竈); - } - "茨城県" => { - variant_list.push(Variant::龍); - variant_list.push(Variant::嶋); - } - "東京都" => { - variant_list.push(Variant::檜); - variant_list.push(Variant::葛); - } - "兵庫県" => { - variant_list.push(Variant::塚); - } - "奈良県" => { - variant_list.push(Variant::葛); - } - "高知県" => { - variant_list.push(Variant::梼); - } - "福岡県" => { - variant_list.push(Variant::恵); - } - _ => {} - } - let adapter = OrthographicalVariantAdapter { variant_list }; - if let Some(result) = adapter.apply(input, city_name) { - return Some(result); - } - } - - // ここまでで市町村名の特定ができない場合はVagueExpressionAdapterを使用して市町村名を推測する - let vague_expression_adapter = VagueExpressionAdapter {}; - if let Some(result) = vague_expression_adapter.apply(input, &prefecture.cities) { - return Some(result); - } - - None -} - -#[cfg(all(test, feature = "blocking"))] -mod tests { - use crate::api::BlockingApi; - use crate::parser::read_city::read_city; - use test_case::test_case; - - #[test_case("京都府", "京都市山科区椥辻池尻町14-2", "京都市山科区"; "success_京都市山科区")] - #[test_case("神奈川県", "茅ヶ崎市香川5丁目1", "茅ヶ崎市"; "success_茅ヶ崎市")] - #[test_case("神奈川県", "茅ケ崎市香川5丁目1", "茅ヶ崎市"; "success_茅ケ崎市_表記ゆれ")] - #[test_case("神奈川県", "横浜市保土ケ谷区川辺町2番地9", "横浜市保土ケ谷区"; "success_横浜市保土ケ谷区")] - #[test_case("神奈川県", "横浜市保土ヶ谷区川辺町2番地9", "横浜市保土ケ谷区"; "success_横浜市保土ヶ谷区_表記ゆれ")] - #[test_case("岐阜県", "不破郡関ケ原町大字関ケ原894番地の58", "不破郡関ケ原町"; "success_不破郡関ケ原町")] - #[test_case("岐阜県", "不破郡関が原町大字関ケ原894番地の58", "不破郡関ケ原町"; "success_不破郡関が原町_表記ゆれ")] - #[test_case("茨城県", "龍ヶ崎市佐貫町647", "龍ヶ崎市"; "success_龍ヶ崎市")] - #[test_case("茨城県", "龍ケ崎市佐貫町647", "龍ヶ崎市"; "success_龍ケ崎市_表記ゆれ")] - #[test_case("茨城県", "竜ヶ崎市佐貫町647", "龍ヶ崎市"; "success_竜ヶ崎市_表記ゆれ")] - #[test_case("茨城県", "竜ケ崎市佐貫町647", "龍ヶ崎市"; "success_竜ケ崎市_表記ゆれ")] - #[test_case("群馬県", "みなかみ町後閑318", "利根郡みなかみ町"; "success_利根郡みなかみ町_郡名が省略されている")] - #[test_case("埼玉県", "東秩父村大字御堂634番地", "秩父郡東秩父村"; "success_秩父郡東秩父村_郡名が省略されている")] - fn test_read_city(prefecture_name: &str, input: &str, expected: &str) { - let api: BlockingApi = Default::default(); - let prefecture = api.get_prefecture_master(prefecture_name).unwrap(); - let (_, city_name) = read_city(input, prefecture).unwrap(); - assert_eq!(city_name, expected); - } -} diff --git a/core/src/parser/read_prefecture.rs b/core/src/parser/read_prefecture.rs deleted file mode 100644 index 7908e5d3..00000000 --- a/core/src/parser/read_prefecture.rs +++ /dev/null @@ -1,79 +0,0 @@ -use nom::bytes::complete::tag; -use nom::error::VerboseError; -use nom::Parser; - -const PREFECTURE_NAME_LIST: [&str; 47] = [ - "北海道", - 
"青森県", - "岩手県", - "宮城県", - "秋田県", - "山形県", - "福島県", - "茨城県", - "栃木県", - "群馬県", - "埼玉県", - "千葉県", - "東京都", - "神奈川県", - "新潟県", - "富山県", - "石川県", - "福井県", - "山梨県", - "長野県", - "岐阜県", - "静岡県", - "愛知県", - "三重県", - "滋賀県", - "京都府", - "大阪府", - "兵庫県", - "奈良県", - "和歌山県", - "鳥取県", - "島根県", - "岡山県", - "広島県", - "山口県", - "徳島県", - "香川県", - "愛媛県", - "高知県", - "福岡県", - "佐賀県", - "長崎県", - "熊本県", - "大分県", - "宮崎県", - "鹿児島県", - "沖縄県", -]; - -pub fn read_prefecture(input: &str) -> Option<(&str, &str)> { - for prefecture_name in PREFECTURE_NAME_LIST { - if let Ok(result) = tag::<&str, &str, VerboseError<&str>>(prefecture_name).parse(input) { - return Some(result); - } - } - None -} - -#[cfg(test)] -mod parser_tests { - use crate::parser::read_prefecture::read_prefecture; - - #[test] - fn read_prefecture_成功_東京都() { - let (rest, prefecture) = read_prefecture("東京都港区芝公園4丁目2-8").unwrap(); - assert_eq!(rest, "港区芝公園4丁目2-8"); - assert_eq!(prefecture, "東京都".to_string()); - } - - #[test] - fn read_prefecture_失敗_都道府県名が誤っている() { - assert_eq!(read_prefecture("東今日都港区芝公園4丁目2-8"), None); - } -} diff --git a/core/src/parser/read_town.rs b/core/src/parser/read_town.rs deleted file mode 100644 index 3496447d..00000000 --- a/core/src/parser/read_town.rs +++ /dev/null @@ -1,196 +0,0 @@ -use nom::bytes::complete::tag; -use nom::error::VerboseError; -use nom::Parser; - -use crate::entity::City; -use crate::parser::adapter::orthographical_variant_adapter::{ - OrthographicalVariantAdapter, OrthographicalVariants, Variant, -}; -use crate::parser::filter::fullwidth_character::FullwidthCharacterFilter; -use crate::parser::filter::invalid_town_name_format::InvalidTownNameFormatFilter; -use crate::parser::filter::non_kanji_block_number::NonKanjiBlockNumberFilter; -use crate::parser::filter::Filter; - -pub fn read_town(input: &str, city: &City) -> Option<(String, String)> { - let mut input: String = FullwidthCharacterFilter {}.apply(input.to_string()); - if input.contains("丁目") { - input = NonKanjiBlockNumberFilter {}.apply(input); - } - if let Some(result) = find_town(&input, city) { - return Some(result); - } - // 「〇〇町L丁目M番N」ではなく「〇〇町L-M-N」と表記されているような場合 - input = InvalidTownNameFormatFilter {}.apply(input); - if let Some(result) = find_town(&input, city) { - return Some(result); - } - // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する - if let Some(result) = find_town(&format!("大字{}", input), city) { - return Some(result); - } - None -} - -fn find_town(input: &String, city: &City) -> Option<(String, String)> { - for town in &city.towns { - if let Ok((rest, town_name)) = - tag::<&str, &str, VerboseError<&str>>(town.name.as_str()).parse(input) - { - return Some((rest.to_string(), town_name.to_string())); - } - let adapter = OrthographicalVariantAdapter { - variant_list: vec![ - Variant::の, - Variant::ツ, - Variant::ケ, - Variant::薮, - Variant::崎, - Variant::檜, - Variant::舘, - Variant::脊, - Variant::渕, - Variant::己, - Variant::槇, - Variant::治, - Variant::佛, - Variant::澤, - Variant::恵, - Variant::穂, - Variant::梼, - ], - }; - if let Some(result) = adapter.apply(input, &town.name) { - return Some(result); - }; - } - None -} - -#[cfg(all(test, feature = "blocking"))] -mod tests { - use crate::api::BlockingApi; - use crate::entity::{City, Town}; - use crate::parser::read_town::read_town; - - #[test] - fn read_town_成功_静岡市清水区旭町() { - let city = City { - name: "静岡市清水区".to_string(), - towns: vec![ - Town::new("旭町", "", 35.016292, 138.489362), - Town::new("新丹谷", "", 35.072403, 138.474199), - ], - }; - let (rest, town) = read_town("旭町6-8", &city).unwrap(); - 
assert_eq!(rest, "6-8"); - assert_eq!(town, "旭町".to_string()); - } - - #[test] - fn read_town_失敗_町名がない場合() { - let city = City { - name: "静岡市清水区".to_string(), - towns: vec![], - }; - assert_eq!(read_town("旭町6-8", &city), None); - } - - #[test] - fn read_town_表記ゆれ_東京都千代田区丸の内() { - let city = generate_city_東京都千代田区(); - let (rest, town) = read_town("丸ノ内一丁目9", &city).unwrap(); - assert_eq!(rest, "9"); - assert_eq!(town, "丸の内一丁目"); - } - - #[test] - fn read_town_表記ゆれ_東京都千代田区一ツ橋() { - let city = generate_city_東京都千代田区(); - let (rest, town) = read_town("一ッ橋二丁目1番", &city).unwrap(); - assert_eq!(rest, "1番"); - assert_eq!(town, "一ツ橋二丁目"); - } - - fn generate_city_東京都千代田区() -> City { - City { - name: "千代田区".to_string(), - towns: vec![ - Town::new("富士見一丁目", "", 35.697871, 139.746978), - Town::new("富士見二丁目", "", 35.698126, 139.743057), - Town::new("丸の内一丁目", "", 35.68156, 139.767201), - Town::new("一ツ橋一丁目", "", 35.691189, 139.757119), - Town::new("一ツ橋二丁目", "", 35.693171, 139.757346), - ], - } - } - - #[test] - fn read_town_表記ゆれ_京都府京都市左京区松ケ崎杉ケ海道町() { - let city = generate_city_京都府京都市左京区(); - let (rest, town) = read_town("松ヶ崎杉ヶ海道町1", &city).unwrap(); - assert_eq!(rest, "1"); - assert_eq!(town, "松ケ崎杉ケ海道町"); - } - - fn generate_city_京都府京都市左京区() -> City { - City { - name: "京都市左京区".to_string(), - towns: vec![ - Town::new("松ケ崎杉ケ海道町", "", 35.047438, 135.779877), - Town::new("松ケ崎西池ノ内町", "", 35.054046, 135.773686), - Town::new("松ケ崎井出ケ鼻町", "", 35.056292, 135.790852), - ], - } - } - - #[test] - fn read_town_異字体_岐阜県岐阜市薮田南二丁目() { - let city = City { - name: "岐阜県岐阜市".to_string(), - towns: vec![ - Town::new("薮田南一丁目", "", 35.394373, 136.723208), - Town::new("薮田南二丁目", "", 35.391964, 136.723151), - Town::new("薮田南三丁目", "", 35.3896, 136.723086), - ], - }; - let (_, town) = read_town("薮田南二丁目", &city).unwrap(); - assert_eq!(town, "薮田南二丁目"); - let (_, town) = read_town("藪田南二丁目", &city).unwrap(); - assert_eq!(town, "薮田南二丁目"); - let (_, town) = read_town("籔田南二丁目", &city).unwrap(); - assert_eq!(town, "薮田南二丁目"); - } - - #[test] - fn read_town_丁目が算用数字の場合_京都府京都市東山区n丁目() { - let client: BlockingApi = Default::default(); - let city = client.get_city_master("京都府", "京都市東山区").unwrap(); - let test_cases = vec![ - ("本町1丁目45番", "本町一丁目"), - ("本町2丁目64番", "本町二丁目"), - ("本町10丁目169番", "本町十丁目"), - ("本町12丁目224番", "本町十二丁目"), - ("本町20丁目435番", "本町二十丁目"), - ("本町22丁目489番", "本町二十二丁目"), - ]; - for (input, town_name) in test_cases { - let (_, town) = read_town(input, &city).unwrap(); - assert_eq!(town, town_name); - } - } - - #[test] - fn read_town_大字の省略_東京都西多摩郡日の出町大字平井() { - let blocking_api: BlockingApi = Default::default(); - let city = blocking_api - .get_city_master("東京都", "西多摩郡日の出町") - .unwrap(); - - let (rest, town) = read_town("大字平井2780番地", &city).unwrap(); - assert_eq!(town, "大字平井"); - assert_eq!(rest, "2780番地"); - let (rest, town) = read_town("平井2780番地", &city).unwrap(); - assert_eq!(town, "大字平井"); - assert_eq!(rest, "2780番地"); - } -} From b12f351a00eae9fbd163547b40c52cb37c36d9e9 Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 13 Jul 2024 10:41:02 +0900 Subject: [PATCH 10/12] =?UTF-8?q?fix:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20cargo=20clippy=E3=81=AE=E8=AD=A6=E5=91=8A=E3=82=92?= =?UTF-8?q?=E4=BF=AE=E6=AD=A3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - `&String`を`&str`に変更 - 必要でない借用を解消 --- core/src/tokenizer/read_town.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index 0b79febc..65e69048 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -62,7 +62,7 @@ impl Tokenizer { } } -fn find_town(input: &String, candidates: &Vec) -> Option<(String, String)> { +fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> { for candidate in candidates { if input.starts_with(candidate) { return Some(( @@ -94,7 +94,7 @@ fn find_town(input: &String, candidates: &Vec) -> Option<(String, String Variant::梼, ], }; - if let Some(result) = adapter.apply(input, &candidate) { + if let Some(result) = adapter.apply(input, candidate) { return Some(result); }; } From ca31071389aabde17570d6b2a677f3e76d647b7f Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 13 Jul 2024 11:09:10 +0900 Subject: [PATCH 11/12] =?UTF-8?q?fix:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`Tokenizer#read=5Fhoge`=E3=81=AE=E7=B5=90=E6=9E=9C?= =?UTF-8?q?=E3=82=92match=E5=BC=8F=E3=81=A7=E8=A9=95=E4=BE=A1=E3=81=97?= =?UTF-8?q?=E3=81=A6=E3=81=84=E3=81=9F=E3=81=AE=E3=82=92let=20else?= =?UTF-8?q?=E5=BC=8F=E3=81=A7=E6=9B=B8=E3=81=8D=E7=9B=B4=E3=81=97=E3=81=9F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- core/src/parser.rs | 80 +++++++++++---------------- core/src/tokenizer/read_city.rs | 6 +- core/src/tokenizer/read_prefecture.rs | 8 +-- core/src/tokenizer/read_town.rs | 28 +++++----- 4 files changed, 53 insertions(+), 69 deletions(-) diff --git a/core/src/parser.rs b/core/src/parser.rs index 134ecbaa..2fd29b47 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -77,14 +77,11 @@ impl Parser { pub async fn parse(api: Arc, input: &str) -> ParseResult { let tokenizer = Tokenizer::new(input); // 都道府県を特定 - let tokenizer = match tokenizer.read_prefecture() { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), - } - } + let Ok(tokenizer) = tokenizer.read_prefecture() else { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), + }; }; // その都道府県の市町村名リストを取得 let prefecture = match api @@ -100,14 +97,11 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { Ok(result) => result, }; // 市町村名を特定 - let tokenizer = match tokenizer.read_city(prefecture.cities) { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::City)), - } - } + let Ok(tokenizer) = tokenizer.read_city(prefecture.cities) else { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::City)), + }; }; // その市町村の町名リストを取得 let city = match api @@ -126,14 +120,12 @@ pub async fn parse(api: Arc, input: &str) -> ParseResult { Ok(result) => result, }; // 町名を特定 - let tokenizer = match tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::Town)), - }; - } + let Ok(tokenizer) = tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) + else { + return ParseResult { + address: Address::from(tokenizer), + error: 
Some(Error::new_parse_error(ParseErrorKind::Town)), + }; }; ParseResult { @@ -246,14 +238,11 @@ mod tests { #[cfg(feature = "blocking")] pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { let tokenizer = Tokenizer::new(input); - let tokenizer = match tokenizer.read_prefecture() { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), - } - } + let Ok(tokenizer) = tokenizer.read_prefecture() else { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::Prefecture)), + }; }; let prefecture = match api.get_prefecture_master(tokenizer.prefecture_name.as_ref().unwrap()) { Err(error) => { @@ -264,14 +253,11 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } Ok(result) => result, }; - let tokenizer = match tokenizer.read_city(prefecture.cities) { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::City)), - } - } + let Ok(tokenizer) = tokenizer.read_city(prefecture.cities) else { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::City)), + }; }; let city = match api.get_city_master( tokenizer.prefecture_name.as_ref().unwrap(), @@ -285,14 +271,12 @@ pub fn parse_blocking(api: Arc, input: &str) -> ParseResult { } Ok(result) => result, }; - let tokenizer = match tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) { - Ok(tokenizer) => tokenizer, - Err(tokenizer) => { - return ParseResult { - address: Address::from(tokenizer), - error: Some(Error::new_parse_error(ParseErrorKind::Town)), - }; - } + let Ok(tokenizer) = tokenizer.read_town(city.towns.iter().map(|x| x.name.clone()).collect()) + else { + return ParseResult { + address: Address::from(tokenizer), + error: Some(Error::new_parse_error(ParseErrorKind::Town)), + }; }; ParseResult { diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index 03f79d94..67a11114 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -8,7 +8,7 @@ use crate::tokenizer::{CityNameFound, End, PrefectureNameFound, Tokenizer}; impl Tokenizer { pub(crate) fn read_city( - self, + &self, candidates: Vec, ) -> Result, Tokenizer> { for candidate in &candidates { @@ -84,10 +84,10 @@ impl Tokenizer { Err(Tokenizer { input: self.input.clone(), - prefecture_name: self.prefecture_name, + prefecture_name: self.prefecture_name.clone(), city_name: None, town_name: None, - rest: self.rest, + rest: self.rest.clone(), _state: PhantomData::, }) } diff --git a/core/src/tokenizer/read_prefecture.rs b/core/src/tokenizer/read_prefecture.rs index dc87548f..b793b9a4 100644 --- a/core/src/tokenizer/read_prefecture.rs +++ b/core/src/tokenizer/read_prefecture.rs @@ -59,12 +59,12 @@ impl Tokenizer { prefecture_name: None, city_name: None, town_name: None, - rest: "".to_string(), + rest: input.to_string(), _state: PhantomData, } } - pub(crate) fn read_prefecture(self) -> Result, Tokenizer> { + pub(crate) fn read_prefecture(&self) -> Result, Tokenizer> { for prefecture_name in PREFECTURE_NAME_LIST { if self.input.starts_with(prefecture_name) { return Ok(Tokenizer { @@ -86,7 +86,7 @@ impl Tokenizer { prefecture_name: None, city_name: None, town_name: None, - rest: self.input.clone(), + rest: self.rest.clone(), _state: PhantomData::, }) } @@ -103,7 
+103,7 @@ mod tests { assert_eq!(tokenizer.prefecture_name, None); assert_eq!(tokenizer.city_name, None); assert_eq!(tokenizer.town_name, None); - assert_eq!(tokenizer.rest, ""); + assert_eq!(tokenizer.rest, "東京都港区芝公園4丁目2-8"); } #[test] diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index 65e69048..8d4e385c 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -11,7 +11,7 @@ use crate::tokenizer::{CityNameFound, End, Tokenizer, TownNameFound}; impl Tokenizer { pub(crate) fn read_town( - self, + &self, candidates: Vec, ) -> Result, Tokenizer> { let mut rest = FullwidthCharacterFilter {}.apply(self.rest.clone()); @@ -20,9 +20,9 @@ impl Tokenizer { } if let Some(result) = find_town(&rest, &candidates) { return Ok(Tokenizer { - input: self.input, - prefecture_name: self.prefecture_name, - city_name: self.city_name, + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: self.city_name.clone(), town_name: Some(result.1), rest: result.0, _state: PhantomData::, @@ -32,9 +32,9 @@ impl Tokenizer { rest = InvalidTownNameFormatFilter {}.apply(rest); if let Some(result) = find_town(&rest, &candidates) { return Ok(Tokenizer { - input: self.input, - prefecture_name: self.prefecture_name, - city_name: self.city_name, + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: self.city_name.clone(), town_name: Some(result.1), rest: result.0, _state: PhantomData::, @@ -43,20 +43,20 @@ impl Tokenizer { // ここまでで町名の検出に成功しない場合は、「大字」の省略の可能性を検討する if let Some(result) = find_town(&format!("大字{}", rest), &candidates) { return Ok(Tokenizer { - input: self.input, - prefecture_name: self.prefecture_name, - city_name: self.city_name, + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: self.city_name.clone(), town_name: Some(result.1), rest: result.0, _state: PhantomData::, }); } Err(Tokenizer { - input: self.input, - prefecture_name: self.prefecture_name, - city_name: self.city_name, + input: self.input.clone(), + prefecture_name: self.prefecture_name.clone(), + city_name: self.city_name.clone(), town_name: None, - rest: self.rest, + rest: self.rest.clone(), _state: PhantomData::, }) } From 7290dbd3c94a84b84560b17a3e59a9eb1e8048bc Mon Sep 17 00:00:00 2001 From: Yuuki Toriyama Date: Sat, 13 Jul 2024 11:28:22 +0900 Subject: [PATCH 12/12] =?UTF-8?q?remove:=20core=E3=83=A2=E3=82=B8=E3=83=A5?= =?UTF-8?q?=E3=83=BC=E3=83=AB=E3=81=AE=E3=83=AA=E3=83=95=E3=82=A1=E3=82=AF?= =?UTF-8?q?=E3=82=BF:=20`test-case`=E3=82=AF=E3=83=AC=E3=83=BC=E3=83=88?= =?UTF-8?q?=E3=81=B8=E3=81=AE=E4=BE=9D=E5=AD=98=E3=82=92=E5=89=8A=E9=99=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #342 での除却漏れ --- core/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/core/Cargo.toml b/core/Cargo.toml index cd8eca9f..59ae7dec 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -28,6 +28,5 @@ reqwest = { version = "0.12.3", default-features = false, features = ["json", "r serde = { version = "1.0.192", features = ["derive"] } [dev-dependencies] -test-case = "3.3.1" tokio = { version = "1.38.0", features = ["rt", "macros"] } wasm-bindgen-test = { workspace = true }
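
For orientation, the flow that patches 04 through 11 introduce can be pictured as below. This is only a minimal sketch, assuming it lives inside the core crate (Tokenizer and its read_* methods are pub(crate)); the candidate lists passed to read_city and read_town are hypothetical stand-ins for the prefecture/city master data that core/src/parser.rs actually fetches through the API.

    use crate::tokenizer::Tokenizer;

    fn tokenize_sketch() {
        // Start in the Init typestate; after patch 11, `rest` initially holds the whole input.
        let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8");
        // Each step either advances to the next typestate (Ok) or yields a
        // Tokenizer<End> carrying whatever was recognised so far (Err).
        let Ok(tokenizer) = tokenizer.read_prefecture() else {
            return; // no prefecture name at the head of the input
        };
        // Hypothetical candidate list; normally prefecture.cities from the API.
        let Ok(tokenizer) = tokenizer.read_city(vec!["港区".to_string()]) else {
            return;
        };
        // Hypothetical candidate list; normally the town names of the city master.
        // "4丁目" is expected to be normalised to "四丁目" by NonKanjiBlockNumberFilter
        // before matching, as in the read_town tests of patch 07.
        let Ok(tokenizer) = tokenizer.read_town(vec!["芝公園四丁目".to_string()]) else {
            return;
        };
        println!(
            "{:?} {:?} {:?} rest={}",
            tokenizer.prefecture_name, tokenizer.city_name, tokenizer.town_name, tokenizer.rest
        );
    }

The PhantomData<State> marker is zero-sized, so the typestate carries no runtime cost; its only effect is that read_prefecture, read_city and read_town cannot be called out of order at compile time, which is what lets core/src/parser.rs drive the steps with plain let-else chains as in patch 11.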