Skip to content

Commit

Permalink
Merge pull request #404 from YuukiToriyama/release/v0.1.13
Browse files Browse the repository at this point in the history
release/v0.1.13をmainブランチにマージ
  • Loading branch information
YuukiToriyama authored Sep 5, 2024
2 parents 4c7ae80 + ccde5a6 commit 403cd4b
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 16 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ members = [
resolver = "2"

[workspace.package]
version = "0.1.12"
version = "0.1.13"
edition = "2021"
description = "A Rust Library to parse japanese addresses."
repository = "https://github.com/YuukiToriyama/japanese-address-parser"
Expand Down
2 changes: 0 additions & 2 deletions core/src/parser/adapter/orthographical_variant_adapter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ pub trait OrthographicalVariants {
const: Variant;
const: Variant;
const: Variant;
const: Variant;
const: Variant;
const: Variant;
const: Variant;
Expand Down Expand Up @@ -53,7 +52,6 @@ impl OrthographicalVariants for Variant {
const: Variant = &["恵", "惠"];
const: Variant = &["穂", "穗"];
const: Variant = &["梼", "檮"];
const: Variant = &["葛󠄀", "葛"];
const: Variant = &["蛍", "螢"];
const: Variant = &["與", "与"];
const: Variant = &["瀧", "滝"];
Expand Down
4 changes: 0 additions & 4 deletions core/src/tokenizer/read_city.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,14 +39,10 @@ impl Tokenizer<PrefectureNameFound> {
}
"東京都" => {
variant_list.push(Variant::);
variant_list.push(Variant::);
}
"兵庫県" => {
variant_list.push(Variant::);
}
"奈良県" => {
variant_list.push(Variant::);
}
"高知県" => {
variant_list.push(Variant::);
}
Expand Down
17 changes: 14 additions & 3 deletions core/src/tokenizer/read_prefecture.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::marker::PhantomData;

use crate::tokenizer::{End, Init, PrefectureNameFound, Tokenizer};
use crate::util::extension::StrExt;

const PREFECTURE_NAME_LIST: [&str; 47] = [
"北海道",
Expand Down Expand Up @@ -59,21 +60,21 @@ impl Tokenizer<Init> {
prefecture_name: None,
city_name: None,
town_name: None,
rest: input.to_string(),
rest: input.strip_variation_selectors(),
_state: PhantomData,
}
}

pub(crate) fn read_prefecture(&self) -> Result<Tokenizer<PrefectureNameFound>, Tokenizer<End>> {
for prefecture_name in PREFECTURE_NAME_LIST {
if self.input.starts_with(prefecture_name) {
if self.rest.starts_with(prefecture_name) {
return Ok(Tokenizer {
input: self.input.clone(),
prefecture_name: Some(prefecture_name.to_string()),
city_name: None,
town_name: None,
rest: self
.input
.rest
.chars()
.skip(prefecture_name.chars().count())
.collect::<String>(),
Expand Down Expand Up @@ -106,6 +107,16 @@ mod tests {
assert_eq!(tokenizer.rest, "東京都港区芝公園4丁目2-8");
}

// Checks the state produced by `Tokenizer::new` for an address that contains
// the variation selector U+E0100: `input` keeps the text exactly as given,
// the prefecture/city/town fields are still `None`, and `rest` holds the same
// text with the selector removed.
#[test]
fn new_異字体セレクタ除去() {
let tokenizer = Tokenizer::new("東京都葛\u{E0100}飾区立石5-13-1");
// The raw input is preserved, selector included.
assert_eq!(tokenizer.input, "東京都葛\u{E0100}飾区立石5-13-1");
assert_eq!(tokenizer.prefecture_name, None);
assert_eq!(tokenizer.city_name, None);
assert_eq!(tokenizer.town_name, None);
// `rest` no longer contains the selector.
assert_eq!(tokenizer.rest, "東京都葛飾区立石5-13-1")
}

#[test]
fn read_prefecture_成功() {
let tokenizer = Tokenizer::new("東京都港区芝公園4丁目2-8");
Expand Down
1 change: 1 addition & 0 deletions core/src/util.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
pub mod converter;
pub(crate) mod extension;
pub mod sequence_matcher;
mod trimmer;
62 changes: 62 additions & 0 deletions core/src/util/extension.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
/// Extension methods for `char`.
pub(crate) trait CharExt {
    /// Returns `true` when the character is a variation selector.
    fn is_variation_selector(&self) -> bool;
}

impl CharExt for char {
    /// Reports whether this character is a variation selector, i.e. whether it
    /// falls in U+FE00..=U+FE0F or in U+E0100..=U+E01EF.
    fn is_variation_selector(&self) -> bool {
        // U+FE00..=U+FE0F
        let in_basic_range = ('\u{FE00}'..='\u{FE0F}').contains(self);
        // U+E0100..=U+E01EF
        let in_supplement_range = ('\u{E0100}'..='\u{E01EF}').contains(self);
        in_basic_range || in_supplement_range
    }
}

pub(crate) trait StrExt {
fn strip_variation_selectors(&self) -> String;
}

impl StrExt for str {
/// 文字列から異字体セレクタを取り除きます
fn strip_variation_selectors(&self) -> String {
self.chars()
.filter(|c| !c.is_variation_selector())
.collect()
}
}

#[cfg(test)]
mod tests {
    use crate::util::extension::{CharExt, StrExt};

    /// Probes both selector ranges at and just outside each boundary.
    #[test]
    fn is_variation_selector() {
        // Ordinary kana and kanji are not selectors.
        assert!(!'あ'.is_variation_selector());
        assert!(!'亜'.is_variation_selector());

        // Lower edge of U+FE00..=U+FE0F.
        assert!(!'\u{FDFF}'.is_variation_selector());
        assert!('\u{FE00}'.is_variation_selector());

        // Upper edge of U+FE00..=U+FE0F.
        assert!('\u{FE0F}'.is_variation_selector());
        assert!(!'\u{FE10}'.is_variation_selector());

        // Lower edge of U+E0100..=U+E01EF.
        assert!(!'\u{E00FF}'.is_variation_selector());
        assert!('\u{E0100}'.is_variation_selector());

        // Upper edge of U+E0100..=U+E01EF.
        assert!('\u{E01EF}'.is_variation_selector());
        assert!(!'\u{E01F0}'.is_variation_selector());
    }

    /// A selector between two kanji is removed, leaving the plain spelling.
    #[test]
    fn strip_variation_selectors_逢坂() {
        let normal = "\u{9022}\u{5742}"; // "逢坂" without a selector
        let variant = "\u{9022}\u{E0101}\u{5742}"; // same, with U+E0101 after the first character
        assert_ne!(normal, variant);
        assert_eq!(normal, variant.strip_variation_selectors());
    }

    /// Same check with a different selector (U+E0100) and a different word.
    #[test]
    fn strip_variation_selectors_茨城() {
        let normal = "\u{8328}\u{57CE}"; // "茨城" without a selector
        let variant = "\u{8328}\u{E0100}\u{57CE}"; // same, with U+E0100 after the first character
        assert_ne!(normal, variant);
        assert_eq!(normal, variant.strip_variation_selectors());
    }
}
5 changes: 5 additions & 0 deletions tests/integration_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,8 @@ async fn 郡が省略されている場合への対応テスト() {
async fn 郡名と町名が一致している場合() {
run_data_driven_tests("./test_data/郡名と町名が一致している場合.csv").await
}

// Runs the data-driven cases listed in
// `./test_data/異字体セレクタを含む場合への対応.csv` through `run_data_driven_tests`.
#[tokio::test]
async fn 異字体セレクタを含む場合への対応() {
run_data_driven_tests("./test_data/異字体セレクタを含む場合への対応.csv").await
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,8 @@ address,prefecture,city,town,rest
# 茨城県
茨城県鹿嶋市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1
茨城県鹿島市大字平井1187-1,茨城県,鹿嶋市,大字平井,1187-1
# 東京都
東京都葛飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
東京都葛󠄀飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
# 兵庫県
兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19
兵庫県宝塚市売布東の町8-19,兵庫県,宝塚市,売布東の町,8-19
兵庫県宝塚市武庫川町1-1,兵庫県,宝塚市,武庫川町,1-1
兵庫県宝塚市武庫川町1-1,兵庫県,宝塚市,武庫川町,1-1
# 奈良県
奈良県葛󠄀城市柿本166番地,奈良県,葛城市,柿本,166番地
奈良県葛城市柿本166番地,奈良県,葛城市,柿本,166番地
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
address,prefecture,city,town,rest
東京都葛飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
東京都葛󠄀飾区立石5-13-1,東京都,葛飾区,立石五丁目,13-1
奈良県葛城市柿本166番地,奈良県,葛城市,柿本,166番地
奈良県葛󠄀城市柿本166番地,奈良県,葛城市,柿本,166番地
鹿児島県薩摩川内市上甑町中甑250-1,鹿児島県,薩摩川内市,上甑町中甑,250-1
鹿児島県薩摩川内市上甑󠄀町中甑󠄀250-1,鹿児島県,薩摩川内市,上甑町中甑,250-1

0 comments on commit 403cd4b

Please sign in to comment.