diff --git a/.github/workflows/code-quality-check.yaml b/.github/workflows/code-quality-check.yaml index 36fea575..679c076d 100644 --- a/.github/workflows/code-quality-check.yaml +++ b/.github/workflows/code-quality-check.yaml @@ -25,10 +25,3 @@ jobs: reporter: 'github-pr-review' filter_mode: 'nofilter' github_token: ${{ secrets.GITHUB_TOKEN }} - - name: Run benchmark - uses: boa-dev/criterion-compare-action@v3 - with: - token: ${{ secrets.GITHUB_TOKEN }} - branchName: ${{ github.base_ref }} - cwd: 'core' - benchName: 'core_benchmark' diff --git a/.github/workflows/run-test.yaml b/.github/workflows/run-test.yaml index 63827a74..f8f698ab 100644 --- a/.github/workflows/run-test.yaml +++ b/.github/workflows/run-test.yaml @@ -49,3 +49,16 @@ jobs: - name: Build check for wasm crate working-directory: wasm run: wasm-pack build --target web --scope toriyama + + msrv: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install minimum supported version + uses: dtolnay/rust-toolchain@master + with: + toolchain: 1.75.0 + - name: Basic build + run: cargo build --verbose + - name: Build docs + run: cargo doc --verbose diff --git a/.github/workflows/upload-npmjs.yaml b/.github/workflows/upload-npmjs.yaml index 3193f235..2511189e 100644 --- a/.github/workflows/upload-npmjs.yaml +++ b/.github/workflows/upload-npmjs.yaml @@ -9,6 +9,8 @@ jobs: publish: runs-on: ubuntu-latest environment: npmjs + permissions: + id-token: write defaults: run: working-directory: wasm @@ -48,6 +50,6 @@ jobs: - name: Upload wasm to npmjs.com run: | cd pkg - npm publish --access public + npm publish --provenance --access public env: NODE_AUTH_TOKEN: ${{ secrets.NPMJS_REGISTRY_TOKEN }} \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 936e1c3f..59c3baa9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,7 +8,7 @@ members = [ resolver = "2" [workspace.package] -version = "0.1.23" +version = "0.1.24" edition = "2021" description = "A Rust Library to parse japanese 
addresses." repository = "https://github.com/YuukiToriyama/japanese-address-parser" diff --git a/README.md b/README.md index 9a28bdf1..cbd0f7eb 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ [![Docs](https://docs.rs/japanese-address-parser/badge.svg)](https://docs.rs/japanese-address-parser) [![Crates.io (latest)](https://img.shields.io/crates/v/japanese-address-parser)](https://crates.io/crates/japanese-address-parser) -![Rust Version](https://img.shields.io/badge/rust%20version-%3E%3D1.73.0-orange) +![Rust Version](https://img.shields.io/badge/rust%20version-%3E%3D1.75.0-orange) [![Unit test & Integration test](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml/badge.svg?branch=main)](https://github.com/YuukiToriyama/japanese-address-parser/actions/workflows/run-test.yaml) A Rust library for parsing Japanese addresses. @@ -48,20 +48,7 @@ fn main() { This crate is designed to be buildable for `wasm32-unknown-unknown` with `wasm-pack`. Pre-compiled wasm module is available on npmjs.com -```bash -npm install @toriyama/japanese-address-parser -``` - -```javascript -import init, {Parser} from "@toriyama/japanese-address-parser" - -init().then(() => { - const parser = new Parser() - parser.parse("東京都千代田区丸の内1-1-1").then(parseResult => { - console.log(JSON.stringify(parseResult, null, "\t")) - }) -}) -``` +You can run this crate on your browser. For more details, see [wasm module's README](wasm/README.md). 
## Python support(experimental) diff --git a/core/Cargo.toml b/core/Cargo.toml index 5d3466bb..0cf422a0 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -9,11 +9,10 @@ license.workspace = true readme = "../README.md" keywords.workspace = true categories.workspace = true -rust-version = "1.73.0" +rust-version = "1.75.0" [lib] crate-type = ["rlib", "cdylib"] -bench = false [features] default = ["city-name-correction"] @@ -23,12 +22,8 @@ format-house-number = [] eliminate-whitespaces = [] experimental = [] -[[bench]] -name = "core_benchmark" -harness = false - [dependencies] -itertools = "0.13.0" +itertools = "0.13.0" # 互換性のために残してあるが、`core::parser::adapter`を削除する際に忘れずに削除する log.workspace = true rapidfuzz = "0.5.0" regex = { version = "1.10.6", default-features = false, features = ["std", "unicode-perl"] } diff --git a/core/benches/core_benchmark.rs b/core/benches/core_benchmark.rs deleted file mode 100644 index ff5fb974..00000000 --- a/core/benches/core_benchmark.rs +++ /dev/null @@ -1,7 +0,0 @@ -mod orthographical_variant_adapter; - -use crate::orthographical_variant_adapter::bench_orthographical_variant_adapter; -use criterion::{criterion_group, criterion_main}; - -criterion_group!(benches, bench_orthographical_variant_adapter); -criterion_main!(benches); diff --git a/core/benches/orthographical_variant_adapter.rs b/core/benches/orthographical_variant_adapter.rs deleted file mode 100644 index 97f4fa26..00000000 --- a/core/benches/orthographical_variant_adapter.rs +++ /dev/null @@ -1,48 +0,0 @@ -use criterion::measurement::WallTime; -use criterion::{BatchSize, BenchmarkGroup, BenchmarkId, Criterion}; -use japanese_address_parser::parser::adapter::orthographical_variant_adapter::{ - OrthographicalVariantAdapter, OrthographicalVariants, Variant, -}; - -pub fn bench_orthographical_variant_adapter(c: &mut Criterion) { - let mut group = c.benchmark_group("orthographical_variant_adapter"); - add_tests( - &mut group, - TestSuite { - expected: "松ケ崎東池ノ内町", - inputs: vec![ 
- "松が崎東池ノ内町", - "松ヶ崎東池ノ内町", - "松ケ﨑東池ノ内町", - "松ケ﨑東池の内町", - "松ガ﨑東池の内町", - ], - variants_to_be_used: vec![Variant::ケ, Variant::崎, Variant::の], - }, - ); - group.finish(); -} - -fn add_tests(group: &mut BenchmarkGroup, test_suite: TestSuite) { - for input in test_suite.inputs { - let benchmark_id = BenchmarkId::new(test_suite.expected, input); - group.bench_with_input(benchmark_id, input, |b, input| { - b.iter_batched( - || OrthographicalVariantAdapter { - variant_list: test_suite.variants_to_be_used.clone(), - }, - |adapter| { - let (region_name, _) = adapter.apply(input, test_suite.expected).unwrap(); - assert_eq!(region_name, test_suite.expected); - }, - BatchSize::SmallInput, - ) - }); - } -} - -struct TestSuite { - expected: &'static str, - inputs: Vec<&'static str>, - variants_to_be_used: Vec, -} diff --git a/core/src/adapter.rs b/core/src/adapter.rs new file mode 100644 index 00000000..f606829f --- /dev/null +++ b/core/src/adapter.rs @@ -0,0 +1 @@ +pub mod orthographical_variant_adapter; diff --git a/core/src/adapter/orthographical_variant_adapter.rs b/core/src/adapter/orthographical_variant_adapter.rs new file mode 100644 index 00000000..7fba398a --- /dev/null +++ b/core/src/adapter/orthographical_variant_adapter.rs @@ -0,0 +1,160 @@ +#[derive(Clone)] +pub enum OrthographicalVariant { + の, + ツ, + ケ, + 薮, + 崎, + 檜, + 龍, + 竈, + 嶋, + 舘, + 脊, + 渕, + 己, + 槇, + 治, + 佛, + 澤, + 塚, + 恵, + 穂, + 梼, + 蛍, + 與, + 瀧, + 籠, + 濱, + 祗, + 曾, +} + +impl OrthographicalVariant { + fn value(&self) -> &[char] { + match self { + OrthographicalVariant::の => &['の', 'ノ', '之'], + OrthographicalVariant::ツ => &['ツ', 'ッ'], + OrthographicalVariant::ケ => &['ケ', 'ヶ', 'が', 'ガ'], + OrthographicalVariant::薮 => &['薮', '藪', '籔'], + OrthographicalVariant::崎 => &['崎', '﨑'], + OrthographicalVariant::檜 => &['桧', '檜'], + OrthographicalVariant::龍 => &['龍', '竜'], + OrthographicalVariant::竈 => &['竈', '竃', '釜'], + OrthographicalVariant::嶋 => &['嶋', '島'], + OrthographicalVariant::舘 => &['舘', '館'], + 
OrthographicalVariant::脊 => &['脊', '背'], + OrthographicalVariant::渕 => &['渕', '淵'], + OrthographicalVariant::己 => &['己', '巳'], + OrthographicalVariant::槇 => &['槇', '槙'], + OrthographicalVariant::治 => &['治', '冶'], + OrthographicalVariant::佛 => &['佛', '仏'], + OrthographicalVariant::澤 => &['澤', '沢'], + OrthographicalVariant::塚 => &['塚', '塚'], + OrthographicalVariant::恵 => &['恵', '惠'], + OrthographicalVariant::穂 => &['穂', '穗'], + OrthographicalVariant::梼 => &['梼', '檮'], + OrthographicalVariant::蛍 => &['蛍', '螢'], + OrthographicalVariant::與 => &['與', '与'], + OrthographicalVariant::瀧 => &['瀧', '滝'], + OrthographicalVariant::籠 => &['籠', '篭'], + OrthographicalVariant::濱 => &['濱', '浜'], + OrthographicalVariant::祗 => &['祗', '祇'], + OrthographicalVariant::曾 => &['曾', '曽'], + } + } + + fn permutations(&self) -> Vec<(char, char)> { + let characters = self.value(); + let mut permutations: Vec<(char, char)> = vec![]; + for n in 0..characters.len() { + for m in 0..characters.len() { + if n != m { + permutations.push((characters[n], characters[m])); + } + } + } + permutations + } +} + +pub struct OrthographicalVariantAdapter { + pub variant_list: Vec, +} + +impl OrthographicalVariantAdapter { + pub fn apply(self, input: &str, region_name: &str) -> Option<(String, String)> { + // 必要なパターンのみを選別する + let variant_list: Vec<&OrthographicalVariant> = self + .variant_list + .iter() + .filter(|v| v.value().iter().any(|&c| input.contains(c))) + .collect(); + if variant_list.is_empty() { + return None; + } + + // マッチ候補を容れておくためのVector + let mut candidates: Vec = vec![region_name.to_string()]; + // パターンを一つづつ検証していく + for variant in variant_list { + let mut semi_candidates: Vec = vec![]; + // variantから順列を作成 + // ["ケ", "ヶ", "が"] -> (ケ, ヶ), (ケ, が), (ヶ, ケ), (ヶ, が), (が, ケ), (が, ヶ) + for (a, b) in variant.permutations() { + for candidate in candidates.iter().filter(|x| x.contains(a)) { + let modified_candidate = modify_specific_character(candidate, a, b); + if input.starts_with(&modified_candidate) { + 
// マッチすれば早期リターン + return Some(( + region_name.to_string(), + input + .chars() + .skip(modified_candidate.chars().count()) + .collect(), + )); + } else { + // マッチしなければsemi_candidatesに置き換え後の文字列をpush + semi_candidates.push(modified_candidate); + } + } + } + candidates = semi_candidates; + candidates.push(region_name.to_string()); + } + None + } +} + +fn modify_specific_character(text: &str, from: char, to: char) -> String { + text.chars() + .map(|x| if x == from { to } else { x }) + .collect() +} + +#[cfg(test)] +mod tests { + use crate::adapter::orthographical_variant_adapter::OrthographicalVariant; + + #[test] + fn permutations() { + let variant = OrthographicalVariant::ケ; + assert_eq!( + variant.permutations(), + vec![ + ('ケ', 'ヶ'), + ('ケ', 'が'), + ('ケ', 'ガ'), + ('ヶ', 'ケ'), + ('ヶ', 'が'), + ('ヶ', 'ガ'), + ('が', 'ケ'), + ('が', 'ヶ'), + ('が', 'ガ'), + ('ガ', 'ケ'), + ('ガ', 'ヶ'), + ('ガ', 'が'), + ] + ); + } +} diff --git a/core/src/lib.rs b/core/src/lib.rs index 8c1a35cf..73b3643a 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -13,6 +13,7 @@ compile_error! { "The `blocking` feature is not supported with wasm target." 
} +mod adapter; #[deprecated(since = "0.1.23", note = "This module will be deleted in v0.2")] pub mod api; pub(crate) mod domain; diff --git a/core/src/parser.rs b/core/src/parser.rs index 0e8730d8..7173de49 100644 --- a/core/src/parser.rs +++ b/core/src/parser.rs @@ -11,6 +11,7 @@ use crate::interactor::geolonia::{GeoloniaInteractor, GeoloniaInteractorImpl}; use crate::tokenizer::{End, Tokenizer}; use serde::Serialize; +#[deprecated(since = "0.1.24", note = "This module will be deleted in v0.2")] pub mod adapter; impl From> for Address { diff --git a/core/src/tokenizer/read_city.rs b/core/src/tokenizer/read_city.rs index e0e58f53..0233f777 100644 --- a/core/src/tokenizer/read_city.rs +++ b/core/src/tokenizer/read_city.rs @@ -1,7 +1,7 @@ -use crate::domain::common::token::{append_token, Token}; -use crate::parser::adapter::orthographical_variant_adapter::{ - OrthographicalVariantAdapter, OrthographicalVariants, Variant, +use crate::adapter::orthographical_variant_adapter::{ + OrthographicalVariant, OrthographicalVariantAdapter, }; +use crate::domain::common::token::{append_token, Token}; use crate::tokenizer::{CityNameFound, CityNameNotFound, PrefectureNameFound, Tokenizer}; use std::marker::PhantomData; @@ -29,29 +29,29 @@ impl Tokenizer { } // ここまでで市区町村名が読み取れない場合は、表記ゆれを含む可能性を検討する - let mut variant_list = vec![Variant::ケ]; + let mut variant_list = vec![OrthographicalVariant::ケ]; match self.get_prefecture_name() { Some("青森県") => { - variant_list.push(Variant::舘); + variant_list.push(OrthographicalVariant::舘); } Some("宮城県") => { - variant_list.push(Variant::竈); + variant_list.push(OrthographicalVariant::竈); } Some("茨城県") => { - variant_list.push(Variant::龍); - variant_list.push(Variant::嶋); + variant_list.push(OrthographicalVariant::龍); + variant_list.push(OrthographicalVariant::嶋); } Some("東京都") => { - variant_list.push(Variant::檜); + variant_list.push(OrthographicalVariant::檜); } Some("兵庫県") => { - variant_list.push(Variant::塚); + 
variant_list.push(OrthographicalVariant::塚); } Some("高知県") => { - variant_list.push(Variant::梼); + variant_list.push(OrthographicalVariant::梼); } Some("福岡県") => { - variant_list.push(Variant::恵); + variant_list.push(OrthographicalVariant::恵); } _ => {} } diff --git a/core/src/tokenizer/read_town.rs b/core/src/tokenizer/read_town.rs index b8abeef7..7474f747 100644 --- a/core/src/tokenizer/read_town.rs +++ b/core/src/tokenizer/read_town.rs @@ -1,11 +1,11 @@ +use crate::adapter::orthographical_variant_adapter::{ + OrthographicalVariant, OrthographicalVariantAdapter, +}; use crate::domain::common::token::{append_token, Token}; use crate::formatter::chome_with_arabic_numerals::format_chome_with_arabic_numerals; use crate::formatter::fullwidth_character::format_fullwidth_number; use crate::formatter::house_number::format_house_number; use crate::formatter::informal_town_name_notation::format_informal_town_name_notation; -use crate::parser::adapter::orthographical_variant_adapter::{ - OrthographicalVariantAdapter, OrthographicalVariants, Variant, -}; use crate::tokenizer::{CityNameFound, End, Tokenizer, TownNameFound}; use std::marker::PhantomData; @@ -65,31 +65,31 @@ fn find_town(input: &str, candidates: &Vec) -> Option<(String, String)> } let adapter = OrthographicalVariantAdapter { variant_list: vec![ - Variant::の, - Variant::ツ, - Variant::ケ, - Variant::薮, - Variant::崎, - Variant::檜, - Variant::竈, - Variant::舘, - Variant::脊, - Variant::渕, - Variant::己, - Variant::槇, - Variant::治, - Variant::佛, - Variant::澤, - Variant::恵, - Variant::穂, - Variant::梼, - Variant::蛍, - Variant::與, - Variant::瀧, - Variant::籠, - Variant::濱, - Variant::祗, - Variant::曾, + OrthographicalVariant::の, + OrthographicalVariant::ツ, + OrthographicalVariant::ケ, + OrthographicalVariant::薮, + OrthographicalVariant::崎, + OrthographicalVariant::檜, + OrthographicalVariant::竈, + OrthographicalVariant::舘, + OrthographicalVariant::脊, + OrthographicalVariant::渕, + OrthographicalVariant::己, + 
OrthographicalVariant::槇, + OrthographicalVariant::治, + OrthographicalVariant::佛, + OrthographicalVariant::澤, + OrthographicalVariant::恵, + OrthographicalVariant::穂, + OrthographicalVariant::梼, + OrthographicalVariant::蛍, + OrthographicalVariant::與, + OrthographicalVariant::瀧, + OrthographicalVariant::籠, + OrthographicalVariant::濱, + OrthographicalVariant::祗, + OrthographicalVariant::曾, ], }; if let Some(result) = adapter.apply(input, candidate) { diff --git a/wasm/Cargo.toml b/wasm/Cargo.toml index 09791b79..59db0218 100644 --- a/wasm/Cargo.toml +++ b/wasm/Cargo.toml @@ -6,7 +6,7 @@ description.workspace = true repository.workspace = true authors.workspace = true license.workspace = true -readme = "../README.md" +readme = "README.md" keywords.workspace = true categories.workspace = true diff --git a/wasm/README.md b/wasm/README.md new file mode 100644 index 00000000..b1d1c954 --- /dev/null +++ b/wasm/README.md @@ -0,0 +1,89 @@ +# @toriyama/japanese-address-parser + +A Library for processing addresses of Japan written in Rust. + +[![npmjs](https://img.shields.io/npm/v/%40toriyama/japanese-address-parser)](https://www.npmjs.com/package/@toriyama/japanese-address-parser) +[![install size](https://packagephobia.com/badge?p=@toriyama/japanese-address-parser)](https://packagephobia.com/result?p=@toriyama/japanese-address-parser) +[![downloads](https://img.shields.io/npm/dm/@toriyama/japanese-address-parser.svg)](https://npmcharts.com/compare/@toriyama/japanese-address-parser?minimal=true) + +## Install + +Install with npm: + +```bash +npm install @toriyama/japanese-address-parser +``` + +Install with yarn: + +```bash +yarn add @toriyama/japanese-address-parser +``` + +## Introduction + +`@toriyama/japanese-address-parser` is a library for parsing Japanese addresses. +You can split an address string into prefectures(都道府県), municipalities(市区町村), towns and villages(町村), +and each subsequent element. 
+ +This library is a JavaScript binding for [`japanese-address-parser`](https://crates.io/crates/japanese-address-parser) +crate written in Rust, built using wasm-pack. +Node.js is not yet supported. If you are eager to use this library on Node.js, +please comment on [#128](https://github.com/YuukiToriyama/japanese-address-parser/issues/128) or open a pull request! + +## Demo + +You can try it out on the demo pages below. + +- https://yuukitoriyama.github.io/japanese-address-parser/public/index.html +- https://yuukitoriyama.github.io/japanese-address-parser/public/nightly.html (includes experimental features) + +## Example + +```javascript +import init, {Parser} from "@toriyama/japanese-address-parser" + +init().then(() => { + const parser = new Parser() + parser.parse("東京都千代田区丸ノ内1-1-1").then(parseResult => { + console.log(JSON.stringify(parseResult, null, "\t")) + }) +}) +``` + +```json +{ + "address": { + "prefecture": "東京都", + "city": "千代田区", + "town": "丸の内一丁目", + "rest": "1-1" + } +} +``` + +## How it works + +The input string is basically read in order from the beginning to the end. +Once the prefecture name has been read, the city name is read next, then the town name, and so on. +This library does not bundle the lists of city names or town names; it fetches them via the internet each time. +Version 0.1 uses [Geolonia住所データ](https://github.com/geolonia/japanese-addresses) +authored by [Geolonia Inc](https://www.geolonia.com/company/). +Detecting place names may fail in some cases, such as when the spelling contains orthographic variants or when county names are +omitted. In such cases, this library tries a fuzzy match instead of an exact match. + +For more details, please visit [our repository](https://github.com/YuukiToriyama/japanese-address-parser). 
+ +## Contributing + +If you want to contribute to this library, please read +the [contribution guide](https://github.com/YuukiToriyama/japanese-address-parser/blob/main/CONTRIBUTING.md) to learn +how to propose bug fixes and improvements. + +## License + +This library is distributed under the terms of the MIT license. + +## Related projects + +- [@geolonia/normalize-japanese-addresses](https://www.npmjs.com/package/@geolonia/normalize-japanese-addresses)