From 8e8239e1ead6a7966a55fa7bd42ddc9d2b906eeb Mon Sep 17 00:00:00 2001 From: Ben King <9087625+benfdking@users.noreply.github.com> Date: Fri, 20 Dec 2024 13:54:48 +0100 Subject: [PATCH] chore: introduce benchmarking for rust --- .github/workflows/rust-bench.yml | 12 + sqlglotrs/Cargo.lock | 546 +++++++++++++++++- sqlglotrs/Cargo.toml | 23 +- sqlglotrs/benches/dialect_settings.json | 1 + sqlglotrs/benches/long.rs | 74 +++ sqlglotrs/benches/token_type_settings.json | 1 + .../benches/tokenizer_dialect_settings.json | 1 + sqlglotrs/benches/tokenizer_settings.json | 1 + sqlglotrs/src/lib.rs | 93 +-- sqlglotrs/src/settings.rs | 65 ++- sqlglotrs/src/token.rs | 61 ++ sqlglotrs/src/tokenizer.rs | 37 +- 12 files changed, 809 insertions(+), 106 deletions(-) create mode 100644 .github/workflows/rust-bench.yml create mode 100644 sqlglotrs/benches/dialect_settings.json create mode 100644 sqlglotrs/benches/long.rs create mode 100644 sqlglotrs/benches/token_type_settings.json create mode 100644 sqlglotrs/benches/tokenizer_dialect_settings.json create mode 100644 sqlglotrs/benches/tokenizer_settings.json create mode 100644 sqlglotrs/src/token.rs diff --git a/.github/workflows/rust-bench.yml b/.github/workflows/rust-bench.yml new file mode 100644 index 0000000000..9706ba594b --- /dev/null +++ b/.github/workflows/rust-bench.yml @@ -0,0 +1,12 @@ +on: [pull_request] +name: benchmark pull requests +jobs: + runBenchmark: + name: run benchmark + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: boa-dev/criterion-compare-action@v3 + with: + branchName: ${{ github.base_ref }} + cwd: "sqlglotrs" diff --git a/sqlglotrs/Cargo.lock b/sqlglotrs/Cargo.lock index 9e506b5faf..533926ec2b 100644 --- a/sqlglotrs/Cargo.lock +++ b/sqlglotrs/Cargo.lock @@ -1,6 +1,27 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
-version = 3 +version = 4 + +[[package]] +name = "aho-corasick" +version = "1.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" +dependencies = [ + "memchr", +] + +[[package]] +name = "anes" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" + +[[package]] +name = "anstyle" +version = "1.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" [[package]] name = "autocfg" @@ -8,29 +29,230 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "bumpalo" +version = "3.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" + +[[package]] +name = "cast" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" + [[package]] name = "cfg-if" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ciborium" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" +dependencies = [ + "ciborium-io", + "ciborium-ll", + "serde", +] + +[[package]] +name = "ciborium-io" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" + +[[package]] +name = "ciborium-ll" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" +dependencies = [ + "ciborium-io", + "half", +] + +[[package]] +name = "clap" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3135e7ec2ef7b10c6ed8950f0f792ed96ee093fa088608f1c76e569722700c84" +dependencies = [ + "clap_builder", +] + +[[package]] +name = "clap_builder" +version = "4.5.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30582fc632330df2bd26877bde0c1f4470d57c582bbc070376afcd04d8cb4838" +dependencies = [ + "anstyle", + "clap_lex", +] + +[[package]] +name = "clap_lex" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" + +[[package]] +name = "criterion" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" +dependencies = [ + "anes", + "cast", + "ciborium", + "clap", + "criterion-plot", + "is-terminal", + "itertools", + "num-traits", + "once_cell", + "oorandom", + "plotters", + "rayon", + "regex", + "serde", + "serde_derive", + "serde_json", + "tinytemplate", + "walkdir", +] + +[[package]] +name = "criterion-plot" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" +dependencies = [ + "cast", + "itertools", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + +[[package]] +name = "either" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" + +[[package]] +name = "half" +version = "2.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" +dependencies = [ + "cfg-if", + "crunchy", +] + [[package]] name = "heck" version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" +[[package]] +name = "hermit-abi" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbf6a919d6cf397374f7dfeeea91d974c7c0a7221d0d0f4f20d859d329e53fcc" + [[package]] name = "indoc" version = "2.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e186cfbae8084e513daff4240b4797e342f988cecda4fb6c939150f96315fd8" +[[package]] +name = "is-terminal" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261f68e344040fbd0edea105bef17c66edf46f984ddb1115b775ce31be948f4b" +dependencies = [ + "hermit-abi", + "libc", + "windows-sys 0.52.0", +] + +[[package]] +name = "itertools" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 
+dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "js-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + [[package]] name = "libc" -version = "0.2.150" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "log" +version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "memoffset" @@ -41,12 +263,55 @@ dependencies = [ "autocfg", ] +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + [[package]] name = "once_cell" version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +[[package]] +name = "oorandom" +version = "11.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b410bbe7e14ab526a0e86877eb47c6996a2bd7746f027ba551028c925390e4e9" + +[[package]] +name = "plotters" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" +dependencies = [ + "num-traits", + "plotters-backend", + "plotters-svg", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "plotters-backend" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" + +[[package]] +name = "plotters-svg" +version = "0.3.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" +dependencies = [ + "plotters-backend", +] + [[package]] name = "portable-atomic" version = "1.9.0" @@ -134,11 +399,111 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "regex" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-syntax" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "serde" +version = "1.0.216" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b9781016e935a97e8beecf0c933758c97a5520d32930e460142b4cd80c6338e" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.216" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46f859dbbf73865c6627ed570e78961cd3ac92407a2d117204c49232485da55e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.133" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + [[package]] name = "sqlglotrs" version = "0.3.0" dependencies = [ + "criterion", "pyo3", + "serde", + "serde_json", + "sqlglotrs", ] [[package]] @@ -158,6 +523,16 @@ version = "0.12.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" +[[package]] +name = "tinytemplate" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "unicode-ident" version = "1.0.12" @@ -169,3 +544,168 @@ name = "unindent" version = "0.2.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "c7de7d73e1754487cb58364ee906a499937a0dfabd86bcb980fa99ec8c8fa2ce" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" +dependencies = [ + "cfg-if", + "once_cell", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" + +[[package]] +name = "web-sys" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" +dependencies = [ 
+ "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "winapi-util" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" +dependencies = [ + "windows-sys 0.59.0", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" diff --git a/sqlglotrs/Cargo.toml b/sqlglotrs/Cargo.toml index 47b4fa6207..f8d6d0f7da 100644 --- a/sqlglotrs/Cargo.toml +++ b/sqlglotrs/Cargo.toml @@ -6,7 +6,26 @@ license = "MIT" [lib] name = "sqlglotrs" -crate-type = ["cdylib"] +crate-type = ["cdylib", "rlib"] + +[[bench]] +name = "long" +harness = false + +[features] +# Enable this feature to use the serde and serde_json crates for profiling purposes +default = [] +profiling = ["serde", "serde_json"] [dependencies] -pyo3 = "0.22.6" +pyo3 = {version ="0.22.6", features = ["auto-initialize"]} + +# Optional dependencies used for profiling +serde = { version = "1", features = ["derive"] , optional = true } +serde_json = { version = "1", optional = true } + +[dev-dependencies] +criterion = "0.5" +serde = { version = "1", features = ["derive"] } +serde_json = { version = "1" } +sqlglotrs = { path = "." 
, features = ["profiling"] } diff --git a/sqlglotrs/benches/dialect_settings.json b/sqlglotrs/benches/dialect_settings.json new file mode 100644 index 0000000000..04c3f9950b --- /dev/null +++ b/sqlglotrs/benches/dialect_settings.json @@ -0,0 +1 @@ +{"unescaped_sequences":{},"identifiers_can_start_with_digit":false,"numbers_can_be_underscore_separated":false} \ No newline at end of file diff --git a/sqlglotrs/benches/long.rs b/sqlglotrs/benches/long.rs new file mode 100644 index 0000000000..347c1199bf --- /dev/null +++ b/sqlglotrs/benches/long.rs @@ -0,0 +1,74 @@ +use std::path::Path; + +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use sqlglotrs::settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings}; +use sqlglotrs::tokenizer::Tokenizer; + +pub const LONG: &str = r#" +SELECT + "e"."employee_id" AS "Employee #", + "e"."first_name" || ' ' || "e"."last_name" AS "Name", + "e"."email" AS "Email", + "e"."phone_number" AS "Phone", + TO_CHAR("e"."hire_date", 'MM/DD/YYYY') AS "Hire Date", + TO_CHAR("e"."salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') AS "Salary", + "e"."commission_pct" AS "Commission %", + 'works as ' || "j"."job_title" || ' in ' || "d"."department_name" || ' department (manager: ' || "dm"."first_name" || ' ' || "dm"."last_name" || ') and immediate supervisor: ' || "m"."first_name" || ' ' || "m"."last_name" AS "Current Job", + TO_CHAR("j"."min_salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') || ' - ' || TO_CHAR("j"."max_salary", 'L99G999D99', 'NLS_NUMERIC_CHARACTERS = ''.,'' NLS_CURRENCY = ''$''') AS "Current Salary", + "l"."street_address" || ', ' || "l"."postal_code" || ', ' || "l"."city" || ', ' || "l"."state_province" || ', ' || "c"."country_name" || ' (' || "r"."region_name" || ')' AS "Location", + "jh"."job_id" AS "History Job ID", + 'worked from ' || TO_CHAR("jh"."start_date", 'MM/DD/YYYY') || ' to ' || TO_CHAR("jh"."end_date", 'MM/DD/YYYY') 
|| ' as ' || "jj"."job_title" || ' in ' || "dd"."department_name" || ' department' AS "History Job Title", + case when 1 then 1 when 2 then 2 when 3 then 3 when 4 then 4 when 5 then 5 else a(b(c + 1 * 3 % 4)) end +FROM "employees" AS e +JOIN "jobs" AS j + ON "e"."job_id" = "j"."job_id" +LEFT JOIN "employees" AS m + ON "e"."manager_id" = "m"."employee_id" +LEFT JOIN "departments" AS d + ON "d"."department_id" = "e"."department_id" +LEFT JOIN "employees" AS dm + ON "d"."manager_id" = "dm"."employee_id" +LEFT JOIN "locations" AS l + ON "d"."location_id" = "l"."location_id" +LEFT JOIN "countries" AS c + ON "l"."country_id" = "c"."country_id" +LEFT JOIN "regions" AS r + ON "c"."region_id" = "r"."region_id" +LEFT JOIN "job_history" AS jh + ON "e"."employee_id" = "jh"."employee_id" +LEFT JOIN "jobs" AS jj + ON "jj"."job_id" = "jh"."job_id" +LEFT JOIN "departments" AS dd + ON "dd"."department_id" = "jh"."department_id" +ORDER BY + "e"."employee_id" +"#; + +fn long(c: &mut Criterion) { + // Read tokenizer settings + let path = Path::new(env!("CARGO_MANIFEST_DIR")).join("benches"); + let settings_file = std::fs::read_to_string(path.join("tokenizer_settings.json")).unwrap(); + let tokenizer_settings = serde_json::from_str::<TokenizerSettings>(&settings_file).unwrap(); + + let settings_type_file = + std::fs::read_to_string(path.join("token_type_settings.json")).unwrap(); + let settings_type_file = + serde_json::from_str::<TokenTypeSettings>(&settings_type_file).unwrap(); + + let dialect_settings = std::fs::read_to_string(path.join("dialect_settings.json")).unwrap(); + let dialect_settings = + serde_json::from_str::<TokenizerDialectSettings>(&dialect_settings).unwrap(); + let tokenizer = Tokenizer::new(tokenizer_settings, settings_type_file); + + c.bench_function("long", |b| { + b.iter(|| black_box(tokenizer.tokenize(LONG, &dialect_settings))); + }); +} + +criterion_group!
{ + name = benches; + config = Criterion::default(); + targets = long +} + +criterion_main!(benches); diff --git a/sqlglotrs/benches/token_type_settings.json b/sqlglotrs/benches/token_type_settings.json new file mode 100644 index 0000000000..580df9613a --- /dev/null +++ b/sqlglotrs/benches/token_type_settings.json @@ -0,0 +1 @@ +{"bit_string":67,"break_":55,"dcolon":11,"heredoc_string":72,"raw_string":71,"hex_string":68,"identifier":58,"number":57,"parameter":47,"semicolon":13,"string":56,"var":66,"heredoc_string_alternative":66,"hint":254} \ No newline at end of file diff --git a/sqlglotrs/benches/tokenizer_dialect_settings.json b/sqlglotrs/benches/tokenizer_dialect_settings.json new file mode 100644 index 0000000000..04c3f9950b --- /dev/null +++ b/sqlglotrs/benches/tokenizer_dialect_settings.json @@ -0,0 +1 @@ +{"unescaped_sequences":{},"identifiers_can_start_with_digit":false,"numbers_can_be_underscore_separated":false} \ No newline at end of file diff --git a/sqlglotrs/benches/tokenizer_settings.json b/sqlglotrs/benches/tokenizer_settings.json new file mode 100644 index 0000000000..47f04c7a48 --- /dev/null +++ b/sqlglotrs/benches/tokenizer_settings.json @@ -0,0 +1 @@ +{"white_space":{"\n":55,"\t":54,"\r":55," ":54},"single_tokens":{"\"":320,",":6,".":7,"[":2,"*":14,":":10,"]":3,"'":320,"(":0,")":1,"?":311,"-":8,"@":47,"$":46},"keywords":{"..":7},"numeric_literals":{},"identifiers":{"\"":"\""},"identifier_escapes":["\\"],"string_escapes":["\\"],"quotes":{"'":"'"},"format_strings":{"N'":["'",70],"n'":["'",70]},"has_bit_strings":false,"has_hex_strings":false,"comments":{"{#":"#}","--":null,"/*":"*/"},"var_single_tokens":[],"commands":[237,341,205,234,324],"command_prefix_tokens":[13,197],"tokens_preceding_hint":[261,334,221,361],"heredoc_tag_is_identifier":false,"string_escapes_allowed_in_raw_strings":true,"nested_comments":true,"hint_start":"/*+"} \ No newline at end of file diff --git a/sqlglotrs/src/lib.rs b/sqlglotrs/src/lib.rs index e60620a388..bb6caf6410 
100644 --- a/sqlglotrs/src/lib.rs +++ b/sqlglotrs/src/lib.rs @@ -1,90 +1,13 @@ use pyo3::prelude::*; -use pyo3::types::{PyList, PyNone, PyString}; +use pyo3::{pymodule, types::PyModule, Bound, PyResult}; +use settings::{TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings}; +use token::Token; +use tokenizer::Tokenizer; -mod settings; -mod tokenizer; -mod trie; - -pub use self::settings::{ - TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings, -}; -pub use self::tokenizer::Tokenizer; - -#[derive(Debug)] -#[pyclass] -pub struct Token { - #[pyo3(get, name = "token_type_index")] - pub token_type: TokenType, - #[pyo3(get, set, name = "token_type")] - pub token_type_py: PyObject, - #[pyo3(get)] - pub text: Py<PyString>, - #[pyo3(get)] - pub line: usize, - #[pyo3(get)] - pub col: usize, - #[pyo3(get)] - pub start: usize, - #[pyo3(get)] - pub end: usize, - #[pyo3(get)] - pub comments: Py<PyList>, -} - -impl Token { - pub fn new( - token_type: TokenType, - text: String, - line: usize, - col: usize, - start: usize, - end: usize, - comments: Vec<String>, - ) -> Token { - Python::with_gil(|py| Token { - token_type, - token_type_py: PyNone::get_bound(py).into_py(py), - text: PyString::new_bound(py, &text).into_py(py), - line, - col, - start, - end, - comments: PyList::new_bound(py, &comments).into(), - }) - } - - pub fn append_comments(&self, comments: &mut Vec<String>) { - Python::with_gil(|py| { - let pylist = self.comments.bind(py); - for comment in comments.iter() { - if let Err(_) = pylist.append(comment) { - panic!("Failed to append comments to the Python list"); - } - } - }); - // Simulate `Vec::append`.
- let _ = std::mem::replace(comments, Vec::new()); - } -} - -#[pymethods] -impl Token { - #[pyo3(name = "__repr__")] - fn python_repr(&self) -> PyResult { - Python::with_gil(|py| { - Ok(format!( - "", - self.token_type_py.bind(py).repr()?, - self.text.bind(py).repr()?, - self.line, - self.col, - self.start, - self.end, - self.comments.bind(py).repr()?, - )) - }) - } -} +pub mod settings; +pub mod token; +pub mod tokenizer; +pub mod trie; #[pymodule] fn sqlglotrs(m: &Bound<'_, PyModule>) -> PyResult<()> { diff --git a/sqlglotrs/src/settings.rs b/sqlglotrs/src/settings.rs index d49776d5db..b0c951c09a 100644 --- a/sqlglotrs/src/settings.rs +++ b/sqlglotrs/src/settings.rs @@ -1,10 +1,12 @@ -use pyo3::prelude::*; use std::collections::{HashMap, HashSet}; +use pyo3::prelude::*; + pub type TokenType = u16; #[derive(Clone, Debug)] #[pyclass] +#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))] pub struct TokenTypeSettings { pub bit_string: TokenType, pub break_: TokenType, @@ -41,7 +43,7 @@ impl TokenTypeSettings { heredoc_string_alternative: TokenType, hint: TokenType, ) -> Self { - TokenTypeSettings { + let token_type_settings = TokenTypeSettings { bit_string, break_, dcolon, @@ -56,12 +58,31 @@ impl TokenTypeSettings { var, heredoc_string_alternative, hint, + }; + + #[cfg(feature = "profiling")] + { + token_type_settings.write_json_to_string(); } + + token_type_settings + } +} + +#[cfg(feature = "profiling")] +impl TokenTypeSettings { + pub fn write_json_to_string(&self) { + let json = serde_json::to_string(self).unwrap(); + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("benches/token_type_settings.json"); + // Write to file + std::fs::write(path, &json).unwrap(); } } #[derive(Clone, Debug)] #[pyclass] +#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))] pub struct TokenizerSettings { pub white_space: HashMap, pub single_tokens: HashMap, @@ -141,7 +162,7 @@ impl TokenizerSettings { let 
var_single_tokens_native: HashSet = var_single_tokens.iter().map(&to_char).collect(); - TokenizerSettings { + let tokenizer_settings = TokenizerSettings { white_space: white_space_native, single_tokens: single_tokens_native, keywords, @@ -162,12 +183,31 @@ impl TokenizerSettings { string_escapes_allowed_in_raw_strings, nested_comments, hint_start, + }; + + #[cfg(feature = "profiling")] + { + tokenizer_settings.write_json_to_string(); } + + tokenizer_settings + } +} + +#[cfg(feature = "profiling")] +impl TokenizerSettings { + pub fn write_json_to_string(&self) { + let json = serde_json::to_string(self).unwrap(); + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("benches/tokenizer_settings.json"); + // Write to file + std::fs::write(path, &json).unwrap(); } } #[derive(Clone, Debug)] #[pyclass] +#[cfg_attr(feature = "profiling", derive(serde::Serialize, serde::Deserialize))] pub struct TokenizerDialectSettings { pub unescaped_sequences: HashMap, pub identifiers_can_start_with_digit: bool, @@ -182,10 +222,27 @@ impl TokenizerDialectSettings { identifiers_can_start_with_digit: bool, numbers_can_be_underscore_separated: bool, ) -> Self { - TokenizerDialectSettings { + let settings = TokenizerDialectSettings { unescaped_sequences, identifiers_can_start_with_digit, numbers_can_be_underscore_separated, + }; + + #[cfg(feature = "profiling")] + { + settings.write_json_to_string(); } + + settings + } +} + +#[cfg(feature = "profiling")] +impl TokenizerDialectSettings { + pub fn write_json_to_string(&self) { + let json = serde_json::to_string(self).unwrap(); + let path = std::path::Path::new(env!("CARGO_MANIFEST_DIR")) + .join("benches/tokenizer_dialect_settings.json"); + std::fs::write(path, &json).unwrap(); } } diff --git a/sqlglotrs/src/token.rs b/sqlglotrs/src/token.rs new file mode 100644 index 0000000000..33524691a0 --- /dev/null +++ b/sqlglotrs/src/token.rs @@ -0,0 +1,61 @@ +use crate::settings::TokenType; +use pyo3::prelude::PyListMethods; +use 
pyo3::types::{PyList, PyNone, PyString}; +use pyo3::{pyclass, IntoPy, Py, PyObject, Python}; + +#[derive(Debug)] +#[pyclass] +pub struct Token { + #[pyo3(get, name = "token_type_index")] + pub token_type: TokenType, + #[pyo3(get, set, name = "token_type")] + pub token_type_py: PyObject, + #[pyo3(get)] + pub text: Py<PyString>, + #[pyo3(get)] + pub line: usize, + #[pyo3(get)] + pub col: usize, + #[pyo3(get)] + pub start: usize, + #[pyo3(get)] + pub end: usize, + #[pyo3(get)] + pub comments: Py<PyList>, +} + +impl Token { + pub fn new( + token_type: TokenType, + text: String, + line: usize, + col: usize, + start: usize, + end: usize, + comments: Vec<String>, + ) -> Token { + Python::with_gil(|py| Token { + token_type, + token_type_py: PyNone::get_bound(py).into_py(py), + text: PyString::new_bound(py, &text).into_py(py), + line, + col, + start, + end, + comments: PyList::new_bound(py, &comments).into(), + }) + } + + pub fn append_comments(&self, comments: &mut Vec<String>) { + Python::with_gil(|py| { + let pylist = self.comments.bind(py); + for comment in comments.iter() { + if let Err(_) = pylist.append(comment) { + panic!("Failed to append comments to the Python list"); + } + } + }); + // Simulate `Vec::append`. + let _ = std::mem::replace(comments, Vec::new()); + } +} diff --git a/sqlglotrs/src/tokenizer.rs b/sqlglotrs/src/tokenizer.rs index 9aec50f677..02c5327ee3 100644 --- a/sqlglotrs/src/tokenizer.rs +++ b/sqlglotrs/src/tokenizer.rs @@ -1,5 +1,6 @@ +use crate::settings::TokenType; use crate::trie::{Trie, TrieResult}; -use crate::{Token, TokenType, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings}; +use crate::{Token, TokenTypeSettings, TokenizerDialectSettings, TokenizerSettings}; use pyo3::exceptions::PyException; use pyo3::prelude::*; use std::cmp::{max, min}; @@ -375,7 +376,10 @@ impl<'a> TokenizerState<'a> { self.advance(1)?; // Nested comments are allowed by some dialects, e.g.
databricks, duckdb, postgres - if self.settings.nested_comments && !self.is_end && self.chars(comment_start_size) == *comment_start { + if self.settings.nested_comments + && !self.is_end + && self.chars(comment_start_size) == *comment_start + { self.advance(comment_start_size as isize)?; comment_count += 1 } @@ -397,7 +401,11 @@ impl<'a> TokenizerState<'a> { if comment_start == self.settings.hint_start && self.tokens.last().is_some() - && self.settings.tokens_preceding_hint.contains(&self.tokens.last().unwrap().token_type) { + && self + .settings + .tokens_preceding_hint + .contains(&self.tokens.last().unwrap().token_type) + { self.add(self.token_types.hint, None)?; } @@ -443,7 +451,7 @@ impl<'a> TokenizerState<'a> { self.advance(-(tag.len() as isize))?; self.add(self.token_types.heredoc_string_alternative, None)?; - return Ok(true) + return Ok(true); } (None, *token_type, format!("{}{}{}", start, tag, end)) @@ -455,7 +463,8 @@ impl<'a> TokenizerState<'a> { }; self.advance(start.len() as isize)?; - let text = self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?; + let text = + self.extract_string(&end, false, token_type == self.token_types.raw_string, true)?; if let Some(b) = base { if u64::from_str_radix(&text, b).is_err() { @@ -537,7 +546,9 @@ impl<'a> TokenizerState<'a> { self.add(self.token_types.number, Some(number_text))?; self.add(self.token_types.dcolon, Some("::".to_string()))?; self.add(unwrapped_token_type, Some(literal))?; - } else if self.dialect_settings.numbers_can_be_underscore_separated && self.is_numeric(&replaced) { + } else if self.dialect_settings.numbers_can_be_underscore_separated + && self.is_numeric(&replaced) + { self.add(self.token_types.number, Some(number_text + &replaced))?; } else if self.dialect_settings.identifiers_can_start_with_digit { self.add(self.token_types.var, None)?; @@ -677,7 +688,7 @@ impl<'a> TokenizerState<'a> { if self.is_end { if !raise_unmatched { text.push(self.current_char); - return 
Ok(text) + return Ok(text); } return self.error_result(format!( @@ -703,11 +714,13 @@ impl<'a> TokenizerState<'a> { } fn is_identifier(&mut self, s: &str) -> bool { - s.chars().enumerate().all( - |(i, c)| - if i == 0 { self.is_alphabetic_or_underscore(c) } - else { self.is_alphabetic_or_underscore(c) || c.is_digit(10) } - ) + s.chars().enumerate().all(|(i, c)| { + if i == 0 { + self.is_alphabetic_or_underscore(c) + } else { + self.is_alphabetic_or_underscore(c) || c.is_digit(10) + } + }) } fn is_numeric(&mut self, s: &str) -> bool {