diff --git a/Cargo.lock b/Cargo.lock index f89045bb6b0a..c7dbc90df646 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -268,9 +268,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91839b07e474b3995035fd8ac33ee54f9c9ccbbb1ea33d9909c71bffdf1259d" +checksum = "eaf3437355979f1e93ba84ba108c38be5767713051f3c8ffbf07c094e2e61f9f" dependencies = [ "arrow-arith", "arrow-array", @@ -289,9 +289,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "855c57c4efd26722b044dcd3e348252560e3e0333087fb9f6479dc0bf744054f" +checksum = "31dce77d2985522288edae7206bffd5fc4996491841dda01a13a58415867e681" dependencies = [ "arrow-array", "arrow-buffer", @@ -304,16 +304,16 @@ dependencies = [ [[package]] name = "arrow-array" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd03279cea46569acf9295f6224fbc370c5df184b4d2ecfe97ccb131d5615a7f" +checksum = "2d45fe6d3faed0435b7313e59a02583b14c6c6339fa7729e94c32a20af319a79" dependencies = [ "ahash 0.8.11", "arrow-buffer", "arrow-data", "arrow-schema", "chrono", - "chrono-tz 0.10.0", + "chrono-tz 0.10.1", "half", "hashbrown 0.15.2", "num", @@ -321,9 +321,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e4a9b9b1d6d7117f6138e13bc4dd5daa7f94e671b70e8c9c4dc37b4f5ecfc16" +checksum = "2b02656a35cc103f28084bc80a0159668e0a680d919cef127bd7e0aaccb06ec1" dependencies = [ "bytes", "half", @@ -332,9 +332,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc70e39916e60c5b7af7a8e2719e3ae589326039e1e863675a008bee5ffe90fd" +checksum = "c73c6233c5b5d635a56f6010e6eb1ab9e30e94707db21cea03da317f67d84cf3" dependencies = [ "arrow-array", "arrow-buffer", @@ -353,9 +353,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "789b2af43c1049b03a8d088ff6b2257cdcea1756cd76b174b1f2600356771b97" +checksum = "ec222848d70fea5a32af9c3602b08f5d740d5e2d33fbd76bf6fd88759b5b13a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -372,9 +372,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e75edf21ffd53744a9b8e3ed11101f610e7ceb1a29860432824f1834a1f623" +checksum = "b7f2861ffa86f107b8ab577d86cff7c7a490243eabe961ba1e1af4f27542bb79" dependencies = [ "arrow-buffer", "arrow-schema", @@ -384,9 +384,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c09b331887a526f203f2123444792aee924632bd08b9940435070901075832e" +checksum = "3ab7635558f3f803b492eae56c03cde97ea5f85a1c768f94181cb7db69cd81be" dependencies = [ "arrow-array", "arrow-buffer", @@ -405,9 +405,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d186a909dece9160bf8312f5124d797884f608ef5435a36d9d608e0b2a9bcbf8" +checksum = "0270dc511f11bb5fa98a25020ad51a99ca5b08d8a8dfbd17503bb9dba0388f0b" dependencies = [ "arrow-array", "arrow-buffer", @@ -421,9 +421,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66ff2fedc1222942d0bd2fd391cb14a85baa3857be95c9373179bd616753b85" +checksum = "0eff38eeb8a971ad3a4caf62c5d57f0cff8a48b64a55e3207c4fd696a9234aad" dependencies = [ "arrow-array", "arrow-buffer", @@ -441,9 +441,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece7b5bc1180e6d82d1a60e1688c199829e8842e38497563c3ab6ea813e527fd" +checksum = "c6f202a879d287099139ff0d121e7f55ae5e0efe634b8cf2106ebc27a8715dee" dependencies = [ "arrow-array", "arrow-buffer", @@ -456,9 +456,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "745c114c8f0e8ce211c83389270de6fbe96a9088a7b32c2a041258a443fe83ff" +checksum = "a8f936954991c360ba762dff23f5dda16300774fafd722353d9683abd97630ae" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -470,18 +470,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b95513080e728e4cec37f1ff5af4f12c9688d47795d17cda80b6ec2cf74d4678" +checksum = "9579b9d8bce47aa41389fe344f2c6758279983b7c0ebb4013e283e3e91bb450e" dependencies = [ "serde", ] [[package]] name = "arrow-select" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e415279094ea70323c032c6e739c48ad8d80e78a09bef7117b8718ad5bf3722" +checksum = "7471ba126d0b0aaa24b50a36bc6c25e4e74869a1fd1a5553357027a0b1c8d1f1" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -493,9 +493,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11d956cae7002eb8d83a27dbd34daaea1cf5b75852f0b84deb4d93a276e92bbf" +checksum = "72993b01cb62507b06f1fb49648d7286c8989ecfabdb7b77a750fcb54410731b" dependencies = [ "arrow-array", "arrow-buffer", @@ -616,7 +616,7 @@ checksum = "3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -638,7 +638,7 @@ checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -655,7 +655,7 @@ checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -735,7 +735,7 @@ checksum = "3c87f3f15e7794432337fc718554eaa4dc8f04c9677a950ffe366f20a162ae42" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -936,7 +936,7 @@ checksum = "57d123550fa8d071b7255cb0cc04dc302baa6c8c4a79f55701552684d8399bce" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -1059,7 +1059,7 @@ dependencies = [ "regex", "rustc-hash 1.1.0", "shlex", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -1171,7 +1171,7 @@ dependencies = [ "proc-macro-crate 3.2.0", "proc-macro2", 
"quote", - "syn 2.0.90", + "syn 2.0.96", "syn_derive", ] @@ -1532,9 +1532,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build 0.4.0", @@ -1685,7 +1685,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2166,7 +2166,7 @@ dependencies = [ "quote", "snafu 0.8.5", "static_assertions", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2878,7 +2878,7 @@ dependencies = [ "proc-macro2", "quote", "strsim 0.11.1", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -2900,7 +2900,7 @@ checksum = "d336a2a514f6ccccaa3e09b02d41d35330c07ddf03a62165fcec10bb561c7806" dependencies = [ "darling_core 0.20.10", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3233,7 +3233,7 @@ dependencies = [ "datafusion-doc", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3556,7 +3556,7 @@ checksum = "2cdc8d50f426189eef89dac62fabfa0abb27d5cc008f25bf4156a0203325becc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3567,7 +3567,7 @@ checksum = "67e77553c4162a157adbf834ebae5b415acbecbeafc7a74b0e886657506a7611" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3630,7 +3630,7 @@ dependencies = [ "darling 0.20.10", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3660,7 +3660,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4abae7035bf79b9877b779505d8cf3749285b80c43941eda66604841889451dc" dependencies = [ "derive_builder_core 0.20.1", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3680,7 +3680,7 @@ checksum = "cb7330aeadfbe296029522e6c40f315320aba36fc43a5b3632f3795348f3bd22" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "unicode-xid", ] @@ -3692,7 +3692,7 @@ checksum = "65f152f4b8559c4da5d574bafc7af85454d706b4c5fe8b530d508cacbb6807ea" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3766,6 +3766,17 @@ dependencies = [ "winapi", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "dlv-list" version = "0.3.0" @@ -3880,7 +3891,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3892,7 +3903,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -3912,7 +3923,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4130,9 +4141,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "24.3.25" +version = "24.12.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8add37afff2d4ffa83bc748a70b4b1370984f6980768554182424ef71447c35f" +checksum = "4f1baf0dbf96932ec9a3038d57900329c015b0bfb7b63d904f3bc27e2b02a096" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -4387,7 +4398,7 @@ 
checksum = "e99b8b3c28ae0e84b604c75f721c21dc77afb3706076af5e8216d15fd1deaae3" dependencies = [ "frunk_proc_macro_helpers", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4399,7 +4410,7 @@ dependencies = [ "frunk_core", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4411,7 +4422,7 @@ dependencies = [ "frunk_core", "frunk_proc_macro_helpers", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4535,7 +4546,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -4657,9 +4668,9 @@ dependencies = [ [[package]] name = "get-size2" -version = "0.1.3" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3aa3d1f2527cf956b5637a531e21eb1ef9c825c70cd6f8765fd00b7457eef699" +checksum = "159c430715e540d2198fa981d39cd45563ccc60900de187f5b152b33b1cb408e" [[package]] name = "getopts" @@ -5136,7 +5147,7 @@ dependencies = [ "proc-macro-crate 1.3.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5151,7 +5162,7 @@ dependencies = [ "rust-sitter", "rust-sitter-tool", "slotmap", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5170,7 +5181,7 @@ dependencies = [ "serde", "serde_json", "slotmap", - "syn 2.0.90", + "syn 2.0.96", "webbrowser", ] @@ -5184,7 +5195,7 @@ dependencies = [ "proc-macro-crate 1.3.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5382,6 +5393,124 @@ dependencies = [ "cc", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + 
"icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "ident_case" version = "1.0.1" @@ -5390,12 +5519,23 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" [[package]] name = "idna" -version = "0.5.0" +version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "634d9b1461af396cad843f47fdba5597a4f9e6ddd4bfb6ff5d85028c25cb12f6" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" dependencies = [ - "unicode-bidi", - "unicode-normalization", + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", ] [[package]] @@ -5938,7 +6078,7 @@ dependencies = [ "proc-macro2", "quote", "serde_json", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -5999,7 +6139,7 @@ dependencies = [ "proc-macro2", "quote", "regex", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -6019,9 +6159,9 @@ checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" [[package]] name = "lexical-core" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0431c65b318a590c1de6b8fd6e72798c92291d27762d94c9e6c37ed7a73d8458" +checksum = "b765c31809609075565a70b4b71402281283aeda7ecaf4818ac14a7b2ade8958" dependencies = [ "lexical-parse-float", "lexical-parse-integer", @@ -6032,9 +6172,9 @@ dependencies = [ [[package]] name = "lexical-parse-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb17a4bdb9b418051aa59d41d65b1c9be5affab314a872e5ad7f06231fb3b4e0" +checksum = "de6f9cb01fb0b08060209a057c048fcbab8717b4c1ecd2eac66ebfe39a65b0f2" dependencies = [ "lexical-parse-integer", "lexical-util", @@ -6043,9 +6183,9 @@ dependencies = [ [[package]] name = "lexical-parse-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5df98f4a4ab53bf8b175b363a34c7af608fe31f93cc1fb1bf07130622ca4ef61" +checksum = "72207aae22fc0a121ba7b6d479e42cbfea549af1479c3f3a4f12c70dd66df12e" dependencies = [ "lexical-util", "static_assertions", @@ -6053,18 +6193,18 @@ dependencies = [ [[package]] name = "lexical-util" -version = "1.0.3" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85314db53332e5c192b6bca611fb10c114a80d1b831ddac0af1e9be1b9232ca0" +checksum = 
"5a82e24bf537fd24c177ffbbdc6ebcc8d54732c35b50a3f28cc3f4e4c949a0b3" dependencies = [ "static_assertions", ] [[package]] name = "lexical-write-float" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7c3ad4e37db81c1cbe7cf34610340adc09c322871972f74877a712abc6c809" +checksum = "c5afc668a27f460fb45a81a757b6bf2f43c2d7e30cb5a2dcd3abf294c78d62bd" dependencies = [ "lexical-util", "lexical-write-integer", @@ -6073,9 +6213,9 @@ dependencies = [ [[package]] name = "lexical-write-integer" -version = "1.0.2" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb89e9f6958b83258afa3deed90b5de9ef68eef090ad5086c791cd2345610162" +checksum = "629ddff1a914a836fb245616a7888b62903aae58fa771e1d83943035efa0f978" dependencies = [ "lexical-util", "static_assertions", @@ -6083,9 +6223,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.169" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libfuzzer-sys" @@ -6181,6 +6321,12 @@ version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + [[package]] name = "local-ip-address" version = "0.6.3" @@ -6915,7 +7061,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "termcolor", "thiserror 1.0.64", ] @@ -6933,7 +7079,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "termcolor", "thiserror 1.0.64", ] @@ -7107,7 +7253,7 @@ checksum = "254a5372af8fc138e36684761d3c0cdb758a4410e938babcff1c860ce14ddbfc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -7127,9 +7273,9 @@ checksum = "27b02d87554356db9e9a873add8782d4ea6e3e58ea071a9adb9a2e8ddb884a8b" [[package]] name = "neli" -version = "0.6.5" +version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93062a0dce6da2517ea35f301dfc88184ce18d3601ec786a727a87bf535deca9" +checksum = "1100229e06604150b3becd61a4965d5c70f3be1759544ea7274166f4be41ef43" dependencies = [ "byteorder", "libc", @@ -7139,9 +7285,9 @@ dependencies = [ [[package]] name = "neli-proc-macros" -version = "0.1.4" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c8034b7fbb6f9455b2a96c19e6edf8dc9fc34c70449938d8ee3b4df363f61fe" +checksum = "c168194d373b1e134786274020dae7fc5513d565ea2ebb9bc9ff17ffb69106d4" dependencies = [ "either", "proc-macro2", @@ -7316,7 +7462,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -7436,9 +7582,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.11.1" +version = "0.11.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eb4c22c6154a1e759d7099f9ffad7cc5ef8245f9efbab4a41b92623079c82f3" +checksum = "3cfccb68961a56facde1163f9319e0d15743352344e7808a11795fb99698dcaf" dependencies = [ "async-trait", "bytes", @@ -7733,7 +7879,7 @@ 
dependencies = [ "bytemuck", "bytes", "chrono", - "chrono-tz 0.10.0", + "chrono-tz 0.10.1", "fallible-streaming-iterator", "flate2", "futures", @@ -7907,9 +8053,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.3.0" +version = "53.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" +checksum = "8957c0c95a6a1804f3e51a18f69df29be53856a8c5768cc9b6d00fcafcd2917c" dependencies = [ "ahash 0.8.11", "arrow-array", @@ -8100,7 +8246,7 @@ dependencies = [ "pest_meta", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -8142,7 +8288,7 @@ dependencies = [ "rand", "ring", "rust_decimal", - "thiserror 2.0.11", + "thiserror 2.0.6", "tokio", "tokio-rustls 0.26.0", "tokio-util", @@ -8203,7 +8349,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -8528,12 +8674,12 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.25" +version = "0.2.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d1ec885c64d0457d564db4ec299b2dae3f9c02808b8ad9c3a089c591b18033" +checksum = "6924ced06e1f7dfe3fa48d57b9f74f55d8915f5036121bef647ef4b204895fac" dependencies = [ "proc-macro2", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -8726,7 +8872,7 @@ dependencies = [ "prost 0.12.6", "prost-types 0.12.6", "regex", - "syn 2.0.90", + "syn 2.0.96", "tempfile", ] @@ -8747,7 +8893,7 @@ dependencies = [ "prost 0.13.3", "prost-types 0.13.3", "regex", - "syn 2.0.90", + "syn 2.0.96", "tempfile", ] @@ -8774,7 +8920,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -8787,7 +8933,7 @@ dependencies = [ "itertools 0.13.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -9244,7 +9390,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" dependencies = [ "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -9293,7 +9439,7 @@ checksum = "bcc303e793d3734489387d205e9b186fac9c6cfacedd98cbb2e8a5943595f3e6" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -9358,11 +9504,11 @@ dependencies = [ [[package]] name = "regress" -version = "0.10.1" +version = "0.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1541daf4e4ed43a0922b7969bdc2170178bcacc5dabf7e39bc508a9fa3953a7a" +checksum = "78ef7fa9ed0256d64a688a3747d0fef7a88851c18a5e1d57f115f38ec2e09366" dependencies = [ - "hashbrown 0.14.5", + "hashbrown 0.15.2", "memchr", ] @@ -9414,9 +9560,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.12.8" +version = "0.12.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f713147fbe92361e52392c73b8c9e48c04c6625bce969ef54dc901e58e042a7b" +checksum = "a77c62af46e79de0a562e1a9849205ffcb7fc1238876e9bd743357570e04046f" dependencies = [ "base64 0.22.1", "bytes", @@ -9641,7 +9787,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.90", + "syn 2.0.96", "unicode-ident", ] @@ -9653,7 +9799,7 @@ checksum = "b3a8fb4672e840a587a66fc577a5491375df51ddb88f2a2c2a792598c326fe14" dependencies = [ "quote", "rand", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -9676,7 +9822,7 @@ dependencies = [ "proc-macro2", "quote", "rust-embed-utils", - "syn 2.0.90", + "syn 
2.0.96", "walkdir", ] @@ -10083,7 +10229,7 @@ dependencies = [ "proc-macro2", "quote", "serde_derive_internals", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10134,7 +10280,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10187,9 +10333,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] @@ -10206,13 +10352,13 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10223,14 +10369,14 @@ checksum = "18d26a20a969b9e3fdf2fc2d9f21eda6c40e2de84c9408bb5d3b05d499aae711" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] name = "serde_json" -version = "1.0.133" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7fceb2473b9166b2294ef05efcb65a3db80803f0b03ef86a5fc88a2b85ee377" +checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" dependencies = [ "indexmap 2.6.0", "itoa", @@ -10257,7 +10403,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10278,7 +10424,7 @@ dependencies = [ "proc-macro2", "quote", "serde", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10320,7 +10466,7 @@ dependencies = [ "darling 0.20.10", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10698,7 +10844,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10885,7 +11031,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10895,7 +11041,7 @@ source = "git+https://github.com/GreptimeTeam/sqlparser-rs.git?rev=71dd86058d2af dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10940,7 +11086,7 @@ dependencies = [ "serde_json", "sha2", "smallvec", - "thiserror 2.0.11", + "thiserror 2.0.6", "tokio", "tokio-stream", "tracing", @@ -10958,7 +11104,7 @@ dependencies = [ "quote", "sqlx-core", "sqlx-macros-core", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -10981,7 +11127,7 @@ dependencies = [ "sqlx-mysql", "sqlx-postgres", "sqlx-sqlite", - "syn 2.0.90", + "syn 2.0.96", "tempfile", "tokio", "url", @@ -11025,7 +11171,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.11", + "thiserror 2.0.6", "tracing", "whoami", ] @@ -11063,7 +11209,7 @@ dependencies = [ "smallvec", "sqlx-core", "stringprep", - "thiserror 2.0.11", + "thiserror 2.0.6", "tracing", "whoami", ] @@ -11234,7 +11380,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -11247,7 +11393,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.90", + 
"syn 2.0.96", ] [[package]] @@ -11297,7 +11443,7 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.90", + "syn 2.0.96", "typify 0.1.0", "walkdir", ] @@ -11316,13 +11462,13 @@ dependencies = [ "prost 0.13.3", "prost-build 0.13.3", "prost-types 0.13.3", - "regress 0.10.1", + "regress 0.10.3", "schemars", "semver", "serde", "serde_json", "serde_yaml", - "syn 2.0.90", + "syn 2.0.96", "typify 0.2.0", "walkdir", ] @@ -11369,9 +11515,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.90" +version = "2.0.96" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" dependencies = [ "proc-macro2", "quote", @@ -11397,7 +11543,7 @@ dependencies = [ "proc-macro-error", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -11415,6 +11561,17 @@ dependencies = [ "futures-core", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "sysinfo" version = "0.30.13" @@ -11833,11 +11990,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.11" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +checksum = "8fec2a1820ebd077e2b90c4df007bebf344cd394098a13c563957d0afc83ea47" dependencies = [ - "thiserror-impl 2.0.11", + "thiserror-impl 2.0.6", ] [[package]] @@ -11848,18 +12005,18 @@ checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] name = "thiserror-impl" -version = "2.0.11" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +checksum = "d65750cab40f4ff1929fb1ba509e9914eb756131cef4210da8d5d700d26f6312" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -11978,6 +12135,16 @@ dependencies = [ "log", ] +[[package]] +name = "tinystr" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "tinytemplate" version = "1.2.1" @@ -12005,9 +12172,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.40.0" +version = "1.42.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2b070231665d27ad9ec9b8df639893f46727666c6767db40317fbe920a5d998" +checksum = "5cec9b21b0450273377fc97bd4c33a8acffc8c996c987a7c5b319a0083707551" dependencies = [ "backtrace", "bytes", @@ -12040,7 +12207,7 @@ checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -12294,7 +12461,7 @@ dependencies = [ "proc-macro2", "prost-build 0.12.6", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -12308,7 +12475,7 @@ dependencies = [ "prost-build 0.13.3", "prost-types 0.13.3", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -12458,7 
+12625,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -12709,7 +12876,7 @@ checksum = "70b20a22c42c8f1cd23ce5e34f165d4d37038f5b663ad20fb6adbdf029172483" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -12747,7 +12914,7 @@ dependencies = [ "semver", "serde", "serde_json", - "syn 2.0.90", + "syn 2.0.96", "thiserror 1.0.64", "unicode-ident", ] @@ -12762,12 +12929,12 @@ dependencies = [ "log", "proc-macro2", "quote", - "regress 0.10.1", + "regress 0.10.3", "schemars", "semver", "serde", "serde_json", - "syn 2.0.90", + "syn 2.0.96", "thiserror 1.0.64", "unicode-ident", ] @@ -12785,7 +12952,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.90", + "syn 2.0.96", "typify-impl 0.1.0", ] @@ -12802,7 +12969,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 2.0.90", + "syn 2.0.96", "typify-impl 0.2.0", ] @@ -12909,9 +13076,9 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "url" -version = "2.5.2" +version = "2.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22784dbdf76fdde8af1aeda5622b546b422b6fc585325248a2bf9f5e41e94d6c" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" dependencies = [ "form_urlencoded", "idna", @@ -12924,6 +13091,12 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + [[package]] name = "utf8-ranges" version = "1.0.5" @@ -12936,6 +13109,12 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "86bd8d4e895da8537e5315b8254664e6b769c4ff3db18321b297a1e7004392e3" +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "utf8parse" version = "0.2.2" @@ -13060,7 +13239,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "wasm-bindgen-shared", ] @@ -13094,7 +13273,7 @@ checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -13304,7 +13483,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -13315,7 +13494,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -13592,6 +13771,18 @@ dependencies = [ "thiserror 1.0.64", ] +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + [[package]] name = "wyz" version 
= "0.5.1" @@ -13622,9 +13813,9 @@ dependencies = [ [[package]] name = "xattr" -version = "1.4.0" +version = "1.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e105d177a3871454f754b33bb0ee637ecaaac997446375fd3e5d43a2ed00c909" +checksum = "8da84f1a25939b27f6820d92aed108f83ff920fdf11a7b19366c27c4cda81d4f" dependencies = [ "libc", "linux-raw-sys", @@ -13655,6 +13846,30 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cfe53a6657fd280eaa890a3bc59152892ffa3e30101319d168b781ed6529b049" +[[package]] +name = "yoke" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -13673,7 +13888,28 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", +] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", + "synstructure", ] [[package]] @@ -13693,7 +13929,29 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", +] + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", ] [[package]] diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 989c6c4348fc..fff075cac6a1 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -20,11 +20,11 @@ use common_telemetry::tracing; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::PipelineWay; +use pipeline::{GreptimePipelineParams, PipelineWay}; use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult}; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; -use servers::query_handler::OpenTelemetryProtocolHandler; +use servers::query_handler::{OpenTelemetryProtocolHandler, 
PipelineHandlerRef}; use session::context::QueryContextRef; use snafu::ResultExt; @@ -112,8 +112,10 @@ impl OpenTelemetryProtocolHandler for Instance { #[tracing::instrument(skip_all)] async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> ServerResult { @@ -128,7 +130,15 @@ impl OpenTelemetryProtocolHandler for Instance { .get::>(); interceptor_ref.pre_execute(ctx.clone())?; - let (requests, rows) = otlp::logs::to_grpc_insert_requests(request, pipeline, table_name)?; + let (requests, rows) = otlp::logs::to_grpc_insert_requests( + request, + pipeline, + pipeline_params, + table_name, + &ctx, + pipeline_handler, + ) + .await?; let _guard = if let Some(limiter) = &self.limiter { let result = limiter.limit_row_inserts(&requests); diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 8cf221af5b10..ba7240b9d527 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -13,21 +13,22 @@ // limitations under the License. use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Result}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline, Result}; use serde_json::{Deserializer, Value}; fn processor_mut( pipeline: &Pipeline, input_values: Vec, ) -> Result> { - let mut payload = pipeline.init_intermediate_state(); let mut result = Vec::with_capacity(input_values.len()); for v in input_values { - pipeline.prepare(v, &mut payload)?; - let r = pipeline.exec_mut(&mut payload)?; + let mut payload = json_to_intermediate_state(v).unwrap(); + let r = pipeline + .exec_mut(&mut payload)? + .into_transformed() + .expect("expect transformed result "); result.push(r); - pipeline.reset_intermediate_state(&mut payload); } Ok(result) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index f16fd7e57fb2..fa9e54cf0f4a 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+use std::collections::BTreeMap;
+
+use common_telemetry::debug;
 use snafu::OptionExt;
 use yaml_rust::Yaml;
 
-use crate::etl::error::{Error, Result};
-use crate::etl_error::{
-    FieldRequiredForDispatcherSnafu, TablePartRequiredForDispatcherRuleSnafu,
+use crate::etl::error::{
+    Error, FieldRequiredForDispatcherSnafu, Result, TablePartRequiredForDispatcherRuleSnafu,
     ValueRequiredForDispatcherRuleSnafu,
 };
 use crate::Value;
@@ -84,6 +86,7 @@ impl TryFrom<&Yaml> for Dispatcher {
                 .as_str()
                 .map(|s| s.to_string())
                 .context(TablePartRequiredForDispatcherRuleSnafu)?;
+            let pipeline = rule[PIPELINE].as_str().map(|s| s.to_string());
 
             if rule[VALUE].is_badvalue() {
@@ -105,3 +108,21 @@ impl TryFrom<&Yaml> for Dispatcher {
         Ok(Dispatcher { field, rules })
     }
 }
+
+impl Dispatcher {
+    /// Execute the dispatcher and return the matched rule, if any.
+    pub(crate) fn exec(&self, data: &BTreeMap<String, Value>) -> Option<&Rule> {
+        if let Some(value) = data.get(&self.field) {
+            for rule in &self.rules {
+                if rule.value == *value {
+                    return Some(rule);
+                }
+            }
+
+            None
+        } else {
+            debug!("field {} not found in keys {:?}", &self.field, data.keys());
+            None
+        }
+    }
+}
diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs
index d55cf25d543d..cac5c44c17be 100644
--- a/src/pipeline/src/etl.rs
+++ b/src/pipeline/src/etl.rs
@@ -20,18 +20,22 @@ pub mod processor;
 pub mod transform;
 pub mod value;
 
-use ahash::HashSet;
-use common_telemetry::debug;
-use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu};
+use std::collections::BTreeMap;
+use std::sync::Arc;
+
+use error::{
+    IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu, YamlParseSnafu,
+};
 use itertools::Itertools;
-use processor::{Processor, ProcessorBuilder, Processors};
+use processor::{IntermediateStatus, Processor, Processors};
 use snafu::{OptionExt, ResultExt};
-use transform::{TransformBuilders, Transformer, Transforms};
+use transform::{Transformer, Transforms};
 use value::Value;
 use yaml_rust::YamlLoader;
 
-use crate::dispatcher::Dispatcher;
+use crate::dispatcher::{Dispatcher, Rule};
 use crate::etl::error::Result;
+use crate::{GreptimeTransformer, PipelineVersion};
 
 const DESCRIPTION: &str = "description";
 const PROCESSORS: &str = "processors";
@@ -52,103 +56,25 @@ where
         Content::Yaml(str) => {
             let docs = YamlLoader::load_from_str(str).context(YamlLoadSnafu)?;
 
+            if docs.len() != 1 {
+                return YamlParseSnafu.fail();
+            }
+
             let doc = &docs[0];
 
             let description = doc[DESCRIPTION].as_str().map(|s| s.to_string());
 
-            let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() {
+            let processors = if let Some(v) = doc[PROCESSORS].as_vec() {
                 v.try_into()?
             } else {
-                processor::ProcessorBuilderList::default()
-            };
-
-            let transform_builders =
-                if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) {
-                    v.try_into()?
- } else { - TransformBuilders::default() - }; - - let processors_required_keys = &processor_builder_list.input_keys; - let processors_output_keys = &processor_builder_list.output_keys; - let processors_required_original_keys = &processor_builder_list.original_input_keys; - - debug!( - "processors_required_original_keys: {:?}", - processors_required_original_keys - ); - debug!("processors_required_keys: {:?}", processors_required_keys); - debug!("processors_output_keys: {:?}", processors_output_keys); - - let transforms_required_keys = &transform_builders.required_keys; - let mut tr_keys = Vec::with_capacity(50); - for key in transforms_required_keys.iter() { - if !processors_output_keys.contains(key) - && !processors_required_original_keys.contains(key) - { - tr_keys.push(key.clone()); - } - } - - let mut required_keys = processors_required_original_keys.clone(); - - required_keys.append(&mut tr_keys); - required_keys.sort(); - - debug!("required_keys: {:?}", required_keys); - - // intermediate keys are the keys that all processor and transformer required - let ordered_intermediate_keys: Vec = [ - processors_required_keys, - transforms_required_keys, - processors_output_keys, - ] - .iter() - .flat_map(|l| l.iter()) - .collect::>() - .into_iter() - .sorted() - .cloned() - .collect_vec(); - - let mut final_intermediate_keys = Vec::with_capacity(ordered_intermediate_keys.len()); - let mut intermediate_keys_exclude_original = - Vec::with_capacity(ordered_intermediate_keys.len()); - - for key_name in ordered_intermediate_keys.iter() { - if required_keys.contains(key_name) { - final_intermediate_keys.push(key_name.clone()); - } else { - intermediate_keys_exclude_original.push(key_name.clone()); - } - } - - final_intermediate_keys.extend(intermediate_keys_exclude_original); - - let output_keys = transform_builders.output_keys.clone(); - - let processors_kind_list = processor_builder_list - .processor_builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys)) - .collect::>>()?; - let processors = Processors { - processors: processors_kind_list, - required_keys: processors_required_keys.clone(), - output_keys: processors_output_keys.clone(), - required_original_keys: processors_required_original_keys.clone(), + Processors::default() }; - let transfor_list = transform_builders - .builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys, &output_keys)) - .collect::>>()?; - - let transformers = Transforms { - transforms: transfor_list, - required_keys: transforms_required_keys.clone(), - output_keys: output_keys.clone(), + let transformers = if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) + { + v.try_into()? 
+ } else { + Transforms::default() }; let transformer = T::new(transformers)?; @@ -164,9 +90,6 @@ where processors, transformer, dispatcher, - required_keys, - output_keys, - intermediate_keys: final_intermediate_keys, }) } Content::Json(_) => unimplemented!(), @@ -182,97 +105,91 @@ where processors: processor::Processors, dispatcher: Option, transformer: T, - /// required keys for the preprocessing from map data from user - /// include all processor required and transformer required keys - required_keys: Vec, - /// all output keys from the transformer - output_keys: Vec, - /// intermediate keys from the processors - intermediate_keys: Vec, - // pub on_failure: processor::Processors, } -impl Pipeline -where - T: Transformer, -{ - pub fn exec_mut(&self, val: &mut Vec) -> Result { - for processor in self.processors.iter() { - processor.exec_mut(val)?; - } +/// Where the pipeline executed is dispatched to, with context information +#[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd, Ord)] +pub struct DispatchedTo { + pub table_part: String, + pub pipeline: Option, +} - self.transformer.transform_mut(val) +impl From<&Rule> for DispatchedTo { + fn from(value: &Rule) -> Self { + DispatchedTo { + table_part: value.table_part.clone(), + pipeline: value.pipeline.clone(), + } } +} - pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { - match val { - Value::Map(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.values.into_iter() { - if search_from >= self.required_keys.len() { - break; - } +/// The result of pipeline execution +#[derive(Debug)] +pub enum PipelineExecOutput { + Transformed(O), + DispatchedTo(DispatchedTo), +} - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = self.required_keys[search_from..] - .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value; - // next search from is always after the current key - search_from += pos; - } - } - } - Value::String(_) => { - result[0] = val; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); - } +impl PipelineExecOutput { + pub fn into_transformed(self) -> Option { + if let Self::Transformed(o) = self { + Some(o) + } else { + None } - Ok(()) } - pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { - match val { - serde_json::Value::Object(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.into_iter() { - if search_from >= self.required_keys.len() { - break; - } + pub fn into_dispatched(self) -> Option { + if let Self::DispatchedTo(d) = self { + Some(d) + } else { + None + } + } +} - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = self.required_keys[search_from..] 
- .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value.try_into()?; - // next search from is always after the current key - search_from += pos; - } - } - } - serde_json::Value::String(_) => { - result[0] = val.try_into()?; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); +pub fn json_to_intermediate_state(val: serde_json::Value) -> Result { + match val { + serde_json::Value::Object(map) => { + let mut intermediate_state = BTreeMap::new(); + for (k, v) in map { + intermediate_state.insert(k, Value::try_from(v)?); } + Ok(intermediate_state) } - Ok(()) + _ => PrepareValueMustBeObjectSnafu.fail(), } +} - pub fn init_intermediate_state(&self) -> Vec { - vec![Value::Null; self.intermediate_keys.len()] - } +pub fn json_array_to_intermediate_state( + val: Vec, +) -> Result> { + val.into_iter().map(json_to_intermediate_state).collect() +} - pub fn reset_intermediate_state(&self, result: &mut [Value]) { - for i in result { - *i = Value::Null; +impl Pipeline +where + T: Transformer, +{ + pub fn exec_mut( + &self, + val: &mut BTreeMap, + ) -> Result> { + for processor in self.processors.iter() { + processor.exec_mut(val)?; + } + + let matched_rule = self + .dispatcher + .as_ref() + .and_then(|dispatcher| dispatcher.exec(val)); + + match matched_rule { + None => self + .transformer + .transform_mut(val) + .map(PipelineExecOutput::Transformed), + Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), } } @@ -284,21 +201,6 @@ where &self.transformer } - /// Required fields in user-supplied data - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// All output keys from the pipeline - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// intermediate keys from the processors - pub fn intermediate_keys(&self) -> &Vec { - &self.intermediate_keys - } - pub fn schemas(&self) -> &Vec { self.transformer.schemas() } @@ -336,9 +238,29 @@ impl SelectInfo { } } +pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; + +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +pub enum PipelineDefinition { + Resolved(Arc>), + ByNameAndValue((String, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl PipelineDefinition { + pub fn from_name(name: &str, version: PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name.to_owned(), version)) + } + } +} + pub enum PipelineWay { - OtlpLog(Box), - Custom(std::sync::Arc>), + OtlpLogDirect(Box), + Pipeline(PipelineDefinition), } #[cfg(test)] @@ -353,33 +275,31 @@ mod tests { #[test] fn test_pipeline_prepare() { let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; + { + "my_field": "1,2", + "foo": "bar" + } + "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' processors: - - csv: - field: my_field - target_fields: field1, field2 + - csv: + field: my_field + target_fields: field1, field2 transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - 
assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); @@ -395,40 +315,42 @@ transform: fn test_dissect_pipeline() { let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); let pipeline_str = r#"processors: - - dissect: - fields: - - message - patterns: - - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - - timestamp: - fields: - - ts - formats: - - "%d/%b/%Y:%H:%M:%S %z" + - dissect: + fields: + - message + patterns: + - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + - timestamp: + fields: + - ts + formats: + - "%d/%b/%Y:%H:%M:%S %z" transform: - - fields: - - ip - - username - - method - - path - - proto - type: string - - fields: - - status - type: uint16 - - fields: - - bytes - type: uint32 - - field: ts - type: timestamp, ns - index: time"#; + - fields: + - ip + - username + - method + - path + - proto + type: string + - fields: + - status + type: uint16 + - fields: + - bytes + type: uint32 + - field: ts + type: timestamp, ns + index: time"#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline - .prepare(serde_json::Value::String(message), &mut payload) + let mut payload = BTreeMap::new(); + payload.insert("message".to_string(), Value::String(message)); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() .unwrap(); - let result = pipeline.exec_mut(&mut payload).unwrap(); let sechema = pipeline.schemas(); assert_eq!(sechema.len(), result.values.len()); @@ -479,35 +401,33 @@ transform: #[test] fn test_csv_pipeline() { let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; + { + "my_field": "1,2", + "foo": "bar" + } + "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); let pipeline_yaml = r#" -description: Pipeline for Apache Tomcat -processors: - - csv: - field: my_field - target_fields: field1, field2 -transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; + description: Pipeline for Apache Tomcat + processors: + - csv: + field: my_field + target_fields: field1, field2 + transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); 
match &result.values[2].value_data { @@ -521,33 +441,36 @@ transform: #[test] fn test_date_pipeline() { let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar", - "test_time": "2014-5-17T04:34:56+00:00" - } - "#; + { + "my_field": "1,2", + "foo": "bar", + "test_time": "2014-5-17T04:34:56+00:00" + } + "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - let pipeline_yaml = r#" ---- + let pipeline_yaml = r#"--- description: Pipeline for Apache Tomcat processors: - - timestamp: - field: test_time + - timestamp: + field: test_time transform: - - field: test_time - type: timestamp, ns - index: time -"#; + - field: test_time + type: timestamp, ns + index: time + "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let schema = pipeline.schemas().clone(); - let mut result = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut result).unwrap(); - let row = pipeline.exec_mut(&mut result).unwrap(); + let mut result = json_to_intermediate_state(input_value).unwrap(); + + let row = pipeline + .exec_mut(&mut result) + .unwrap() + .into_transformed() + .unwrap(); let output = Rows { schema, rows: vec![row], diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index 999345fb1e2e..51080c86eebf 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -543,6 +543,11 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + #[snafu(display("Yaml parse error."))] + YamlParse { + #[snafu(implicit)] + location: Location, + }, #[snafu(display("Prepare value must be an object"))] PrepareValueMustBeObject { #[snafu(implicit)] @@ -590,9 +595,9 @@ pub enum Error { }, #[snafu(display("Field is required for dispatcher"))] FieldRequiredForDispatcher, - #[snafu(display("table_part is required for dispatcher rule"))] + #[snafu(display("Table_part is required for dispatcher rule"))] TablePartRequiredForDispatcherRule, - #[snafu(display("value is required for dispatcher rule"))] + #[snafu(display("Value is required for dispatcher rule"))] ValueRequiredForDispatcherRule, #[snafu(display( "Reached max nested levels when flattening JSON object: {max_nested_levels}" diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs index 10fa681f236c..dd4835ec9279 100644 --- a/src/pipeline/src/etl/field.rs +++ b/src/pipeline/src/etl/field.rs @@ -19,133 +19,12 @@ use snafu::OptionExt; use super::error::{EmptyInputFieldSnafu, MissingInputFieldSnafu}; use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; - -/// Information about the input field including the name and index in intermediate keys. -#[derive(Debug, Default, Clone)] -pub struct InputFieldInfo { - pub(crate) name: String, - pub(crate) index: usize, -} - -impl InputFieldInfo { - /// Create a new input field info with the given field name and index. - pub(crate) fn new(field: impl Into, index: usize) -> Self { - InputFieldInfo { - name: field.into(), - index, - } - } -} - -/// Information about a field that has one input and one output. -#[derive(Debug, Default, Clone)] -pub struct OneInputOneOutputField { - input: InputFieldInfo, - output: Option<(String, usize)>, -} - -impl OneInputOneOutputField { - /// Create a new field with the given input and output. 
- pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self { - OneInputOneOutputField { - input, - output: Some(output), - } - } - - /// Build a new field with the given processor kind, intermediate keys, input field, and target field. - pub(crate) fn build( - processor_kind: &str, - intermediate_keys: &[String], - input_field: &str, - target_field: &str, - ) -> Result { - let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?; - - let input_field_info = InputFieldInfo::new(input_field, input_index); - let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?; - Ok(OneInputOneOutputField::new( - input_field_info, - (target_field.to_string(), output_index), - )) - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the index of the output field. - pub(crate) fn output_index(&self) -> usize { - *self.output().1 - } - - /// Get the name of the output field. - pub(crate) fn output_name(&self) -> &str { - self.output().0 - } - - /// Get the output field information. - pub(crate) fn output(&self) -> (&String, &usize) { - if let Some((name, index)) = &self.output { - (name, index) - } else { - (&self.input.name, &self.input.index) - } - } -} - -/// Information about a field that has one input and multiple outputs. -#[derive(Debug, Default, Clone)] -pub struct OneInputMultiOutputField { - input: InputFieldInfo, - /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together. - prefix: Option, -} - -impl OneInputMultiOutputField { - /// Create a new field with the given input and prefix. - pub(crate) fn new(input: InputFieldInfo, prefix: Option) -> Self { - OneInputMultiOutputField { input, prefix } - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the prefix for the output fields. - pub(crate) fn target_prefix(&self) -> &str { - self.prefix.as_deref().unwrap_or(&self.input.name) - } -} /// Raw processor-defined inputs and outputs #[derive(Debug, Default, Clone)] pub struct Field { - pub(crate) input_field: String, - pub(crate) target_field: Option, + input_field: String, + target_field: Option, } impl FromStr for Field { @@ -194,6 +73,10 @@ impl Field { pub(crate) fn target_or_input_field(&self) -> &str { self.target_field.as_deref().unwrap_or(&self.input_field) } + + pub(crate) fn set_target_field(&mut self, target_field: Option) { + self.target_field = target_field; + } } /// A collection of fields. 
diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index bf37f1f8ce7f..005feca3794e 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -27,32 +27,33 @@ pub mod regex; pub mod timestamp; pub mod urlencoding; -use ahash::{HashSet, HashSetExt}; -use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; -use csv::{CsvProcessor, CsvProcessorBuilder}; -use date::{DateProcessor, DateProcessorBuilder}; -use decolorize::{DecolorizeProcessor, DecolorizeProcessorBuilder}; -use digest::{DigestProcessor, DigestProcessorBuilder}; -use dissect::{DissectProcessor, DissectProcessorBuilder}; +use std::collections::BTreeMap; + +use cmcd::CmcdProcessor; +use csv::CsvProcessor; +use date::DateProcessor; +use decolorize::DecolorizeProcessor; +use digest::DigestProcessor; +use dissect::DissectProcessor; use enum_dispatch::enum_dispatch; -use epoch::{EpochProcessor, EpochProcessorBuilder}; -use gsub::{GsubProcessor, GsubProcessorBuilder}; -use itertools::Itertools; -use join::{JoinProcessor, JoinProcessorBuilder}; -use json_path::{JsonPathProcessor, JsonPathProcessorBuilder}; -use letter::{LetterProcessor, LetterProcessorBuilder}; -use regex::{RegexProcessor, RegexProcessorBuilder}; +use epoch::EpochProcessor; +use gsub::GsubProcessor; +use join::JoinProcessor; +use json_path::JsonPathProcessor; +use letter::LetterProcessor; +use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; -use timestamp::{TimestampProcessor, TimestampProcessorBuilder}; -use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder}; +use timestamp::TimestampProcessor; +use urlencoding::UrlEncodingProcessor; use super::error::{ FailedParseFieldFromStringSnafu, FieldMustBeTypeSnafu, ProcessorKeyMustBeStringSnafu, - ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, UnsupportedProcessorSnafu, + ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, }; use super::field::{Field, Fields}; use crate::etl::error::{Error, Result}; use crate::etl::value::Value; +use crate::etl_error::UnsupportedProcessorSnafu; const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; @@ -65,6 +66,8 @@ const TARGET_FIELDS_NAME: &str = "target_fields"; const JSON_PATH_NAME: &str = "json_path"; const JSON_PATH_RESULT_INDEX_NAME: &str = "result_index"; +pub type IntermediateStatus = BTreeMap; + /// Processor trait defines the interface for all processors. 
/// /// A processor is a transformation that can be applied to a field in a document @@ -80,7 +83,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: &mut Vec) -> Result<()>; + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()>; } #[derive(Debug)] @@ -102,57 +105,12 @@ pub enum ProcessorKind { Digest(DigestProcessor), } -/// ProcessorBuilder trait defines the interface for all processor builders -/// A processor builder is used to create a processor -#[enum_dispatch(ProcessorBuilders)] -pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { - /// Get the processor's output keys - fn output_keys(&self) -> HashSet<&str>; - /// Get the processor's input keys - fn input_keys(&self) -> HashSet<&str>; - /// Build the processor - fn build(self, intermediate_keys: &[String]) -> Result; -} - -#[derive(Debug)] -#[enum_dispatch] -pub enum ProcessorBuilders { - Cmcd(CmcdProcessorBuilder), - Csv(CsvProcessorBuilder), - Dissect(DissectProcessorBuilder), - Gsub(GsubProcessorBuilder), - Join(JoinProcessorBuilder), - Letter(LetterProcessorBuilder), - Regex(RegexProcessorBuilder), - Timestamp(TimestampProcessorBuilder), - UrlEncoding(UrlEncodingProcessorBuilder), - Epoch(EpochProcessorBuilder), - Date(DateProcessorBuilder), - JsonPath(JsonPathProcessorBuilder), - Decolorize(DecolorizeProcessorBuilder), - Digest(DigestProcessorBuilder), -} - -#[derive(Debug, Default)] -pub struct ProcessorBuilderList { - pub(crate) processor_builders: Vec, - pub(crate) input_keys: Vec, - pub(crate) output_keys: Vec, - pub(crate) original_input_keys: Vec, -} - #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors /// The order of processors is important /// The output of the first processor will be the input of the second processor pub processors: Vec, - /// all required keys in all processors - pub required_keys: Vec, - /// all required keys in user-supplied data, not pipeline output fields - pub required_original_keys: Vec, - /// all output keys in all processors - pub output_keys: Vec, } impl std::ops::Deref for Processors { @@ -169,80 +127,22 @@ impl std::ops::DerefMut for Processors { } } -impl Processors { - /// A collection of all the processor's required input fields - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// A collection of all the processor's output fields - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// Required fields in user-supplied data, not pipeline output fields. 
- pub fn required_original_keys(&self) -> &Vec { - &self.required_original_keys - } -} - -impl TryFrom<&Vec> for ProcessorBuilderList { +impl TryFrom<&Vec> for Processors { type Error = Error; fn try_from(vec: &Vec) -> Result { let mut processors_builders = vec![]; - let mut all_output_keys = HashSet::with_capacity(50); - let mut all_required_keys = HashSet::with_capacity(50); - let mut all_required_original_keys = HashSet::with_capacity(50); for doc in vec { let processor = parse_processor(doc)?; processors_builders.push(processor); } - - for processor in processors_builders.iter() { - { - // get all required keys - let processor_required_keys = processor.input_keys(); - - for key in &processor_required_keys { - if !all_output_keys.contains(key) { - all_required_original_keys.insert(*key); - } - } - - all_required_keys.extend(processor_required_keys); - - let processor_output_keys = processor.output_keys().into_iter(); - all_output_keys.extend(processor_output_keys); - } - } - - let all_required_keys = all_required_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_output_keys = all_output_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_required_original_keys = all_required_original_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - - Ok(ProcessorBuilderList { - processor_builders: processors_builders, - input_keys: all_required_keys, - output_keys: all_output_keys, - original_input_keys: all_required_original_keys, + Ok(Processors { + processors: processors_builders, }) } } -fn parse_processor(doc: &yaml_rust::Yaml) -> Result { +fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let map = doc.as_hash().context(ProcessorMustBeMapSnafu)?; let key = map.keys().next().context(ProcessorMustHaveStringKeySnafu)?; @@ -256,34 +156,28 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; let processor = match str_key { - cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?), - csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?), - dissect::PROCESSOR_DISSECT => { - ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?) - } - epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?), - date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?), - gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?), - join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?), - letter::PROCESSOR_LETTER => { - ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?) 
- } - regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?), + cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), + csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), + dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), + epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), + date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), + gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), + join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), + letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), + regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), timestamp::PROCESSOR_TIMESTAMP => { - ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?) + ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) } urlencoding::PROCESSOR_URL_ENCODING => { - ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?) + ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?) } json_path::PROCESSOR_JSON_PATH => { - ProcessorBuilders::JsonPath(json_path::JsonPathProcessorBuilder::try_from(value)?) + ProcessorKind::JsonPath(json_path::JsonPathProcessor::try_from(value)?) } decolorize::PROCESSOR_DECOLORIZE => { - ProcessorBuilders::Decolorize(DecolorizeProcessorBuilder::try_from(value)?) - } - digest::PROCESSOR_DIGEST => { - ProcessorBuilders::Digest(DigestProcessorBuilder::try_from(value)?) + ProcessorKind::Decolorize(DecolorizeProcessor::try_from(value)?) } + digest::PROCESSOR_DIGEST => ProcessorKind::Digest(DigestProcessor::try_from(value)?), _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), }; diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 086fe8f3d610..a5da69d0be42 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -18,20 +18,19 @@ use std::collections::BTreeMap; -use ahash::HashSet; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; +use super::IntermediateStatus; use crate::etl::error::{ CmcdMissingKeySnafu, CmcdMissingValueSnafu, Error, FailedToParseFloatKeySnafu, FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -77,139 +76,6 @@ const CMCD_KEYS: [&str; 18] = [ CMCD_KEY_V, ]; -/// CmcdProcessorBuilder is a builder for CmcdProcessor -/// parse from raw yaml -#[derive(Debug, Default)] -pub struct CmcdProcessorBuilder { - fields: Fields, - output_keys: HashSet, - ignore_missing: bool, -} - -impl CmcdProcessorBuilder { - /// build_cmcd_outputs build cmcd output info - /// generate index and function for each output - pub(super) fn build_cmcd_outputs( - field: &Field, - intermediate_keys: &[String], - ) -> Result<(BTreeMap, Vec)> { - let mut output_index = BTreeMap::new(); - let mut cmcd_field_outputs = 
Vec::with_capacity(CMCD_KEYS.len()); - for cmcd in CMCD_KEYS { - let final_key = generate_key(field.target_or_input_field(), cmcd); - let index = find_key_index(intermediate_keys, &final_key, "cmcd")?; - output_index.insert(final_key.clone(), index); - match cmcd { - CMCD_KEY_BS | CMCD_KEY_SU => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP - | CMCD_KEY_RTP | CMCD_KEY_TB => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID - | CMCD_KEY_ST | CMCD_KEY_V => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_NOR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_PR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr); - cmcd_field_outputs.push(output_info); - } - _ => {} - } - } - Ok((output_index, cmcd_field_outputs)) - } - - /// build CmcdProcessor from CmcdProcessorBuilder - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len()); - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; - - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - - let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; - - cmcd_outputs.push(cmcd_field_outputs); - - let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(real_field); - } - Ok(CmcdProcessor { - fields: real_fields, - cmcd_outputs, - ignore_missing: self.ignore_missing, - }) - } -} - -impl ProcessorBuilder for CmcdProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Cmcd) - } -} - -fn generate_key(prefix: &str, key: &str) -> String { - format!("{}_{}", prefix, key) -} - -/// CmcdOutputInfo is a struct to store output info -#[derive(Debug)] -pub(super) struct CmcdOutputInfo { - /// {input_field}_{cmcd_key} - final_key: String, - /// cmcd key - key: &'static str, - /// index in intermediate_keys - index: usize, - /// function to resolve value - f: fn(&str, &str, Option<&str>) -> Result, -} - -impl CmcdOutputInfo { - fn new( - final_key: String, - key: &'static str, - index: usize, - f: fn(&str, &str, Option<&str>) -> Result, - ) -> Self { - Self { - final_key, - key, - index, - f, - } - } -} - -impl Default for CmcdOutputInfo { - fn default() -> Self { - Self { - final_key: String::default(), - key: "", - index: 0, - f: |_, _, _| Ok(Value::Null), - } - } -} - /// function to resolve CMCD_KEY_BS | CMCD_KEY_SU fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { Ok(Value::Boolean(true)) @@ -286,9 +152,7 @@ fn pr(s: &str, k: &str, v: Option<&str>) -> Result { /// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data. 
#[derive(Debug, Default)] pub struct CmcdProcessor { - fields: Vec, - cmcd_outputs: Vec>, - + fields: Fields, ignore_missing: bool, } @@ -297,27 +161,52 @@ impl CmcdProcessor { format!("{}_{}", prefix, key) } - fn parse(&self, field_index: usize, s: &str) -> Result> { - let parts = s.split(','); - let mut result = Vec::new(); + fn parse(&self, name: &str, value: &str) -> Result> { + let mut working_set = BTreeMap::new(); + + let parts = value.split(','); + for part in parts { let mut kv = part.split('='); - let k = kv.next().context(CmcdMissingKeySnafu { part, s })?; + let k = kv.next().context(CmcdMissingKeySnafu { part, s: value })?; let v = kv.next(); - for cmcd_key in self.cmcd_outputs[field_index].iter() { - if cmcd_key.key == k { - let val = (cmcd_key.f)(s, k, v)?; - result.push((cmcd_key.index, val)); + for cmcd_key in CMCD_KEYS { + if cmcd_key == k { + match cmcd_key { + CMCD_KEY_BS | CMCD_KEY_SU => { + working_set + .insert(Self::generate_key(name, cmcd_key), bs_su(value, k, v)?); + } + CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP + | CMCD_KEY_RTP | CMCD_KEY_TB => { + working_set + .insert(Self::generate_key(name, cmcd_key), br_tb(value, k, v)?); + } + CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID + | CMCD_KEY_ST | CMCD_KEY_V => { + working_set + .insert(Self::generate_key(name, cmcd_key), cid_v(value, k, v)?); + } + CMCD_KEY_NOR => { + working_set + .insert(Self::generate_key(name, cmcd_key), nor(value, k, v)?); + } + CMCD_KEY_PR => { + working_set + .insert(Self::generate_key(name, cmcd_key), pr(value, k, v)?); + } + + _ => {} + } } } } - - Ok(result) + Ok(working_set) } } -impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -344,22 +233,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { } } - let output_keys = fields - .iter() - .flat_map(|f| { - CMCD_KEYS - .iter() - .map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key)) - }) - .collect(); - - let builder = CmcdProcessorBuilder { + let proc = CmcdProcessor { fields, - output_keys, ignore_missing, }; - Ok(builder) + Ok(proc) } } @@ -372,21 +251,20 @@ impl Processor for CmcdProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let field_value_index = field.input_index(); - match val.get(field_value_index) { - Some(Value::String(v)) => { - let result_list = self.parse(field_index, v)?; - for (output_index, v) in result_list { - val[output_index] = v; - } + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { + for field in self.fields.iter() { + let name = field.input_field(); + + match val.get(name) { + Some(Value::String(s)) => { + let results = self.parse(field.target_or_input_field(), s)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -400,6 +278,7 @@ impl Processor for CmcdProcessor { } } } + Ok(()) } } @@ -410,9 +289,9 @@ mod tests { use urlencoding::decode; - use super::{CmcdProcessorBuilder, CMCD_KEYS}; + use super::CmcdProcessor; use crate::etl::field::{Field, Fields}; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_cmcd() { @@ -546,37 +425,20 @@ mod tests { let 
field = Field::new("prefix", None); - let output_keys = CMCD_KEYS - .iter() - .map(|k| format!("prefix_{}", k)) - .collect::>(); - - let mut intermediate_keys = vec!["prefix".to_string()]; - intermediate_keys.append(&mut (output_keys.clone())); - - let builder = CmcdProcessorBuilder { + let processor = CmcdProcessor { fields: Fields::new(vec![field]), - output_keys: output_keys.iter().map(|s| s.to_string()).collect(), ignore_missing: false, }; - let processor = builder.build(&intermediate_keys).unwrap(); - for (s, vec) in ss.into_iter() { let decoded = decode(s).unwrap().to_string(); - let values = vec + let expected = vec .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect::>(); - let expected = Map { values }; - let actual = processor.parse(0, &decoded).unwrap(); - let actual = actual - .into_iter() - .map(|(index, value)| (intermediate_keys[index].clone(), value)) - .collect::>(); - let actual = Map { values: actual }; + let actual = processor.parse("prefix", &decoded).unwrap(); assert_eq!(actual, expected); } } diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index c9cb5f847db1..a0fac70de15c 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -14,7 +14,8 @@ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html -use ahash::HashSet; +use std::collections::BTreeMap; + use csv::{ReaderBuilder, Trim}; use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::Itertools; @@ -24,11 +25,10 @@ use crate::etl::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -40,76 +40,17 @@ const TRIM_NAME: &str = "trim"; const EMPTY_VALUE_NAME: &str = "empty_value"; const TARGET_FIELDS: &str = "target_fields"; -#[derive(Debug, Default)] -pub struct CsvProcessorBuilder { - reader: ReaderBuilder, - - fields: Fields, - ignore_missing: bool, - - // Value used to fill empty fields, empty fields will be skipped if this is not provided. 
- empty_value: Option, - target_fields: Vec, - // description - // if - // ignore_failure - // on_failure - // tag -} - -impl CsvProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; - - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - let real_field = OneInputMultiOutputField::new(input_field_info, None); - real_fields.push(real_field); - } - - let output_index_info = self - .target_fields - .iter() - .map(|f| find_key_index(intermediate_keys, f, "csv")) - .collect::>>()?; - Ok(CsvProcessor { - reader: self.reader, - fields: real_fields, - ignore_missing: self.ignore_missing, - empty_value: self.empty_value, - output_index_info, - }) - } -} - -impl ProcessorBuilder for CsvProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.target_fields.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Csv) - } -} - /// only support string value -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CsvProcessor { reader: ReaderBuilder, - - fields: Vec, + fields: Fields, ignore_missing: bool, // Value used to fill empty fields, empty fields will be skipped if this is not provided. empty_value: Option, - output_index_info: Vec, + target_fields: Vec, // description // if // ignore_failure @@ -119,18 +60,20 @@ pub struct CsvProcessor { impl CsvProcessor { // process the csv format string to a map with target_fields as keys - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let mut reader = self.reader.from_reader(val.as_bytes()); if let Some(result) = reader.records().next() { let record: csv::StringRecord = result.context(CsvReadSnafu)?; - let values: Vec<(usize, Value)> = self - .output_index_info + let values = self + .target_fields .iter() .zip_longest(record.iter()) .filter_map(|zipped| match zipped { - Both(target_field, val) => Some((*target_field, Value::String(val.into()))), + Both(target_field, val) => { + Some((target_field.clone(), Value::String(val.into()))) + } // if target fields are more than extracted fields, fill the rest with empty value Left(target_field) => { let value = self @@ -138,7 +81,7 @@ impl CsvProcessor { .as_ref() .map(|s| Value::String(s.clone())) .unwrap_or(Value::Null); - Some((*target_field, value)) + Some((target_field.clone(), value)) } // if extracted fields are more than target fields, ignore the rest Right(_) => None, @@ -152,7 +95,7 @@ impl CsvProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -224,8 +167,8 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { _ => {} } } - let builder = { - CsvProcessorBuilder { + let proc = { + CsvProcessor { reader, fields, ignore_missing, @@ -234,7 +177,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { } }; - Ok(builder) + Ok(proc) } } @@ -247,21 +190,20 @@ impl Processor for CsvProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); 
- match val.get(index) { + let name = field.input_field(); + + match val.get(name) { Some(Value::String(v)) => { - let resule_list = self.process(v)?; - for (k, v) in resule_list { - val[k] = v; - } + let results = self.process(v)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -282,37 +224,28 @@ impl Processor for CsvProcessor { #[cfg(test)] mod tests { - use ahash::HashMap; - - use super::Value; - use crate::etl::processor::csv::CsvProcessorBuilder; + use super::*; + use crate::etl::field::Field; #[test] fn test_equal_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), ("b".into(), Value::String("2".into())), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -324,21 +257,14 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -346,7 +272,7 @@ mod tests { ("c".into(), Value::Null), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -355,22 +281,15 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -389,22 +308,14 @@ mod tests { fn test_target_fields_has_less_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, target_fields: vec!["a".into(), "b".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = 
builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index fa202a0edff2..e080b795402c 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -14,21 +14,21 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateParseSnafu, DateParseTimezoneSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::{Timestamp, Value}; @@ -88,55 +88,7 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug, Default)] -pub struct DateProcessorBuilder { - fields: Fields, - formats: Formats, - timezone: Option>, - locale: Option>, - ignore_missing: bool, -} - -impl ProcessorBuilder for DateProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Date) - } -} - -impl DateProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "date", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DateProcessor { - fields: real_fields, - formats: self.formats, - timezone: self.timezone, - locale: self.locale, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -181,7 +133,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { } } - let builder = DateProcessorBuilder { + let builder = DateProcessor { fields, formats, timezone, @@ -197,7 +149,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { /// Reserved for compatibility only #[derive(Debug, Default)] pub struct DateProcessor { - fields: Vec, + fields: Fields, formats: Formats, timezone: Option>, locale: Option>, // to support locale @@ -242,20 +194,20 @@ impl Processor for DateProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); 
match val.get(index) { Some(Value::String(s)) => { let timestamp = self.parse(s)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(timestamp)); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: field.input_field().to_string(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/decolorize.rs b/src/pipeline/src/etl/processor/decolorize.rs index e72bc28a1e66..2547b99d6824 100644 --- a/src/pipeline/src/etl/processor/decolorize.rs +++ b/src/pipeline/src/etl/processor/decolorize.rs @@ -18,18 +18,17 @@ //! from Grafana Loki and [`strip_ansi_escape_codes`](https://vector.dev/docs/reference/vrl/functions/#strip_ansi_escape_codes) //! from Vector VRL. -use ahash::HashSet; use once_cell::sync::Lazy; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -37,52 +36,10 @@ pub(crate) const PROCESSOR_DECOLORIZE: &str = "decolorize"; static RE: Lazy = Lazy::new(|| Regex::new(r"\x1b\[[0-9;]*m").unwrap()); -#[derive(Debug, Default)] -pub struct DecolorizeProcessorBuilder { - fields: Fields, - ignore_missing: bool, -} - -impl ProcessorBuilder for DecolorizeProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Decolorize) - } -} - -impl DecolorizeProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "decolorize", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DecolorizeProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - }) - } -} - /// Remove ANSI color control codes from the input text. 
#[derive(Debug, Default)] pub struct DecolorizeProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, } @@ -103,7 +60,7 @@ impl DecolorizeProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -129,7 +86,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { } } - Ok(DecolorizeProcessorBuilder { + Ok(DecolorizeProcessor { fields, ignore_missing, }) @@ -145,23 +102,23 @@ impl crate::etl::processor::Processor for DecolorizeProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -176,7 +133,7 @@ mod tests { #[test] fn test_decolorize_processor() { let processor = DecolorizeProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, }; diff --git a/src/pipeline/src/etl/processor/digest.rs b/src/pipeline/src/etl/processor/digest.rs index 29054365ad03..64bb2a2f6d8a 100644 --- a/src/pipeline/src/etl/processor/digest.rs +++ b/src/pipeline/src/etl/processor/digest.rs @@ -21,17 +21,16 @@ use std::borrow::Cow; -use ahash::HashSet; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; use crate::etl_error::DigestPatternInvalidSnafu; @@ -88,54 +87,10 @@ impl PresetPattern { } } -#[derive(Debug, Default)] -pub struct DigestProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, -} - -impl ProcessorBuilder for DigestProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Digest) - } -} - -impl DigestProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = Vec::with_capacity(self.fields.len()); - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "digest", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DigestProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - patterns: self.patterns, - }) - } -} - /// Computes a digest (hash) of the input string. 
#[derive(Debug, Default)] pub struct DigestProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, patterns: Vec, } @@ -169,7 +124,7 @@ impl DigestProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,10 +181,10 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { } for field in fields.iter_mut() { - field.target_field = Some(format!("{}_digest", field.input_field())); + field.set_target_field(Some(format!("{}_digest", field.input_field()))); } - Ok(DigestProcessorBuilder { + Ok(DigestProcessor { fields, patterns, ignore_missing, @@ -246,23 +201,23 @@ impl crate::etl::processor::Processor for DigestProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -278,7 +233,7 @@ mod tests { #[test] fn test_digest_processor_ip() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Ip.regex()], }; @@ -306,7 +261,7 @@ mod tests { #[test] fn test_digest_processor_uuid() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Uuid.regex()], }; @@ -339,7 +294,7 @@ mod tests { #[test] fn test_digest_processor_brackets() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Bracketed.regex()], }; @@ -389,7 +344,7 @@ mod tests { #[test] fn test_digest_processor_quotes() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Quoted.regex()], }; @@ -409,7 +364,7 @@ mod tests { #[test] fn test_digest_processor_custom_regex() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![Regex::new(r"\d+").unwrap()], }; diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index a9ccf5e8735e..9ac28f7bf09e 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -18,6 +18,7 @@ use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use itertools::Itertools; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ DissectAppendOrderAlreadySetSnafu, DissectConsecutiveNamesSnafu, DissectEmptyPatternSnafu, DissectEndModifierAlreadySetSnafu, DissectInvalidPatternSnafu, DissectModifierAlreadySetSnafu, @@ -25,12 +26,10 @@ use crate::etl::error::{ DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; 
-use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, - Processor, ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, - PATTERNS_NAME, PATTERN_NAME, + Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -69,14 +68,7 @@ impl std::fmt::Display for EndModifier { } } -#[derive(Debug, PartialEq, Default)] -struct NameInfo { - name: String, - start_modifier: Option, - end_modifier: Option, -} - -impl NameInfo { +impl Name { fn is_name_empty(&self) -> bool { self.name.is_empty() } @@ -140,26 +132,9 @@ impl NameInfo { } } -impl std::fmt::Display for NameInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name) - } -} - -impl From<&str> for NameInfo { - fn from(value: &str) -> Self { - NameInfo { - name: value.to_string(), - start_modifier: None, - end_modifier: None, - } - } -} - #[derive(Debug, PartialEq, Default)] struct Name { name: String, - index: usize, start_modifier: Option, end_modifier: Option, } @@ -170,57 +145,12 @@ impl std::fmt::Display for Name { } } -impl From for Name { - fn from(value: NameInfo) -> Self { +impl From<&str> for Name { + fn from(value: &str) -> Self { Name { - name: value.name, - index: 0, - start_modifier: value.start_modifier, - end_modifier: value.end_modifier, - } - } -} - -impl Name { - fn is_name_empty(&self) -> bool { - self.name.is_empty() - } - - fn is_empty(&self) -> bool { - self.name.is_empty() && self.start_modifier.is_none() && self.end_modifier.is_none() - } - - fn is_end_modifier_set(&self) -> bool { - self.end_modifier.is_some() - } -} - -#[derive(Debug, PartialEq)] -enum PartInfo { - Split(String), - Name(NameInfo), -} - -impl PartInfo { - fn is_empty(&self) -> bool { - match self { - PartInfo::Split(v) => v.is_empty(), - PartInfo::Name(v) => v.is_empty(), - } - } - - fn empty_split() -> Self { - PartInfo::Split(String::new()) - } - - fn empty_name() -> Self { - PartInfo::Name(NameInfo::default()) - } - - fn push(&mut self, ch: char) { - match self { - PartInfo::Split(v) => v.push(ch), - PartInfo::Name(v) => v.name.push(ch), + name: value.to_string(), + start_modifier: None, + end_modifier: None, } } } @@ -246,13 +176,11 @@ impl Part { fn empty_name() -> Self { Part::Name(Name::default()) } -} -impl From for Part { - fn from(value: PartInfo) -> Self { - match value { - PartInfo::Split(v) => Part::Split(v), - PartInfo::Name(v) => Part::Name(v.into()), + fn push(&mut self, ch: char) { + match self { + Part::Split(v) => v.push(ch), + Part::Name(v) => v.name.push(ch), } } } @@ -271,42 +199,12 @@ impl Deref for Pattern { } } -impl From for Pattern { - fn from(value: PatternInfo) -> Self { - let parts = value.parts.into_iter().map(|x| x.into()).collect(); - Pattern { - origin: value.origin, - parts, - } - } -} - -#[derive(Debug, Default)] -struct PatternInfo { - origin: String, - parts: Vec, -} - -impl std::ops::Deref for PatternInfo { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.parts - } -} - -impl std::ops::DerefMut for PatternInfo { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.parts - } -} - -impl std::str::FromStr for PatternInfo { +impl std::str::FromStr for Pattern { type Err = Error; fn from_str(s: &str) -> Result { let mut parts = vec![]; - let mut cursor = PartInfo::empty_split(); + let mut cursor = Part::empty_split(); let origin = 
s.to_string(); let chars: Vec = origin.chars().collect(); @@ -316,27 +214,27 @@ impl std::str::FromStr for PatternInfo { let ch = chars[pos]; match (ch, &mut cursor) { // if cursor is Split part, and found %{, then ready to start a Name part - ('%', PartInfo::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { + ('%', Part::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { if !cursor.is_empty() { parts.push(cursor); } - cursor = PartInfo::empty_name(); + cursor = Part::empty_name(); pos += 1; // skip '{' } // if cursor is Split part, and not found % or {, then continue the Split part - (_, PartInfo::Split(_)) => { + (_, Part::Split(_)) => { cursor.push(ch); } // if cursor is Name part, and found }, then end the Name part, start the next Split part - ('}', PartInfo::Name(_)) => { + ('}', Part::Name(_)) => { parts.push(cursor); - cursor = PartInfo::empty_split(); + cursor = Part::empty_split(); } - ('+', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('+', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::Append(None))?; } - ('/', PartInfo::Name(name)) if name.is_append_modifier_set() => { + ('/', Part::Name(name)) if name.is_append_modifier_set() => { let mut order = 0; let mut j = pos + 1; while j < chars.len() { @@ -360,16 +258,16 @@ impl std::str::FromStr for PatternInfo { name.try_append_order(order)?; pos = j - 1; // this will change the position to the last digit of the order } - ('?', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('?', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::NamedSkip)?; } - ('*', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('*', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapKey)?; } - ('&', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('&', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapVal)?; } - ('-', PartInfo::Name(name)) if !name.is_end_modifier_set() => { + ('-', Part::Name(name)) if !name.is_end_modifier_set() => { if let Some('>') = chars.get(pos + 1) { } else { return DissectInvalidPatternSnafu { @@ -391,7 +289,7 @@ impl std::str::FromStr for PatternInfo { name.try_end_modifier()?; pos += 1; // only skip '>', the next loop will skip '}' } - (_, PartInfo::Name(name)) if !is_valid_char(ch) => { + (_, Part::Name(name)) if !is_valid_char(ch) => { let tail: String = if name.is_name_empty() { format!("Invalid '{ch}'") } else { @@ -399,7 +297,7 @@ impl std::str::FromStr for PatternInfo { }; return DissectInvalidPatternSnafu { s, detail: tail }.fail(); } - (_, PartInfo::Name(_)) => { + (_, Part::Name(_)) => { cursor.push(ch); } } @@ -408,8 +306,8 @@ impl std::str::FromStr for PatternInfo { } match cursor { - PartInfo::Split(ref split) if !split.is_empty() => parts.push(cursor), - PartInfo::Name(name) if !name.is_empty() => { + Part::Split(ref split) if !split.is_empty() => parts.push(cursor), + Part::Name(name) if !name.is_empty() => { return DissectInvalidPatternSnafu { s, detail: format!("'{name}' is not closed"), @@ -425,7 +323,7 @@ impl std::str::FromStr for PatternInfo { } } -impl PatternInfo { +impl Pattern { fn check(&self) -> Result<()> { if self.len() == 0 { return DissectEmptyPatternSnafu.fail(); @@ -438,21 +336,21 @@ impl PatternInfo { let this_part = &self[i]; let next_part = self.get(i + 1); match (this_part, next_part) { - (PartInfo::Split(split), _) if split.is_empty() => { 
+ (Part::Split(split), _) if split.is_empty() => { return DissectInvalidPatternSnafu { s: &self.origin, detail: "Empty split is not allowed", } .fail(); } - (PartInfo::Name(name1), Some(PartInfo::Name(name2))) => { + (Part::Name(name1), Some(Part::Name(name2))) => { return DissectInvalidPatternSnafu { s: &self.origin, detail: format!("consecutive names are not allowed: '{name1}' '{name2}'",), } .fail(); } - (PartInfo::Name(name), _) if name.is_name_empty() => { + (Part::Name(name), _) if name.is_name_empty() => { if let Some(ref m) = name.start_modifier { return DissectInvalidPatternSnafu { s: &self.origin, @@ -461,7 +359,7 @@ impl PatternInfo { .fail(); } } - (PartInfo::Name(name), _) => match name.start_modifier { + (Part::Name(name), _) => match name.start_modifier { Some(StartModifier::MapKey) => { if map_keys.contains(&name.name) { return DissectInvalidPatternSnafu { @@ -509,128 +407,9 @@ impl PatternInfo { } } -impl std::fmt::Display for PatternInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.origin) - } -} - -#[derive(Debug, Default)] -pub struct DissectProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - append_separator: Option, - output_keys: HashSet, -} - -impl DissectProcessorBuilder { - fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { - patterns - .iter() - .flat_map(|pattern| pattern.iter()) - .filter_map(|p| match p { - PartInfo::Name(name) => { - if !name.is_empty() - && (name.start_modifier.is_none() - || name - .start_modifier - .as_ref() - .is_some_and(|x| matches!(x, StartModifier::Append(_)))) - { - Some(name.to_string()) - } else { - None - } - } - _ => None, - }) - .collect() - } - - fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { - match part_info { - PartInfo::Split(s) => Ok(Part::Split(s)), - PartInfo::Name(n) => match n.start_modifier { - None | Some(StartModifier::Append(_)) => { - let index = find_key_index(intermediate_keys, &n.name, "dissect")?; - Ok(Part::Name(Name { - name: n.name, - index, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })) - } - _ => Ok(Part::Name(Name { - name: n.name, - index: usize::MAX, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })), - }, - } - } - - fn pattern_info_to_pattern( - pattern_info: PatternInfo, - intermediate_keys: &[String], - ) -> Result { - let original = pattern_info.origin; - let pattern = pattern_info - .parts - .into_iter() - .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) - .collect::>>()?; - Ok(Pattern { - origin: original, - parts: pattern, - }) - } - - fn build_patterns_from_pattern_infos( - patterns: Vec, - intermediate_keys: &[String], - ) -> Result> { - patterns - .into_iter() - .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) - .collect() - } -} - -impl ProcessorBuilder for DissectProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; - - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - - let real_field = OneInputMultiOutputField::new(input_field_info, 
field.target_field); - real_fields.push(real_field); - } - let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; - let processor = DissectProcessor { - fields: real_fields, - patterns, - ignore_missing: self.ignore_missing, - append_separator: self.append_separator, - }; - Ok(ProcessorKind::Dissect(processor)) - } -} - #[derive(Debug, Default)] pub struct DissectProcessor { - fields: Vec, + fields: Fields, patterns: Vec, ignore_missing: bool, @@ -639,33 +418,37 @@ pub struct DissectProcessor { } impl DissectProcessor { - fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { + fn process_name_value<'a>( + name: &'a Name, + value: String, + appends: &mut HashMap<&'a String, Vec<(String, u32)>>, + map: &mut Vec<(&'a String, Value)>, + ) { + match name.start_modifier { + Some(StartModifier::NamedSkip) => { + // do nothing, ignore this match + } + Some(StartModifier::Append(order)) => { + appends + .entry(&name.name) + .or_default() + .push((value, order.unwrap_or_default())); + } + Some(_) => { + // do nothing, ignore MapKey and MapVal + // because transform can know the key name + } + None => { + map.push((&name.name, Value::String(value))); + } + } + } + + fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { let mut map = Vec::new(); let mut pos = 0; - let mut appends: HashMap> = HashMap::new(); - - let mut process_name_value = |name: &Name, value: String| { - let name_index = name.index; - match name.start_modifier { - Some(StartModifier::NamedSkip) => { - // do nothing, ignore this match - } - Some(StartModifier::Append(order)) => { - appends - .entry(name_index) - .or_default() - .push((value, order.unwrap_or_default())); - } - Some(_) => { - // do nothing, ignore MapKey and MapVal - // because transform can know the key name - } - None => { - map.push((name_index, Value::String(value))); - } - } - }; + let mut appends: HashMap<&String, Vec<(String, u32)>> = HashMap::new(); for i in 0..pattern.len() { let this_part = &pattern[i]; @@ -701,7 +484,7 @@ impl DissectProcessor { // if Name part is the last part, then the rest of the input is the value (Part::Name(name), None) => { let value = chs[pos..].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } // if Name part, and next part is Split, then find the matched value of the name @@ -717,7 +500,7 @@ impl DissectProcessor { if !name.is_name_empty() { let value = chs[pos..end].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } if name.is_end_modifier_set() { @@ -745,10 +528,10 @@ impl DissectProcessor { } } - Ok(map) + Ok(map.into_iter().map(|(k, v)| (k.to_string(), v)).collect()) } - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let chs = val.chars().collect::>(); for pattern in &self.patterns { @@ -760,7 +543,7 @@ impl DissectProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -782,7 +565,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { fields = yaml_new_fields(v, FIELDS_NAME)?; } PATTERN_NAME => { - let pattern: PatternInfo = yaml_parse_string(v, PATTERN_NAME)?; + let pattern: Pattern = yaml_parse_string(v, PATTERN_NAME)?; patterns = vec![pattern]; } PATTERNS_NAME => { @@ -797,13 +580,12 @@ impl 
TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { _ => {} } } - let output_keys = Self::build_output_keys(&patterns); - let builder = DissectProcessorBuilder { + // let output_keys = Self::build_output_keys(&patterns); + let builder = DissectProcessor { fields, patterns, ignore_missing, append_separator, - output_keys, }; Ok(builder) @@ -819,21 +601,21 @@ impl Processor for DissectProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(val_str)) => { let r = self.process(val_str)?; for (k, v) in r { - val[k] = v; + val.insert(k, v); } } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -859,26 +641,19 @@ fn is_valid_char(ch: char) -> bool { mod tests { use ahash::HashMap; - use super::{DissectProcessor, EndModifier, NameInfo, PartInfo, PatternInfo, StartModifier}; - use crate::etl::processor::dissect::DissectProcessorBuilder; + use super::{DissectProcessor, EndModifier, Name, Part, StartModifier}; + use crate::etl::processor::dissect::Pattern; use crate::etl::value::Value; fn assert(pattern_str: &str, input: &str, expected: HashMap) { let chs = input.chars().collect::>(); - let pattern_infos: Vec = vec![pattern_str.parse().unwrap()]; - let output_keys: Vec = DissectProcessorBuilder::build_output_keys(&pattern_infos) - .into_iter() - .collect(); - let pattern = - DissectProcessorBuilder::build_patterns_from_pattern_infos(pattern_infos, &output_keys) - .unwrap(); + let patterns: Vec = vec![pattern_str.parse().unwrap()]; let processor = DissectProcessor::default(); let result: HashMap = processor - .process_pattern(&chs, &pattern[0]) + .process_pattern(&chs, &patterns[0]) .unwrap() .into_iter() - .map(|(k, v)| (output_keys[k].to_string(), v)) .collect(); assert_eq!(result, expected, "pattern: {}", pattern_str); @@ -889,28 +664,28 @@ mod tests { let cases = [( "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}", vec![ - PartInfo::Name("clientip".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("ident".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("auth".into()), - PartInfo::Split(" [".into()), - PartInfo::Name("timestamp".into()), - PartInfo::Split("] \"".into()), - PartInfo::Name("verb".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("request".into()), - PartInfo::Split(" HTTP/".into()), - PartInfo::Name("httpversion".into()), - PartInfo::Split("\" ".into()), - PartInfo::Name("status".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("size".into()), + Part::Name("clientip".into()), + Part::Split(" ".into()), + Part::Name("ident".into()), + Part::Split(" ".into()), + Part::Name("auth".into()), + Part::Split(" [".into()), + Part::Name("timestamp".into()), + Part::Split("] \"".into()), + Part::Name("verb".into()), + Part::Split(" ".into()), + Part::Name("request".into()), + Part::Split(" HTTP/".into()), + Part::Name("httpversion".into()), + Part::Split("\" ".into()), + Part::Name("status".into()), + Part::Split(" ".into()), + Part::Name("size".into()), ], )]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, 
expected); } } @@ -921,13 +696,13 @@ mod tests { ( "%{} %{}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, @@ -937,61 +712,61 @@ mod tests { ( "%{ts->} %{level}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split(" ".into()), - PartInfo::Name("level".into()), + Part::Split(" ".into()), + Part::Name("level".into()), ], ), ( "[%{ts}]%{->}[%{level}]", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), - PartInfo::Name(NameInfo { + Part::Split("]".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "%{+name} %{+name} %{+name} %{+name}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, @@ -1001,25 +776,25 @@ mod tests { ( "%{+name/2} %{+name/4} %{+name/3} %{+name/1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(2))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(4))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(3))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(1))), end_modifier: None, @@ -1029,67 +804,67 @@ mod tests { ( "%{clientip} %{?ident} %{?auth} [%{timestamp}]", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "clientip".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "ident".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "auth".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - 
PartInfo::Split(" [".into()), - PartInfo::Name(NameInfo { + Part::Split(" [".into()), + Part::Name(Name { name: "timestamp".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] [".into()), - PartInfo::Name(NameInfo { + Part::Split("] [".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] ".into()), - PartInfo::Name(NameInfo { + Part::Split("] ".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, @@ -1099,13 +874,13 @@ mod tests { ( "%{&p1}:%{*p1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, @@ -1115,7 +890,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -1195,7 +970,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let err = pattern.parse::().unwrap_err(); + let err = pattern.parse::().unwrap_err(); assert_eq!(err.to_string(), expected); } } diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index f2c03fd120de..29ad6bd3d97d 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use ahash::HashSet; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ EpochInvalidResolutionSnafu, Error, FailedToParseIntSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, @@ -57,56 +57,12 @@ impl TryFrom<&str> for Resolution { } } -#[derive(Debug, Default)] -pub struct EpochProcessorBuilder { - fields: Fields, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for EpochProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Epoch) - } -} - -impl EpochProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "epoch", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(EpochProcessor { - fields: real_fields, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch /// deprecated it should be removed in the future /// Reserved for compatibility only #[derive(Debug, Default)] pub struct EpochProcessor { - fields: Vec, + fields: Fields, resolution: Resolution, ignore_missing: bool, // description @@ -157,7 +113,7 @@ impl EpochProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -188,7 +144,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { _ => {} } } - let builder = EpochProcessorBuilder { + let builder = EpochProcessor { fields, resolution, ignore_missing, @@ -207,23 +163,23 @@ impl Processor for EpochProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let timestamp = self.parse(v)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), Value::Timestamp(timestamp)); } } } diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index 54c8306ec4de..7f0f601f44f3 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ 
b/src/pipeline/src/etl/processor/gsub.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ Error, GsubPatternRequiredSnafu, GsubReplacementRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -31,94 +31,18 @@ pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; -#[derive(Debug, Default)] -pub struct GsubProcessorBuilder { - fields: Fields, - pattern: Option, - replacement: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for GsubProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Gsub) - } -} - -impl GsubProcessorBuilder { - fn check(self) -> Result { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "gsub", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(GsubProcessor { - fields: real_fields, - pattern: self.pattern, - replacement: self.replacement, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value -#[derive(Debug, Default)] +#[derive(Debug)] pub struct GsubProcessor { - fields: Vec, - pattern: Option, - replacement: Option, + fields: Fields, + pattern: Regex, + replacement: String, ignore_missing: bool, } impl GsubProcessor { - fn check(self) -> Result { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - fn process_string(&self, val: &str) -> Result { - let replacement = self.replacement.as_ref().unwrap(); - let new_val = self - .pattern - .as_ref() - .unwrap() - .replace_all(val, replacement) - .to_string(); + let new_val = self.pattern.replace_all(val, &self.replacement).to_string(); let val = Value::String(new_val); Ok(val) @@ -136,7 +60,7 @@ impl GsubProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -176,14 +100,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { } } - let builder = GsubProcessorBuilder { + Ok(GsubProcessor { fields, - 
pattern, - replacement, + pattern: pattern.context(GsubPatternRequiredSnafu)?, + replacement: replacement.context(GsubReplacementRequiredSnafu)?, ignore_missing, - }; - - builder.check() + }) } } @@ -196,23 +118,23 @@ impl crate::etl::processor::Processor for GsubProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -222,15 +144,17 @@ impl crate::etl::processor::Processor for GsubProcessor { #[cfg(test)] mod tests { + use super::*; use crate::etl::processor::gsub::GsubProcessor; use crate::etl::value::Value; #[test] fn test_string_value() { let processor = GsubProcessor { - pattern: Some(regex::Regex::new(r"\d+").unwrap()), - replacement: Some("xxx".to_string()), - ..Default::default() + fields: Fields::default(), + pattern: regex::Regex::new(r"\d+").unwrap(), + replacement: "xxx".to_string(), + ignore_missing: false, }; let val = Value::String("123".to_string()); diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index ddbc086ab8da..72fafdbf7dd1 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -12,105 +12,43 @@ // See the License for the specific language governing permissions and // limitations under the License. 
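The gsub and join hunks also fold the old builder `check()` step into construction: required options are unwrapped with snafu's `OptionExt::context` while the processor is built, so the struct only stores non-optional fields. A self-contained sketch of the same idea follows; it uses plain `Option::ok_or_else` and a `String` error in place of the crate's Snafu error types, and `JoinLike` is a hypothetical stand-in rather than the real `JoinProcessor`.

// Validate required configuration once, at construction time, and store it
// as a plain field; callers never see a half-configured processor.
#[derive(Debug)]
struct JoinLike {
    separator: String,
    ignore_missing: bool,
}

impl JoinLike {
    fn from_config(separator: Option<String>, ignore_missing: bool) -> Result<Self, String> {
        Ok(JoinLike {
            // replaces the old JoinProcessorBuilder::check()
            separator: separator.ok_or_else(|| "join: separator is required".to_string())?,
            ignore_missing,
        })
    }
}

fn main() {
    assert!(JoinLike::from_config(None, false).is_err());
    let p = JoinLike::from_config(Some("-".to_string()), true).unwrap();
    assert_eq!(p.separator, "-");
    assert!(p.ignore_missing);
}

The gsub change has the same shape: `pattern` and `replacement` become a `Regex` and a `String` instead of `Option`s, so `process_string` no longer needs `unwrap()`.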
-use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, JoinSeparatorRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, SEPARATOR_NAME, }; use crate::etl::value::{Array, Value}; pub(crate) const PROCESSOR_JOIN: &str = "join"; -#[derive(Debug, Default)] -pub struct JoinProcessorBuilder { - fields: Fields, - separator: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for JoinProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Join) - } -} - -impl JoinProcessorBuilder { - fn check(self) -> Result { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } - - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "join", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JoinProcessor { - fields: real_fields, - separator: self.separator, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to join each element of an array into a single string using a separator string between each element #[derive(Debug, Default)] pub struct JoinProcessor { - fields: Vec, - separator: Option, + fields: Fields, + separator: String, ignore_missing: bool, } impl JoinProcessor { fn process(&self, arr: &Array) -> Result { - let sep = self.separator.as_ref().unwrap(); let val = arr .iter() .map(|v| v.to_str_value()) .collect::>() - .join(sep); + .join(&self.separator); Ok(Value::String(val)) } - - fn check(self) -> Result { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } } -impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -140,12 +78,11 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { } } - let builder = JoinProcessorBuilder { + Ok(JoinProcessor { fields, - separator, + separator: separator.context(JoinSeparatorRequiredSnafu)?, ignore_missing, - }; - builder.check() + }) } } @@ -158,20 +95,20 @@ impl Processor for JoinProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Array(arr)) => { let result = self.process(arr)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | 
None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -199,7 +136,7 @@ mod tests { #[test] fn test_join_processor() { let processor = JoinProcessor { - separator: Some("-".to_string()), + separator: "-".to_string(), ..Default::default() }; diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index c09d338c637f..92916263e4e9 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use jsonpath_rust::JsonPath; use snafu::{OptionExt, ResultExt}; use super::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, IntermediateStatus, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, JSON_PATH_NAME, JSON_PATH_RESULT_INDEX_NAME, }; use crate::etl::error::{Error, Result}; -use crate::etl::field::{Fields, OneInputOneOutputField}; -use crate::etl::processor::ProcessorKind; +use crate::etl::field::Fields; use crate::etl_error::{ JsonPathParseResultIndexSnafu, JsonPathParseSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, @@ -31,54 +29,7 @@ use crate::Value; pub(crate) const PROCESSOR_JSON_PATH: &str = "json_path"; -#[derive(Debug)] -pub struct JsonPathProcessorBuilder { - fields: Fields, - json_path: JsonPath, - ignore_missing: bool, - result_idex: Option, -} - -impl JsonPathProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - JSON_PATH_NAME, - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JsonPathProcessor { - fields: real_fields, - json_path: self.json_path, - ignore_missing: self.ignore_missing, - result_idex: self.result_idex, - }) - } -} - -impl ProcessorBuilder for JsonPathProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::JsonPath) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> std::result::Result { @@ -116,40 +67,36 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { _ => {} } } - if let Some(json_path) = json_path { - let processor = JsonPathProcessorBuilder { - fields, - json_path, - ignore_missing, - result_idex, - }; - - Ok(processor) - } else { - ProcessorMissingFieldSnafu { + + let processor = JsonPathProcessor { + fields, + json_path: json_path.context(ProcessorMissingFieldSnafu { processor: PROCESSOR_JSON_PATH, field: JSON_PATH_NAME, - } - .fail() - } + })?, + ignore_missing, + result_index: result_idex, + }; + + Ok(processor) } } #[derive(Debug)] pub struct JsonPathProcessor { - fields: Vec, + fields: Fields, json_path: JsonPath, ignore_missing: bool, - result_idex: Option, + result_index: Option, } impl Default for JsonPathProcessor { 
fn default() -> Self { JsonPathProcessor { - fields: vec![], + fields: Fields::default(), json_path: JsonPath::try_from("$").unwrap(), ignore_missing: false, - result_idex: None, + result_index: None, } } } @@ -159,7 +106,7 @@ impl JsonPathProcessor { let processed = self.json_path.find(val); match processed { Value::Array(arr) => { - if let Some(index) = self.result_idex { + if let Some(index) = self.result_index { Ok(arr.get(index).cloned().unwrap_or(Value::Null)) } else { Ok(Value::Array(arr)) @@ -179,21 +126,20 @@ impl Processor for JsonPathProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(v) => { let processed = self.process_field(v)?; - - let output_index = field.output_index(); - val[output_index] = processed; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), processed); } None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -216,7 +162,7 @@ mod test { let json_path = JsonPath::try_from("$.hello").unwrap(); let processor = JsonPathProcessor { json_path, - result_idex: Some(0), + result_index: Some(0), ..Default::default() }; diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 8eb939918104..960521853e48 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
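Across these processors the output key is now resolved by `Field::target_or_input_field()`: when no explicit target is configured, the processor writes back over its own input key. A small sketch of that resolution, using a simplified `Field` stand-in for the crate type:

// Simplified stand-in for the crate's Field type, showing only the
// target-key resolution the processors above rely on.
struct Field {
    input_field: String,
    target_field: Option<String>,
}

impl Field {
    fn target_or_input_field(&self) -> &str {
        self.target_field.as_deref().unwrap_or(&self.input_field)
    }
}

fn main() {
    let in_place = Field {
        input_field: "status".to_string(),
        target_field: None,
    };
    let renamed = Field {
        input_field: "status".to_string(),
        target_field: Some("status_code".to_string()),
    };
    // without a target, the result overwrites the input key
    assert_eq!(in_place.target_or_input_field(), "status");
    // with a target, the input key stays and a new key is written
    assert_eq!(renamed.target_or_input_field(), "status_code");
}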
-use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, LetterInvalidMethodSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -59,55 +59,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct LetterProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for LetterProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Letter) - } -} - -impl LetterProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "letter", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(LetterProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct LetterProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -125,7 +80,7 @@ impl LetterProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -154,7 +109,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { } } - Ok(LetterProcessorBuilder { + Ok(LetterProcessor { fields, method, ignore_missing, @@ -171,20 +126,20 @@ impl Processor for LetterProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let (_, output_index) = field.output(); - val[*output_index] = result; + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index de25195f99ab..27f30f65d9ae 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,21 +18,22 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; -use ahash::{HashSet, HashSetExt}; +use std::collections::BTreeMap; + use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use 
super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -83,113 +84,7 @@ impl std::str::FromStr for GroupRegex { } } -#[derive(Debug, Default)] -pub struct RegexProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - output_keys: HashSet, -} - -impl ProcessorBuilder for RegexProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|k| k.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Regex) - } -} - -impl RegexProcessorBuilder { - fn check(self) -> Result { - if self.fields.is_empty() { - return RegexNoValidFieldSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - if self.patterns.is_empty() { - return RegexNoValidPatternSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - Ok(self) - } - - fn build_group_output_info( - group_regex: &GroupRegex, - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result> { - group_regex - .groups - .iter() - .map(|g| { - let key = generate_key(om_field.target_prefix(), g); - let index = find_key_index(intermediate_keys, &key, "regex"); - index.map(|index| OutPutInfo { - final_key: key, - group_name: g.to_string(), - index, - }) - }) - .collect::>>() - } - - fn build_group_output_infos( - patterns: &[GroupRegex], - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result>> { - patterns - .iter() - .map(|group_regex| { - Self::build_group_output_info(group_regex, om_field, intermediate_keys) - }) - .collect::>>() - } - - fn build_output_info( - real_fields: &[OneInputMultiOutputField], - patterns: &[GroupRegex], - intermediate_keys: &[String], - ) -> Result { - let inner = real_fields - .iter() - .map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys)) - .collect::>>(); - inner.map(|inner| RegexProcessorOutputInfo { inner }) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - - let input = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(input); - } - let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?; - Ok(RegexProcessor { - // fields: Fields::one(Field::new("test".to_string())), - fields: real_fields, - patterns: self.patterns, - output_info, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { +impl 
TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,61 +121,44 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { } } - let pattern_output_keys = patterns - .iter() - .flat_map(|pattern| pattern.groups.iter()) - .collect::>(); - let mut output_keys = HashSet::new(); - for field in fields.iter() { - for x in pattern_output_keys.iter() { - output_keys.insert(generate_key(field.target_or_input_field(), x)); - } - } - - let processor_builder = RegexProcessorBuilder { + let processor_builder = RegexProcessor { fields, patterns, ignore_missing, - output_keys, }; processor_builder.check() } } -#[derive(Debug, Default)] -struct OutPutInfo { - final_key: String, - group_name: String, - index: usize, -} - -#[derive(Debug, Default)] -struct RegexProcessorOutputInfo { - pub inner: Vec>>, -} - -impl RegexProcessorOutputInfo { - fn get_output_index( - &self, - field_index: usize, - pattern_index: usize, - group_index: usize, - ) -> usize { - self.inner[field_index][pattern_index][group_index].index - } -} /// only support string value /// if no value found from a pattern, the target_field will be ignored #[derive(Debug, Default)] pub struct RegexProcessor { - fields: Vec, - output_info: RegexProcessorOutputInfo, + fields: Fields, patterns: Vec, ignore_missing: bool, } impl RegexProcessor { + fn check(self) -> Result { + if self.fields.is_empty() { + return RegexNoValidFieldSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + if self.patterns.is_empty() { + return RegexNoValidPatternSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + Ok(self) + } + fn try_with_patterns(&mut self, patterns: Vec) -> Result<()> { let mut rs = vec![]; for pattern in patterns { @@ -291,21 +169,15 @@ impl RegexProcessor { Ok(()) } - fn process( - &self, - val: &str, - gr: &GroupRegex, - index: (usize, usize), - ) -> Result> { - let mut result = Vec::new(); - if let Some(captures) = gr.regex.captures(val) { - for (group_index, group) in gr.groups.iter().enumerate() { - if let Some(capture) = captures.name(group) { - let value = capture.as_str().to_string(); - let index = self - .output_info - .get_output_index(index.0, index.1, group_index); - result.push((index, Value::String(value))); + fn process(&self, prefix: &str, val: &str) -> Result> { + let mut result = BTreeMap::new(); + for gr in self.patterns.iter() { + if let Some(captures) = gr.regex.captures(val) { + for group in gr.groups.iter() { + if let Some(capture) = captures.name(group) { + let value = capture.as_str().to_string(); + result.insert(generate_key(prefix, group), Value::String(value)); + } } } } @@ -322,39 +194,20 @@ impl Processor for RegexProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let index = field.input_index(); - let mut result_list = None; + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { + for field in self.fields.iter() { + let index = field.input_field(); + let prefix = field.target_or_input_field(); match val.get(index) { Some(Value::String(s)) => { - // we get rust borrow checker error here - // for (gr_index, gr) in self.patterns.iter().enumerate() { - // let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?; - // for (output_index, result) in result_list { - //cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here - // val[output_index] = result; 
- // } - // } - for (gr_index, gr) in self.patterns.iter().enumerate() { - let result = self.process(s.as_str(), gr, (field_index, gr_index))?; - if !result.is_empty() { - match result_list.as_mut() { - None => { - result_list = Some(result); - } - Some(result_list) => { - result_list.extend(result); - } - } - } - } + let result = self.process(prefix, s)?; + val.extend(result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -367,15 +220,6 @@ impl Processor for RegexProcessor { .fail(); } } - // safety here - match result_list { - None => {} - Some(result_list) => { - for (output_index, result) in result_list { - val[output_index] = result; - } - } - } } Ok(()) @@ -388,7 +232,7 @@ mod tests { use ahash::{HashMap, HashMapExt}; use itertools::Itertools; - use crate::etl::processor::regex::RegexProcessorBuilder; + use crate::etl::processor::regex::RegexProcessor; use crate::etl::value::{Map, Value}; #[test] @@ -402,18 +246,11 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = ["a".to_string(), "a_ar".to_string()]; - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); // single field (with prefix), multiple patterns - let result = processor - .process("123", &processor.patterns[0], (0, 0)) - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect(); + let result = processor.process("a", "123").unwrap(); let map = Map { values: result }; @@ -435,7 +272,7 @@ ignore_missing: false"#; let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(","); - let values = [ + let temporary_map: BTreeMap = [ ("breadcrumbs_parent", Value::String(cc.to_string())), ("breadcrumbs_edge", Value::String(cg.to_string())), ("breadcrumbs_origin", Value::String(co.to_string())), @@ -445,7 +282,6 @@ ignore_missing: false"#; .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect(); - let temporary_map = Map { values }; { // single field (with prefix), multiple patterns @@ -464,31 +300,11 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = [ - "breadcrumbs", - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); - let mut result = BTreeMap::new(); - for (index, pattern) in processor.patterns.iter().enumerate() { - let r = processor - .process(&breadcrumbs_str, pattern, (0, index)) - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); - result.extend(r); - } - let map = Map { values: result }; - assert_eq!(temporary_map, map); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); + + let result = processor.process("breadcrumbs", &breadcrumbs_str).unwrap(); + + assert_eq!(temporary_map, result); } { @@ -515,70 +331,19 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = 
RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - - let intermediate_keys = [ - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - "edge_ip", - "edge_request_id", - "edge_request_end_time", - "edge_turn_around_time", - "edge_dns_lookup_time", - "edge_geo", - "edge_asn", - "origin_ip", - "origin_request_id", - "origin_request_end_time", - "origin_turn_around_time", - "origin_dns_lookup_time", - "origin_geo", - "origin_asn", - "peer_ip", - "peer_request_id", - "peer_request_end_time", - "peer_turn_around_time", - "peer_dns_lookup_time", - "peer_geo", - "peer_asn", - "parent_ip", - "parent_request_id", - "parent_request_end_time", - "parent_turn_around_time", - "parent_dns_lookup_time", - "parent_geo", - "parent_asn", - "wrapper_ip", - "wrapper_request_id", - "wrapper_request_end_time", - "wrapper_turn_around_time", - "wrapper_dns_lookup_time", - "wrapper_geo", - "wrapper_asn", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); let mut result = HashMap::new(); - for (field_index, field) in processor.fields.iter().enumerate() { - for (pattern_index, pattern) in processor.patterns.iter().enumerate() { - let s = temporary_map - .get(field.input_name()) - .unwrap() - .to_str_value(); - let r = processor - .process(&s, pattern, (field_index, pattern_index)) - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); - result.extend(r); - } + for field in processor.fields.iter() { + let s = temporary_map + .get(field.input_field()) + .unwrap() + .to_str_value(); + let prefix = field.target_or_input_field(); + + let r = processor.process(prefix, &s).unwrap(); + + result.extend(r); } let new_values = vec![ diff --git a/src/pipeline/src/etl/processor/timestamp.rs b/src/pipeline/src/etl/processor/timestamp.rs index 18b6711c1d80..bf90e78f2165 100644 --- a/src/pipeline/src/etl/processor/timestamp.rs +++ b/src/pipeline/src/etl/processor/timestamp.rs @@ -14,22 +14,22 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateInvalidFormatSnafu, DateParseSnafu, DateParseTimezoneSnafu, EpochInvalidResolutionSnafu, Error, KeyMustBeStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, @@ -114,56 +114,10 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug)] -pub struct TimestampProcessorBuilder { - fields: Fields, - formats: Formats, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for TimestampProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| 
f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Timestamp) - } -} - -impl TimestampProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "timestamp", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(TimestampProcessor { - fields: real_fields, - formats: self.formats, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch #[derive(Debug, Default)] pub struct TimestampProcessor { - fields: Vec, + fields: Fields, formats: Formats, resolution: Resolution, ignore_missing: bool, @@ -289,7 +243,7 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result, Tz)>> } } -impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -324,7 +278,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { } } - let processor_builder = TimestampProcessorBuilder { + let processor_builder = TimestampProcessor { fields, formats, resolution, @@ -344,23 +298,23 @@ impl Processor for TimestampProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input().index; + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.parse(v)?; - let (_, index) = field.output(); - val[*index] = Value::Timestamp(result); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(result)); } } } @@ -372,18 +326,9 @@ impl Processor for TimestampProcessor { mod tests { use yaml_rust::YamlLoader; - use super::{TimestampProcessor, TimestampProcessorBuilder}; + use super::TimestampProcessor; use crate::etl::value::{Timestamp, Value}; - fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor { - TimestampProcessor { - fields: vec![], - formats: builder.formats, - resolution: builder.resolution, - ignore_missing: builder.ignore_missing, - } - } - #[test] fn test_parse_epoch() { let processor_yaml_str = r#"fields: @@ -397,9 +342,7 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values = [ ( @@ -451,9 +394,7 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values: Vec<&str> = vec![ 
"2014-5-17T12:34:56", diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index ca42aae23677..c14c7d87b11f 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; +use std::collections::BTreeMap; + use snafu::{OptionExt, ResultExt}; use urlencoding::{decode, encode}; @@ -20,10 +21,10 @@ use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, UrlEncodingDecodeSnafu, UrlEncodingInvalidMethodSnafu, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -57,55 +58,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct UrlEncodingProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for UrlEncodingProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys) - .map(ProcessorKind::UrlEncoding) - } -} - -impl UrlEncodingProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "urlencoding", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(UrlEncodingProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct UrlEncodingProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -120,7 +76,7 @@ impl UrlEncodingProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -152,7 +108,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { _ => {} } } - let processor = UrlEncodingProcessorBuilder { + let processor = UrlEncodingProcessor { fields, method, ignore_missing, @@ -171,20 +127,20 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { 
processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -205,6 +161,7 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { #[cfg(test)] mod tests { + use crate::etl::field::Fields; use crate::etl::processor::urlencoding::UrlEncodingProcessor; use crate::etl::value::Value; @@ -220,7 +177,7 @@ mod tests { } { let processor = UrlEncodingProcessor { - fields: vec![], + fields: Fields::default(), method: super::Method::Encode, ignore_missing: false, }; diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index be7fe35e5076..e3039d6c7ac4 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -15,11 +15,9 @@ pub mod index; pub mod transformer; -use snafu::OptionExt; +use std::collections::BTreeMap; use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; -use crate::etl::processor::yaml_string; use crate::etl::transform::index::Index; use crate::etl::value::Value; @@ -30,14 +28,15 @@ const TRANSFORM_INDEX: &str = "index"; const TRANSFORM_DEFAULT: &str = "default"; const TRANSFORM_ON_FAILURE: &str = "on_failure"; +use snafu::OptionExt; pub use transformer::greptime::GreptimeTransformer; use super::error::{ KeyMustBeStringSnafu, TransformElementMustBeMapSnafu, TransformOnFailureInvalidValueSnafu, TransformTypeMustBeSetSnafu, }; -use super::field::{Fields, InputFieldInfo, OneInputOneOutputField}; -use super::processor::{yaml_new_field, yaml_new_fields}; +use super::field::Fields; +use super::processor::{yaml_new_field, yaml_new_fields, yaml_string}; pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { type Output; @@ -47,7 +46,7 @@ pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { fn schemas(&self) -> &Vec; fn transforms(&self) -> &Transforms; fn transforms_mut(&mut self) -> &mut Transforms; - fn transform_mut(&self, val: &mut Vec) -> Result; + fn transform_mut(&self, val: &mut BTreeMap) -> Result; } /// On Failure behavior when transform fails @@ -73,37 +72,12 @@ impl std::str::FromStr for OnFailure { } } -#[derive(Debug, Default, Clone)] -pub struct TransformBuilders { - pub(crate) builders: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, -} - #[derive(Debug, Default, Clone)] pub struct Transforms { pub(crate) transforms: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, } impl Transforms { - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - pub fn output_keys_mut(&mut self) -> &mut Vec { - &mut self.output_keys - } - - pub fn required_keys_mut(&mut self) -> &mut Vec { - &mut self.required_keys - } - - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - pub fn transforms(&self) -> &Vec { &self.transforms } @@ -123,7 +97,7 @@ impl std::ops::DerefMut for Transforms { } } -impl TryFrom<&Vec> for TransformBuilders { +impl TryFrom<&Vec> for Transforms { type Error = Error; fn try_from(docs: &Vec) -> Result { @@ -131,7 +105,7 @@ impl TryFrom<&Vec> for TransformBuilders { let mut all_output_keys: Vec = Vec::with_capacity(100); let mut all_required_keys = Vec::with_capacity(100); for doc in docs { - let transform_builder: TransformBuilder = doc + let transform_builder: Transform = doc .as_hash() .context(TransformElementMustBeMapSnafu)? 
.try_into()?; @@ -154,51 +128,14 @@ impl TryFrom<&Vec> for TransformBuilders { all_required_keys.sort(); - Ok(TransformBuilders { - builders: transforms, - output_keys: all_output_keys, - required_keys: all_required_keys, - }) - } -} - -#[derive(Debug, Clone)] -pub struct TransformBuilder { - fields: Fields, - type_: Value, - default: Option, - index: Option, - on_failure: Option, -} - -impl TransformBuilder { - pub fn build(self, intermediate_keys: &[String], output_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - let output_index = - find_key_index(output_keys, field.target_or_input_field(), "transform")?; - let input = OneInputOneOutputField::new( - input_field_info, - (field.target_or_input_field().to_string(), output_index), - ); - real_fields.push(input); - } - Ok(Transform { - real_fields, - type_: self.type_, - default: self.default, - index: self.index, - on_failure: self.on_failure, - }) + Ok(Transforms { transforms }) } } /// only field is required #[derive(Debug, Clone)] pub struct Transform { - pub real_fields: Vec, + pub fields: Fields, pub type_: Value, @@ -212,7 +149,7 @@ pub struct Transform { impl Default for Transform { fn default() -> Self { Transform { - real_fields: Vec::new(), + fields: Fields::default(), type_: Value::Null, default: None, index: None, @@ -231,7 +168,7 @@ impl Transform { } } -impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for Transform { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -294,7 +231,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder { } } } - let builder = TransformBuilder { + let builder = Transform { fields, type_, default: final_default, diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index dedb07e842d6..749806261a02 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -14,7 +14,7 @@ pub mod coerce; -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; use ahash::HashMap; @@ -25,7 +25,7 @@ use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, Semant use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; -use serde_json::{Map, Number, Value as JsonValue}; +use serde_json::Number; use crate::etl::error::{ IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, @@ -33,14 +33,12 @@ use crate::etl::error::{ TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; -use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; +use crate::etl::field::{Field, Fields}; +use crate::etl::processor::IntermediateStatus; use crate::etl::transform::index::Index; use crate::etl::transform::{Transform, Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; -/// The header key that contains the pipeline params. 
-pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; - const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; @@ -91,30 +89,15 @@ impl GreptimeTransformer { let default = Some(type_.clone()); let transform = Transform { - real_fields: vec![OneInputOneOutputField::new( - InputFieldInfo { - name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - index: usize::MAX, - }, - ( - DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - transforms - .transforms - .iter() - .map(|x| x.real_fields.len()) - .sum(), - ), - )], + fields: Fields::one(Field::new( + DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + None, + )), type_, default, index: Some(Index::Time), on_failure: Some(crate::etl::transform::OnFailure::Default), }; - let required_keys = transforms.required_keys_mut(); - required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - let output_keys = transforms.output_keys_mut(); - output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); transforms.push(transform); } @@ -142,9 +125,9 @@ impl Transformer for GreptimeTransformer { for transform in transforms.iter() { let target_fields_set = transform - .real_fields + .fields .iter() - .map(|f| f.output_name()) + .map(|f| f.target_or_input_field()) .collect::>(); let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect(); @@ -157,16 +140,17 @@ impl Transformer for GreptimeTransformer { if let Some(idx) = transform.index { if idx == Index::Time { - match transform.real_fields.len() { + match transform.fields.len() { //Safety unwrap is fine here because we have checked the length of real_fields - 1 => timestamp_columns - .push(transform.real_fields.first().unwrap().input_name()), + 1 => { + timestamp_columns.push(transform.fields.first().unwrap().input_field()) + } _ => { return TransformMultipleTimestampIndexSnafu { columns: transform - .real_fields + .fields .iter() - .map(|x| x.input_name()) + .map(|x| x.input_field()) .join(", "), } .fail(); @@ -195,12 +179,12 @@ impl Transformer for GreptimeTransformer { } } - fn transform_mut(&self, val: &mut Vec) -> Result { + fn transform_mut(&self, val: &mut IntermediateStatus) -> Result { let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; + let mut output_index = 0; for transform in self.transforms.iter() { - for field in transform.real_fields.iter() { - let index = field.input_index(); - let output_index = field.output_index(); + for field in transform.fields.iter() { + let index = field.input_field(); match val.get(index) { Some(v) => { let value_data = coerce_value(v, transform)?; @@ -216,6 +200,7 @@ impl Transformer for GreptimeTransformer { values[output_index] = GreptimeValue { value_data }; } } + output_index += 1; } } Ok(Row { values }) @@ -326,30 +311,49 @@ fn resolve_number_schema( ) } -fn json_value_to_row( - schema_info: &mut SchemaInfo, - map: Map, -) -> Result { +fn values_to_row(schema_info: &mut SchemaInfo, values: BTreeMap) -> Result { let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); for _ in 0..schema_info.schema.len() { row.push(GreptimeValue { value_data: None }); } - for (column_name, value) in map { + + for (column_name, value) in values.into_iter() { if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { continue; } + let index = schema_info.index.get(&column_name).copied(); + match value { - serde_json::Value::Null => { - // do nothing + Value::Null => {} + + Value::Int8(_) | Value::Int16(_) | 
Value::Int32(_) | Value::Int64(_) => { + // safe unwrap after type matched + let v = value.as_i64().unwrap(); + resolve_schema( + index, + ValueData::I64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Int64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; } - serde_json::Value::String(s) => { + + Value::Uint8(_) | Value::Uint16(_) | Value::Uint32(_) | Value::Uint64(_) => { + // safe unwrap after type matched + let v = value.as_u64().unwrap(); resolve_schema( index, - ValueData::StringValue(s), + ValueData::U64Value(v), ColumnSchema { column_name, - datatype: ColumnDataType::String as i32, + datatype: ColumnDataType::Uint64 as i32, semantic_type: SemanticType::Field as i32, datatype_extension: None, options: None, @@ -358,10 +362,29 @@ fn json_value_to_row( schema_info, )?; } - serde_json::Value::Bool(b) => { + + Value::Float32(_) | Value::Float64(_) => { + // safe unwrap after type matched + let v = value.as_f64().unwrap(); resolve_schema( index, - ValueData::BoolValue(b), + ValueData::F64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Boolean(v) => { + resolve_schema( + index, + ValueData::BoolValue(v), ColumnSchema { column_name, datatype: ColumnDataType::Boolean as i32, @@ -373,13 +396,88 @@ fn json_value_to_row( schema_info, )?; } - serde_json::Value::Number(n) => { - resolve_number_schema(n, column_name, index, &mut row, schema_info)?; + Value::String(v) => { + resolve_schema( + index, + ValueData::StringValue(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Timestamp(Timestamp::Nanosecond(ns)) => { + resolve_schema( + index, + ValueData::TimestampNanosecondValue(ns), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampNanosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Microsecond(us)) => { + resolve_schema( + index, + ValueData::TimestampMicrosecondValue(us), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMicrosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Millisecond(ms)) => { + resolve_schema( + index, + ValueData::TimestampMillisecondValue(ms), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; } - serde_json::Value::Array(_) | serde_json::Value::Object(_) => { + Value::Timestamp(Timestamp::Second(s)) => { resolve_schema( index, - ValueData::BinaryValue(jsonb::Value::from(value).to_vec()), + ValueData::TimestampSecondValue(s), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampSecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Array(_) | Value::Map(_) => { + let data: jsonb::Value = value.into(); + resolve_schema( + index, + 
ValueData::BinaryValue(data.to_vec()), ColumnSchema { column_name, datatype: ColumnDataType::Binary as i32, @@ -399,23 +497,18 @@ fn json_value_to_row( } fn identity_pipeline_inner<'a>( - array: Vec, + array: Vec>, tag_column_names: Option>, - params: &GreptimePipelineParams, + _params: &GreptimePipelineParams, ) -> Result { let mut rows = Vec::with_capacity(array.len()); let mut schema_info = SchemaInfo::default(); - for value in array { - if let serde_json::Value::Object(map) = value { - let object = if params.flatten_json_object() { - flatten_json_object(map, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)? - } else { - map - }; - let row = json_value_to_row(&mut schema_info, object)?; - rows.push(row); - } + + for values in array { + let row = values_to_row(&mut schema_info, values)?; + rows.push(row); } + let greptime_timestamp_schema = ColumnSchema { column_name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), datatype: ColumnDataType::TimestampNanosecond as i32, @@ -460,17 +553,26 @@ fn identity_pipeline_inner<'a>( /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. pub fn identity_pipeline( - array: Vec, + array: Vec>, table: Option>, params: &GreptimePipelineParams, ) -> Result { + let input = if params.flatten_json_object() { + array + .into_iter() + .map(|item| flatten_object(item, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)) + .collect::>>>()? + } else { + array + }; + match table { Some(table) => { let table_info = table.table_info(); let tag_column_names = table_info.meta.row_key_column_names(); - identity_pipeline_inner(array, Some(tag_column_names), params) + identity_pipeline_inner(input, Some(tag_column_names), params) } - None => identity_pipeline_inner(array, None::>, params), + None => identity_pipeline_inner(input, None::>, params), } } @@ -478,24 +580,24 @@ pub fn identity_pipeline( /// /// The `max_nested_levels` parameter is used to limit the nested levels of the JSON object. /// The error will be returned if the nested levels is greater than the `max_nested_levels`. -pub fn flatten_json_object( - object: Map, +pub fn flatten_object( + object: BTreeMap, max_nested_levels: usize, -) -> Result> { - let mut flattened = Map::new(); +) -> Result> { + let mut flattened = BTreeMap::new(); if !object.is_empty() { // it will use recursion to flatten the object. 
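        // e.g. {"http": {"status": 200, "path": "/a"}, "host": "h1"} is flattened into
        // {"http.path": "/a", "http.status": 200, "host": "h1"} (nested keys joined with a dot);
        // nesting deeper than `max_nested_levels` is reported as an error rather than truncated.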
- do_flatten_json_object(&mut flattened, None, object, 1, max_nested_levels)?; + do_flatten_object(&mut flattened, None, object, 1, max_nested_levels)?; } Ok(flattened) } -fn do_flatten_json_object( - dest: &mut Map, +fn do_flatten_object( + dest: &mut BTreeMap, base: Option<&str>, - object: Map, + object: BTreeMap, current_level: usize, max_nested_levels: usize, ) -> Result<()> { @@ -508,11 +610,11 @@ fn do_flatten_json_object( let new_key = base.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); match value { - JsonValue::Object(object) => { - do_flatten_json_object( + Value::Map(object) => { + do_flatten_object( dest, Some(&new_key), - object, + object.values, current_level + 1, max_nested_levels, )?; @@ -531,9 +633,8 @@ fn do_flatten_json_object( mod tests { use api::v1::SemanticType; - use crate::etl::transform::transformer::greptime::{ - flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, - }; + use super::*; + use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; use crate::identity_pipeline; #[test] @@ -559,6 +660,7 @@ mod tests { "gaga": "gaga" }), ]; + let array = json_array_to_intermediate_state(array).unwrap(); let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); assert!(rows.is_err()); assert_eq!( @@ -587,7 +689,11 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); + let rows = identity_pipeline( + json_array_to_intermediate_state(array).unwrap(), + None, + &GreptimePipelineParams::default(), + ); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -615,7 +721,11 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); + let rows = identity_pipeline( + json_array_to_intermediate_state(array).unwrap(), + None, + &GreptimePipelineParams::default(), + ); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); @@ -646,7 +756,7 @@ mod tests { ]; let tag_column_names = ["name".to_string(), "address".to_string()]; let rows = identity_pipeline_inner( - array, + json_array_to_intermediate_state(array).unwrap(), Some(tag_column_names.iter()), &GreptimePipelineParams::default(), ); @@ -745,14 +855,11 @@ mod tests { ]; for (input, max_depth, expected) in test_cases { - let flattened_object = - flatten_json_object(input.as_object().unwrap().clone(), max_depth); - match flattened_object { - Ok(flattened_object) => { - assert_eq!(&flattened_object, expected.unwrap().as_object().unwrap()) - } - Err(_) => assert_eq!(None, expected), - } + let input = json_to_intermediate_state(input).unwrap(); + let expected = expected.map(|e| json_to_intermediate_state(e).unwrap()); + + let flattened_object = flatten_object(input, max_depth).ok(); + assert_eq!(flattened_object, expected); } } diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs index 5f448b386cbd..da345b3bdeb3 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs @@ -71,12 +71,11 @@ impl TryFrom for ValueData { } } -// TODO(yuanbohan): add fulltext support in datatype_extension pub(crate) fn coerce_columns(transform: &Transform) -> Result> { let mut columns = Vec::new(); - for field in transform.real_fields.iter() { - let column_name = field.output_name().to_string(); + for field in transform.fields.iter() { + 
let column_name = field.target_or_input_field().to_string(); let (datatype, datatype_extension) = coerce_type(transform)?; @@ -477,12 +476,14 @@ fn coerce_json_value(v: &Value, transform: &Transform) -> Result Option { + match self { + Value::Uint32(v) => Some(*v as i64), + Value::Uint16(v) => Some(*v as i64), + Value::Uint8(v) => Some(*v as i64), + Value::Int64(v) => Some(*v), + Value::Int32(v) => Some(*v as i64), + Value::Int16(v) => Some(*v as i64), + Value::Int8(v) => Some(*v as i64), + _ => None, + } + } + + pub fn as_u64(&self) -> Option { + match self { + Value::Uint64(v) => Some(*v), + Value::Uint32(v) => Some(*v as u64), + Value::Uint16(v) => Some(*v as u64), + Value::Uint8(v) => Some(*v as u64), + _ => None, + } + } + pub fn as_f64(&self) -> Option { match self { Value::Float32(v) => Some(*v as f64), diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs index 004a617b0f9c..9e730ef532d8 100644 --- a/src/pipeline/src/etl/value/map.rs +++ b/src/pipeline/src/etl/value/map.rs @@ -49,6 +49,12 @@ impl From> for Map { } } +impl From> for Map { + fn from(values: BTreeMap) -> Self { + Self { values } + } +} + impl std::ops::Deref for Map { type Target = BTreeMap; diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index edb6ce1f5874..a6c82f9353cf 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -19,13 +19,15 @@ mod metrics; pub use etl::error::Result; pub use etl::processor::Processor; -pub use etl::transform::transformer::greptime::{ - GreptimePipelineParams, SchemaInfo, GREPTIME_PIPELINE_PARAMS_HEADER, -}; +pub use etl::transform::transformer::greptime::{GreptimePipelineParams, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; -pub use etl::{error as etl_error, parse, Content, Pipeline, PipelineWay, SelectInfo}; +pub use etl::{ + error as etl_error, json_array_to_intermediate_state, json_to_intermediate_state, parse, + Content, DispatchedTo, Pipeline, PipelineDefinition, PipelineExecOutput, PipelineWay, + SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, +}; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, PipelineVersion, diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index d825c91e4cb3..89bebbf85bb9 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -13,7 +13,7 @@ // limitations under the License. 
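The test changes below all follow the same new pattern: a `serde_json::Value` record is first converted into the pipeline's intermediate state (one `BTreeMap<String, Value>` per record), and `exec_mut` now returns a `PipelineExecOutput` that has to be unwrapped. A hedged sketch of that flow, using the helpers exported from the `pipeline` crate in this change (the `run_one` wrapper and its panicking error handling are illustrative only):

```rust
use greptime_proto::v1::Row;
use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline};

fn run_one(pipeline_yaml: &str, record: serde_json::Value) -> Row {
    let yaml = Content::Yaml(pipeline_yaml);
    let pipeline: Pipeline<GreptimeTransformer> = parse(&yaml).expect("failed to parse pipeline");

    // Replaces the old init_intermediate_state()/prepare()/reset_intermediate_state()
    // calls: the record itself becomes the mutable intermediate state.
    let mut state = json_to_intermediate_state(record).expect("failed to build intermediate state");

    // exec_mut now yields either a transformed row or a dispatcher decision;
    // transform-only pipelines always take the `into_transformed()` branch.
    pipeline
        .exec_mut(&mut state)
        .expect("failed to exec pipeline")
        .into_transformed()
        .expect("expected a transformed row")
}
```

Pipelines with a `dispatcher` section instead surface `into_dispatched()`, which the new `test_dispatch` case further down exercises.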
use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; /// test util function to parse and execute pipeline pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { @@ -22,7 +22,6 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); let schema = pipeline.schemas().clone(); @@ -31,19 +30,22 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { match input_value { serde_json::Value::Array(array) => { for value in array { - pipeline.prepare(value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(value).unwrap(); let row = pipeline - .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .exec_mut(&mut intermediate_status) + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); - pipeline.reset_intermediate_state(&mut result); } } serde_json::Value::Object(_) => { - pipeline.prepare(input_value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline - .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .exec_mut(&mut intermediate_status) + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); } _ => { diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 56386d0e860a..a93112d68945 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -16,6 +16,7 @@ mod common; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; +use pipeline::json_to_intermediate_state; fn make_string_column_schema(name: String) -> greptime_proto::v1::ColumnSchema { common::make_column_schema(name, ColumnDataType::String, SemanticType::Field) @@ -273,9 +274,8 @@ transform: let yaml_content = pipeline::Content::Yaml(pipeline_yaml); let pipeline: pipeline::Pipeline = pipeline::parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); + let mut result = json_to_intermediate_state(input_value).unwrap(); - pipeline.prepare(input_value, &mut result).unwrap(); let row = pipeline.exec_mut(&mut result); assert!(row.is_err()); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index cb84e9ad0c8e..c34187c80c91 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -20,7 +20,7 @@ use greptime_proto::v1::value::ValueData::{ U32Value, U64Value, U8Value, }; use greptime_proto::v1::Value as GreptimeValue; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; #[test] fn test_complex_data() { @@ -420,14 +420,13 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); - let mut stats = pipeline.init_intermediate_state(); - pipeline - .prepare(input_value, &mut stats) - .expect("failed to prepare pipeline"); + let mut stats = json_to_intermediate_state(input_value).unwrap(); let row = pipeline 
.exec_mut(&mut stats) - .expect("failed to exec pipeline"); + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); let output = Rows { schema: pipeline.schemas().clone(), @@ -490,9 +489,12 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -595,10 +597,12 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - - pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -638,10 +642,10 @@ processors: - dissect: fields: - line - patterns: + patterns: - "%{+ts} %{+ts} %{content}" - date: - fields: + fields: - ts formats: - "%Y-%m-%d %H:%M:%S%.3f" @@ -658,9 +662,12 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -694,9 +701,13 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); + + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -749,9 +760,12 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let mut r = row .values @@ -770,3 +784,79 @@ transform: assert_eq!(expected, r); } + +#[test] +fn test_dispatch() { + let input_value_str1 = r#" +{ + "line": "2024-05-25 20:16:37.217 [http] hello world" +} +"#; + let input_value1 = serde_json::from_str::(input_value_str1).unwrap(); + let input_value_str2 = r#" +{ + "line": "2024-05-25 20:16:37.217 [database] hello world" +} +"#; + let input_value2 = serde_json::from_str::(input_value_str2).unwrap(); + + let pipeline_yaml = r#" +processors: + - dissect: + fields: + - line + patterns: + - "%{+ts} %{+ts} [%{logger}] %{content}" + - date: + 
fields: + - ts + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + +dispatcher: + field: logger + rules: + - value: http + table_part: http + pipeline: access_log_pipeline + +transform: + - fields: + - content + type: string + - field: ts + type: time + index: timestamp +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).unwrap(); + + let mut status = json_to_intermediate_state(input_value1).unwrap(); + let dispatched_to = pipeline + .exec_mut(&mut status) + .unwrap() + .into_dispatched() + .expect("expect dispatched result "); + assert_eq!(dispatched_to.table_part, "http"); + assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); + + let mut status = json_to_intermediate_state(input_value2).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); + let r = row + .values + .into_iter() + .map(|v| v.value_data.unwrap()) + .collect::>(); + + let expected = vec![ + StringValue("hello world".into()), + TimestampNanosecondValue(1716668197217000000), + ]; + + assert_eq!(expected, r); +} diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index e9f1204e25b2..baa25ba7760a 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -24,6 +24,7 @@ use common_error::ext::ErrorExt; use common_telemetry::{debug, error}; use headers::ContentType; use once_cell::sync::Lazy; +use pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; use serde_json::{json, Deserializer, Value}; use session::context::{Channel, QueryContext}; use snafu::{ensure, ResultExt}; @@ -32,10 +33,7 @@ use crate::error::{ status_code_to_http_status, InvalidElasticsearchInputSnafu, ParseJsonSnafu, Result as ServersResult, }; -use crate::http::event::{ - ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, -}; +use crate::http::event::{ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState}; use crate::metrics::{ METRIC_ELASTICSEARCH_LOGS_DOCS_COUNT, METRIC_ELASTICSEARCH_LOGS_INGESTION_ELAPSED, }; diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 096c3fd75f8e..adfe3ab841e2 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -158,6 +158,14 @@ pub enum Error { location: Location, }, + #[snafu(display("Pipeline transform error"))] + PipelineTransform { + #[snafu(source)] + source: pipeline::etl_error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Not supported: {}", feat))] NotSupported { feat: String }, @@ -557,12 +565,6 @@ pub enum Error { location: Location, }, - #[snafu(display("OpenTelemetry log error"))] - OpenTelemetryLog { - source: pipeline::etl_error::Error, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Unsupported json data type for tag: {} {}", key, ty))] UnsupportedJsonDataTypeForTag { key: String, @@ -634,6 +636,7 @@ impl ErrorExt for Error { | CheckDatabaseValidity { source, .. } => source.status_code(), Pipeline { source, .. } => source.status_code(), + PipelineTransform { source, .. } => source.status_code(), NotSupported { .. } | InvalidParameter { .. } @@ -661,7 +664,6 @@ impl ErrorExt for Error { | InvalidLokiPayload { .. } | UnsupportedContentType { .. } | TimestampOverflow { .. } - | OpenTelemetryLog { .. } | UnsupportedJsonDataTypeForTag { .. } | InvalidTableName { .. } | PrepareStatementNotFound { .. 
} diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 00b177b2c096..d6d8e89a56ea 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -16,7 +16,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use api::v1::{RowInsertRequest, RowInsertRequests, Rows}; +use api::v1::RowInsertRequests; use async_trait::async_trait; use axum::extract::{FromRequest, Multipart, Path, Query, Request, State}; use axum::http::header::CONTENT_TYPE; @@ -32,18 +32,17 @@ use headers::ContentType; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{ - GreptimePipelineParams, GreptimeTransformer, PipelineVersion, GREPTIME_PIPELINE_PARAMS_HEADER, -}; +use pipeline::{GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; use crate::error::{ - status_code_to_http_status, CatalogSnafu, Error, InvalidParameterSnafu, ParseJsonSnafu, - PipelineSnafu, Result, UnsupportedContentTypeSnafu, + status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, + Result, UnsupportedContentTypeSnafu, }; +use crate::http::header::constants::GREPTIME_PIPELINE_PARAMS_HEADER; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; use crate::http::result::greptime_manage_resp::GreptimedbManageResponse; use crate::http::result::greptime_result_v1::GreptimedbV1Response; @@ -51,11 +50,11 @@ use crate::http::HttpResponse; use crate::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef}; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_INGESTION_COUNTER, METRIC_HTTP_LOGS_INGESTION_ELAPSED, - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, + METRIC_SUCCESS_VALUE, }; +use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; -pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; const GREPTIME_INTERNAL_PIPELINE_NAME_PREFIX: &str = "greptime_"; lazy_static! 
{ @@ -269,86 +268,107 @@ fn transform_ndjson_array_factory( } /// Dryrun pipeline with given data -fn dryrun_pipeline_inner( +async fn dryrun_pipeline_inner( value: Vec, - pipeline: &pipeline::Pipeline, + pipeline: Arc>, + pipeline_handler: PipelineHandlerRef, + query_ctx: &QueryContextRef, ) -> Result { - let mut intermediate_state = pipeline.init_intermediate_state(); + let params = GreptimePipelineParams::default(); - let mut results = Vec::with_capacity(value.len()); - for v in value { - pipeline - .prepare(v, &mut intermediate_state) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) + let results = run_pipeline( + &pipeline_handler, + PipelineDefinition::Resolved(pipeline), + ¶ms, + pipeline::json_array_to_intermediate_state(value) .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); - } + .context(PipelineSnafu)?, + "dry_run".to_owned(), + query_ctx, + true, + ) + .await?; let colume_type_key = "colume_type"; let data_type_key = "data_type"; let name_key = "name"; - let schema = pipeline - .schemas() - .iter() - .map(|cs| { - let mut map = Map::new(); - map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); - map.insert( - data_type_key.to_string(), - Value::String(cs.datatype().as_str_name().to_string()), - ); - map.insert( - colume_type_key.to_string(), - Value::String(cs.semantic_type().as_str_name().to_string()), - ); - map.insert( - "fulltext".to_string(), - Value::Bool( - cs.options - .clone() - .is_some_and(|x| x.options.contains_key("fulltext")), - ), - ); - Value::Object(map) - }) - .collect::>(); - let rows = results + let results = results .into_iter() - .map(|row| { - let row = row - .values - .into_iter() - .enumerate() - .map(|(idx, v)| { - v.value_data - .map(|d| { - let mut map = Map::new(); - map.insert("value".to_string(), column_data_to_json(d)); - map.insert("key".to_string(), schema[idx][name_key].clone()); - map.insert( - "semantic_type".to_string(), - schema[idx][colume_type_key].clone(), - ); - map.insert("data_type".to_string(), schema[idx][data_type_key].clone()); - Value::Object(map) - }) - .unwrap_or(Value::Null) - }) - .collect(); - Value::Array(row) + .filter_map(|row| { + if let Some(rows) = row.rows { + let table_name = row.table_name; + let schema = rows.schema; + + let schema = schema + .iter() + .map(|cs| { + let mut map = Map::new(); + map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); + map.insert( + data_type_key.to_string(), + Value::String(cs.datatype().as_str_name().to_string()), + ); + map.insert( + colume_type_key.to_string(), + Value::String(cs.semantic_type().as_str_name().to_string()), + ); + map.insert( + "fulltext".to_string(), + Value::Bool( + cs.options + .clone() + .is_some_and(|x| x.options.contains_key("fulltext")), + ), + ); + Value::Object(map) + }) + .collect::>(); + + let rows = rows + .rows + .into_iter() + .map(|row| { + row.values + .into_iter() + .enumerate() + .map(|(idx, v)| { + v.value_data + .map(|d| { + let mut map = Map::new(); + map.insert("value".to_string(), column_data_to_json(d)); + map.insert( + "key".to_string(), + schema[idx][name_key].clone(), + ); + map.insert( + "semantic_type".to_string(), + schema[idx][colume_type_key].clone(), + ); + map.insert( + "data_type".to_string(), + schema[idx][data_type_key].clone(), + ); + Value::Object(map) + }) + .unwrap_or(Value::Null) + }) + .collect() + }) + .collect(); + + let 
mut result = Map::new(); + result.insert("schema".to_string(), Value::Array(schema)); + result.insert("rows".to_string(), Value::Array(rows)); + result.insert("table_name".to_string(), Value::String(table_name)); + let result = Value::Object(result); + Some(result) + } else { + None + } }) - .collect::>(); - let mut result = Map::new(); - result.insert("schema".to_string(), Value::Array(schema)); - result.insert("rows".to_string(), Value::Array(rows)); - let result = Value::Object(result); - Ok(Json(result).into_response()) + .collect(); + Ok(Json(Value::Array(results)).into_response()) } /// Dryrun pipeline with given data @@ -414,6 +434,9 @@ pub async fn pipeline_dryrun( ) -> Result { let handler = log_state.log_handler; + query_ctx.set_channel(Channel::Http); + let query_ctx = Arc::new(query_ctx); + match check_pipeline_dryrun_params_valid(&payload) { Some(params) => { let data = params.data; @@ -426,20 +449,29 @@ pub async fn pipeline_dryrun( to_pipeline_version(params.pipeline_version).context(PipelineSnafu)?; let pipeline_name = check_pipeline_name_exists(params.pipeline_name)?; let pipeline = handler - .get_pipeline(&pipeline_name, version, Arc::new(query_ctx)) + .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(data, &pipeline) + dryrun_pipeline_inner(data, pipeline, handler, &query_ctx).await } Some(pipeline) => { let pipeline = handler.build_pipeline(&pipeline); match pipeline { - Ok(pipeline) => match dryrun_pipeline_inner(data, &pipeline) { - Ok(response) => Ok(response), - Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( - "Failed to exec pipeline", - e, - )), - }, + Ok(pipeline) => { + match dryrun_pipeline_inner( + data, + Arc::new(pipeline), + handler, + &query_ctx, + ) + .await + { + Ok(response) => Ok(response), + Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( + "Failed to exec pipeline", + e, + )), + } + } Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( "Failed to build pipeline", e, @@ -463,14 +495,11 @@ pub async fn pipeline_dryrun( check_data_valid(value.len())?; - query_ctx.set_channel(Channel::Http); - let query_ctx = Arc::new(query_ctx); - let pipeline = handler .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(value, &pipeline) + dryrun_pipeline_inner(value, pipeline, handler, &query_ctx).await } } } @@ -544,7 +573,7 @@ fn extract_pipeline_value_by_content_type( ct if ct == *TEXT_CONTENT_TYPE || ct == *TEXT_UTF8_CONTENT_TYPE => payload .lines() .filter(|line| !line.is_empty()) - .map(|line| Value::String(line.to_string())) + .map(|line| json!({"message": line})) .collect(), _ => UnsupportedContentTypeSnafu { content_type }.fail()?, }) @@ -570,59 +599,20 @@ pub(crate) async fn ingest_logs_inner( ); for request in log_ingest_requests { - let transformed_data: Rows = if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - let table = state - .get_table(&request.table, &query_ctx) - .await - .context(CatalogSnafu)?; - pipeline::identity_pipeline(request.values, table, &pipeline_params) + let requests = run_pipeline( + &state, + PipelineDefinition::from_name(&pipeline_name, version), + &pipeline_params, + pipeline::json_array_to_intermediate_state(request.values) .context(PipelineTransformSnafu) - .context(PipelineSnafu)? 
- } else { - let pipeline = state - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; - - let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); - let mut results = Vec::with_capacity(request.values.len()); - for v in request.values { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); - } - - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - - Rows { - rows: results, - schema: pipeline.schemas().clone(), - } - }; + .context(PipelineSnafu)?, + request.table, + &query_ctx, + true, + ) + .await?; - insert_requests.push(RowInsertRequest { - rows: Some(transformed_data), - table_name: request.table.clone(), - }); + insert_requests.extend(requests); } let output = state diff --git a/src/servers/src/http/extractor.rs b/src/servers/src/http/extractor.rs index f3ae606636c5..ae578f21d302 100644 --- a/src/servers/src/http/extractor.rs +++ b/src/servers/src/http/extractor.rs @@ -18,12 +18,12 @@ use axum::extract::FromRequestParts; use axum::http::request::Parts; use axum::http::StatusCode; use http::HeaderMap; -use pipeline::SelectInfo; +use pipeline::{GreptimePipelineParams, SelectInfo}; use crate::http::header::constants::{ GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME, GREPTIME_LOG_TABLE_NAME_HEADER_NAME, - GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, + GREPTIME_PIPELINE_PARAMS_HEADER, GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, }; /// Axum extractor for optional target log table name from HTTP header @@ -91,6 +91,7 @@ where pub struct PipelineInfo { pub pipeline_name: Option, pub pipeline_version: Option, + pub pipeline_params: GreptimePipelineParams, } impl FromRequestParts for PipelineInfo @@ -105,20 +106,14 @@ where string_value_from_header(headers, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME)?; let pipeline_version = string_value_from_header(headers, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME)?; - match (pipeline_name, pipeline_version) { - (Some(name), Some(version)) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: Some(version), - }), - (None, _) => Ok(PipelineInfo { - pipeline_name: None, - pipeline_version: None, - }), - (Some(name), None) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: None, - }), - } + let pipeline_parameters = + string_value_from_header(headers, GREPTIME_PIPELINE_PARAMS_HEADER)?; + + Ok(PipelineInfo { + pipeline_name, + pipeline_version, + pipeline_params: GreptimePipelineParams::from_params(pipeline_parameters.as_deref()), + }) } } diff --git a/src/servers/src/http/header.rs b/src/servers/src/http/header.rs index 51a07ca01f0c..e14ce6172958 100644 --- a/src/servers/src/http/header.rs +++ b/src/servers/src/http/header.rs @@ -50,6 +50,8 @@ pub mod constants { pub const 
GREPTIME_LOG_TABLE_NAME_HEADER_NAME: &str = "x-greptime-log-table-name"; pub const GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME: &str = "x-greptime-log-extract-keys"; pub const GREPTIME_TRACE_TABLE_NAME_HEADER_NAME: &str = "x-greptime-trace-table-name"; + /// The header key that contains the pipeline params. + pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; } pub static GREPTIME_DB_HEADER_FORMAT: HeaderName = diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index b5c4607c29e3..d8579fc960b3 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -30,7 +30,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{ ExportTraceServiceRequest, ExportTraceServiceResponse, }; use pipeline::util::to_pipeline_version; -use pipeline::PipelineWay; +use pipeline::{PipelineDefinition, PipelineWay}; use prost::Message; use session::context::{Channel, QueryContext}; use snafu::prelude::*; @@ -39,7 +39,7 @@ use super::header::{write_cost_header_map, CONTENT_TYPE_PROTOBUF}; use crate::error::{self, PipelineSnafu, Result}; use crate::http::extractor::{LogTableName, PipelineInfo, SelectInfoWrapper, TraceTableName}; use crate::otlp::trace::TRACE_TABLE_NAME; -use crate::query_handler::OpenTelemetryProtocolHandlerRef; +use crate::query_handler::{OpenTelemetryProtocolHandlerRef, PipelineHandler}; #[axum_macros::debug_handler] #[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "metrics"))] @@ -117,25 +117,29 @@ pub async fn logs( .start_timer(); let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; - let pipeline_way = if let Some(pipeline_name) = &pipeline_info.pipeline_name { - let pipeline_version = - to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?; - let pipeline = match handler - .get_pipeline(pipeline_name, pipeline_version, query_ctx.clone()) - .await - { - Ok(p) => p, - Err(e) => { - return Err(e); - } - }; - PipelineWay::Custom(pipeline) + let pipeline = if let Some(pipeline_name) = pipeline_info.pipeline_name { + PipelineWay::Pipeline(PipelineDefinition::from_name( + &pipeline_name, + to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?, + )) } else { - PipelineWay::OtlpLog(Box::new(select_info)) + PipelineWay::OtlpLogDirect(Box::new(select_info)) }; + let pipeline_params = pipeline_info.pipeline_params; + + // here we use nightly feature `trait_upcasting` to convert handler to + // pipeline_handler + let pipeline_handler: Arc = handler.clone(); handler - .logs(request, pipeline_way, tablename, query_ctx) + .logs( + pipeline_handler, + request, + pipeline, + pipeline_params, + tablename, + query_ctx, + ) .await .map(|o| OtlpResponse { resp_body: ExportLogsServiceResponse { diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index e95bdac7525d..61bf041f526f 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -17,6 +17,7 @@ #![feature(exclusive_wrapper)] #![feature(let_chains)] #![feature(if_let_guard)] +#![feature(trait_upcasting)] use datafusion_expr::LogicalPlan; use datatypes::schema::Schema; @@ -37,6 +38,7 @@ pub mod metrics_handler; pub mod mysql; pub mod opentsdb; pub mod otlp; +mod pipeline; pub mod postgres; mod prom_row_builder; pub mod prom_store; diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index f11cd4ff3c68..5936bd40ad60 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,14 +25,18 @@ use jsonb::{Number as JsonbNumber, 
Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::{Array, Map, PipelineWay, SchemaInfo, SelectInfo, Value as PipelineValue}; +use pipeline::{GreptimePipelineParams, PipelineWay, SchemaInfo, SelectInfo}; +use serde_json::{Map, Value}; +use session::context::QueryContextRef; use snafu::{ensure, ResultExt}; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; use crate::error::{ - IncompatibleSchemaSnafu, OpenTelemetryLogSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, + IncompatibleSchemaSnafu, PipelineTransformSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, }; +use crate::pipeline::run_pipeline; +use crate::query_handler::PipelineHandlerRef; pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; @@ -43,13 +47,16 @@ pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; /// for data structure of OTLP metrics. /// /// Returns `InsertRequests` and total number of rows to ingest -pub fn to_grpc_insert_requests( +pub async fn to_grpc_insert_requests( request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, + query_ctx: &QueryContextRef, + pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { match pipeline { - PipelineWay::OtlpLog(select_info) => { + PipelineWay::OtlpLogDirect(select_info) => { let rows = parse_export_logs_service_request_to_rows(request, select_info)?; let len = rows.rows.len(); let insert_request = RowInsertRequest { @@ -63,53 +70,48 @@ pub fn to_grpc_insert_requests( len, )) } - PipelineWay::Custom(p) => { - let request = parse_export_logs_service_request(request); - let mut result = Vec::new(); - let mut intermediate_state = p.init_intermediate_state(); - for v in request { - p.prepare_pipeline_value(v, &mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - let r = p - .exec_mut(&mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - result.push(r); - } - let len = result.len(); - let rows = Rows { - schema: p.schemas().clone(), - rows: result, - }; - let insert_request = RowInsertRequest { - rows: Some(rows), + PipelineWay::Pipeline(pipeline_def) => { + let data = parse_export_logs_service_request(request); + let array = + pipeline::json_array_to_intermediate_state(data).context(PipelineTransformSnafu)?; + + let inserts = run_pipeline( + &pipeline_handler, + pipeline_def, + &pipeline_params, + array, table_name, - }; - let insert_requests = RowInsertRequests { - inserts: vec![insert_request], - }; + query_ctx, + true, + ) + .await?; + let len = inserts + .iter() + .map(|insert| { + insert + .rows + .as_ref() + .map(|rows| rows.rows.len()) + .unwrap_or(0) + }) + .sum(); + + let insert_requests = RowInsertRequests { inserts }; Ok((insert_requests, len)) } } } -fn scope_to_pipeline_value( - scope: Option, -) -> (PipelineValue, PipelineValue, PipelineValue) { +fn scope_to_pipeline_value(scope: Option) -> (Value, Value, Value) { scope .map(|x| { ( - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }), - PipelineValue::String(x.version), - PipelineValue::String(x.name), + Value::Object(key_value_to_map(x.attributes)), + Value::String(x.version), + Value::String(x.name), ) }) - .unwrap_or(( - PipelineValue::Null, - PipelineValue::Null, - PipelineValue::Null, - 
)) + .unwrap_or((Value::Null, Value::Null, Value::Null)) } fn scope_to_jsonb( @@ -128,51 +130,43 @@ fn scope_to_jsonb( fn log_to_pipeline_value( log: LogRecord, - resource_schema_url: PipelineValue, - resource_attr: PipelineValue, - scope_schema_url: PipelineValue, - scope_name: PipelineValue, - scope_version: PipelineValue, - scope_attrs: PipelineValue, -) -> PipelineValue { - let log_attrs = PipelineValue::Map(Map { - values: key_value_to_map(log.attributes), - }); - let mut map = BTreeMap::new(); - map.insert( - "Timestamp".to_string(), - PipelineValue::Uint64(log.time_unix_nano), - ); + resource_schema_url: Value, + resource_attr: Value, + scope_schema_url: Value, + scope_name: Value, + scope_version: Value, + scope_attrs: Value, +) -> Value { + let log_attrs = Value::Object(key_value_to_map(log.attributes)); + let mut map = Map::new(); + map.insert("Timestamp".to_string(), Value::from(log.time_unix_nano)); map.insert( "ObservedTimestamp".to_string(), - PipelineValue::Uint64(log.observed_time_unix_nano), + Value::from(log.observed_time_unix_nano), ); // need to be convert to string map.insert( "TraceId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.trace_id)), + Value::String(bytes_to_hex_string(&log.trace_id)), ); map.insert( "SpanId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.span_id)), - ); - map.insert("TraceFlags".to_string(), PipelineValue::Uint32(log.flags)); - map.insert( - "SeverityText".to_string(), - PipelineValue::String(log.severity_text), + Value::String(bytes_to_hex_string(&log.span_id)), ); + map.insert("TraceFlags".to_string(), Value::from(log.flags)); + map.insert("SeverityText".to_string(), Value::String(log.severity_text)); map.insert( "SeverityNumber".to_string(), - PipelineValue::Int32(log.severity_number), + Value::from(log.severity_number), ); // need to be convert to string map.insert( "Body".to_string(), log.body .as_ref() - .map(|x| PipelineValue::String(log_body_to_string(x))) - .unwrap_or(PipelineValue::Null), + .map(|x| Value::String(log_body_to_string(x))) + .unwrap_or(Value::Null), ); map.insert("ResourceSchemaUrl".to_string(), resource_schema_url); @@ -182,7 +176,7 @@ fn log_to_pipeline_value( map.insert("ScopeVersion".to_string(), scope_version); map.insert("ScopeAttributes".to_string(), scope_attrs); map.insert("LogAttributes".to_string(), log_attrs); - PipelineValue::Map(Map { values: map }) + Value::Object(map) } fn build_otlp_logs_identity_schema() -> Vec { @@ -699,22 +693,18 @@ struct ParseInfo { /// transform otlp logs request to pipeline value /// https://opentelemetry.io/docs/concepts/signals/logs/ -fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { +fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { let mut result = Vec::new(); for r in request.resource_logs { let resource_attr = r .resource - .map(|x| { - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }) - }) - .unwrap_or(PipelineValue::Null); - let resource_schema_url = PipelineValue::String(r.schema_url); + .map(|x| Value::Object(key_value_to_map(x.attributes))) + .unwrap_or(Value::Null); + let resource_schema_url = Value::String(r.schema_url); for scope_logs in r.scope_logs { let (scope_attrs, scope_version, scope_name) = scope_to_pipeline_value(scope_logs.scope); - let scope_schema_url = PipelineValue::String(scope_logs.schema_url); + let scope_schema_url = Value::String(scope_logs.schema_url); for log in scope_logs.log_records { let value = log_to_pipeline_value( log, @@ 
-733,41 +723,41 @@ fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec

PipelineValue { +fn any_value_to_pipeline_value(value: any_value::Value) -> Value { match value { - any_value::Value::StringValue(s) => PipelineValue::String(s), - any_value::Value::IntValue(i) => PipelineValue::Int64(i), - any_value::Value::DoubleValue(d) => PipelineValue::Float64(d), - any_value::Value::BoolValue(b) => PipelineValue::Boolean(b), + any_value::Value::StringValue(s) => Value::String(s), + any_value::Value::IntValue(i) => Value::from(i), + any_value::Value::DoubleValue(d) => Value::from(d), + any_value::Value::BoolValue(b) => Value::Bool(b), any_value::Value::ArrayValue(a) => { let values = a .values .into_iter() .map(|v| match v.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }) .collect(); - PipelineValue::Array(Array { values }) + Value::Array(values) } any_value::Value::KvlistValue(kv) => { let value = key_value_to_map(kv.values); - PipelineValue::Map(Map { values: value }) + Value::Object(value) } - any_value::Value::BytesValue(b) => PipelineValue::String(bytes_to_hex_string(&b)), + any_value::Value::BytesValue(b) => Value::String(bytes_to_hex_string(&b)), } } // convert otlp keyValue vec to map -fn key_value_to_map(key_values: Vec) -> BTreeMap { - let mut map = BTreeMap::new(); +fn key_value_to_map(key_values: Vec) -> Map { + let mut map = Map::new(); for kv in key_values { let value = match kv.value { Some(value) => match value.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }, - None => PipelineValue::Null, + None => Value::Null, }; map.insert(kv.key.clone(), value); } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs new file mode 100644 index 000000000000..4d16cb6c351f --- /dev/null +++ b/src/servers/src/pipeline.rs @@ -0,0 +1,159 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
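The new `src/servers/src/pipeline.rs` module centralizes pipeline execution for all ingestion paths (HTTP events, OTLP logs, dryrun). For orientation, a hedged sketch of how a caller inside the `servers` crate is expected to invoke it; the pipeline name, table name, and simplified error handling are illustrative, while the `run_pipeline` signature is the one defined below:

```rust
use session::context::QueryContextRef;
use snafu::ResultExt;

use crate::error::{PipelineTransformSnafu, Result};
use crate::pipeline::run_pipeline;
use crate::query_handler::PipelineHandlerRef;

async fn ingest_with_named_pipeline(
    handler: PipelineHandlerRef,
    query_ctx: QueryContextRef,
    payload: Vec<serde_json::Value>,
) -> Result<Vec<api::v1::RowInsertRequest>> {
    // serde_json records -> per-record intermediate state (BTreeMap<String, pipeline::Value>).
    let array = pipeline::json_array_to_intermediate_state(payload)
        .context(PipelineTransformSnafu)?;

    run_pipeline(
        &handler,
        // Hypothetical stored pipeline name; None resolves to the latest version.
        pipeline::PipelineDefinition::from_name("access_log_pipeline", None),
        &pipeline::GreptimePipelineParams::default(),
        array,
        "access_log".to_string(), // hypothetical target table
        &query_ctx,
        true, // top-level call (controls whether the transform-latency metric is recorded)
    )
    .await
}
```

The returned `RowInsertRequest`s may target several tables when dispatcher rules fire, which is why the callers updated in this change extend their request list instead of pushing a single entry.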
+ +use std::collections::BTreeMap; +use std::sync::Arc; + +use api::v1::{RowInsertRequest, Rows}; +use pipeline::{ + DispatchedTo, GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineDefinition, + PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, +}; +use session::context::QueryContextRef; +use snafu::ResultExt; + +use crate::error::{CatalogSnafu, PipelineTransformSnafu, Result}; +use crate::metrics::{ + METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, +}; +use crate::query_handler::PipelineHandlerRef; + +/// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline +pub async fn get_pipeline( + pipeline_def: PipelineDefinition, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, +) -> Result>> { + match pipeline_def { + PipelineDefinition::Resolved(pipeline) => Ok(pipeline), + PipelineDefinition::ByNameAndValue((name, version)) => { + handler + .get_pipeline(&name, version, query_ctx.clone()) + .await + } + _ => { + unreachable!("Never call get_pipeline on identity.") + } + } +} + +pub(crate) async fn run_pipeline( + state: &PipelineHandlerRef, + pipeline_definition: PipelineDefinition, + pipeline_parameters: &GreptimePipelineParams, + array: Vec>, + table_name: String, + query_ctx: &QueryContextRef, + is_top_level: bool, +) -> Result> { + let db = query_ctx.get_db_string(); + + if matches!( + pipeline_definition, + PipelineDefinition::GreptimeIdentityPipeline + ) { + let table = state + .get_table(&table_name, query_ctx) + .await + .context(CatalogSnafu)?; + pipeline::identity_pipeline(array, table, pipeline_parameters) + .map(|rows| { + vec![RowInsertRequest { + rows: Some(rows), + table_name, + }] + }) + .context(PipelineTransformSnafu) + } else { + let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; + + let transform_timer = std::time::Instant::now(); + + let mut transformed = Vec::with_capacity(array.len()); + let mut dispatched: BTreeMap>> = + BTreeMap::new(); + + for mut values in array { + let r = pipeline + .exec_mut(&mut values) + .inspect_err(|_| { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + }) + .context(PipelineTransformSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(coll) = dispatched.get_mut(&dispatched_to) { + coll.push(values); + } else { + dispatched.insert(dispatched_to, vec![values]); + } + } + } + } + + let mut results = Vec::new(); + // if current pipeline generates some transformed results, build it as + // `RowInsertRequest` and append to results. If the pipeline doesn't + // have dispatch, this will be only output of the pipeline. + if !transformed.is_empty() { + results.push(RowInsertRequest { + rows: Some(Rows { + rows: transformed, + schema: pipeline.schemas().clone(), + }), + table_name: table_name.clone(), + }) + } + + // if current pipeline contains dispatcher and has several rules, we may + // already accumulated several dispatched rules and rows. + for (dispatched_to, coll) in dispatched { + // we generate the new table name according to `table_part` and + // current custom table name. + let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); + let next_pipeline_name = dispatched_to + .pipeline + .as_deref() + .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + + // run pipeline recursively. 
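        // `Box::pin` is required here: `run_pipeline` is an async fn awaiting itself,
        // and the compiler rejects unboxed async recursion (E0733) because the future
        // type would otherwise be infinitely sized. Dispatched records whose rule names
        // no pipeline fall back to the built-in identity pipeline (see `unwrap_or` above),
        // which is handled by the identity branch and does not dispatch again.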
+ let requests = Box::pin(run_pipeline( + state, + PipelineDefinition::from_name(next_pipeline_name, None), + pipeline_parameters, + coll, + table_name, + query_ctx, + false, + )) + .await?; + + results.extend(requests); + } + + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + + Ok(results) + } +} diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index d450815a4a0c..dd41305626b9 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -38,7 +38,10 @@ use log_query::LogQuery; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, PipelineWay}; +use pipeline::{ + GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, + PipelineWay, +}; use serde_json::Value; use session::context::{QueryContext, QueryContextRef}; @@ -110,8 +113,10 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> Result; diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index e4a75aa3418b..e5ed2d84fd7f 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -93,6 +93,7 @@ macro_rules! http_tests { test_plain_text_ingestion, test_identify_pipeline, test_identify_pipeline_with_flatten, + test_pipeline_dispatcher, test_otlp_metrics, test_otlp_traces, @@ -1382,6 +1383,197 @@ pub async fn test_identify_pipeline(store_type: StorageType) { guard.remove_all().await; } +pub async fn test_pipeline_dispatcher(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_pipeline_dispatcher").await; + + // handshake + let client = TestClient::new(app).await; + + let root_pipeline = r#" +processors: + - date: + field: time + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + ignore_missing: true + +dispatcher: + field: type + rules: + - value: http + table_part: http + pipeline: http + - value: db + table_part: db + - value: not_found + table_part: not_found + pipeline: not_found + +transform: + - fields: + - id1, id1_root + - id2, id2_root + type: int32 + - fields: + - type + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + let http_pipeline = r#" +processors: + +transform: + - fields: + - id1, id1_http + - id2, id2_http + type: int32 + - fields: + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + // 1. create pipeline + let res = client + .post("/v1/events/pipelines/root") + .header("Content-Type", "application/x-yaml") + .body(root_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + let res = client + .post("/v1/events/pipelines/http") + .header("Content-Type", "application/x-yaml") + .body(http_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + // 2. 
+    // 2. write data
+    let data_body = r#"
+[
+    {
+        "id1": "2436",
+        "id2": "2528",
+        "logger": "INTERACT.MANAGER",
+        "type": "http",
+        "time": "2024-05-25 20:16:37.217",
+        "log": "ClusterAdapter:enter sendTextDataToCluster\\n"
+    }
+]
+"#;
+    let res = client
+        .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root")
+        .header("Content-Type", "application/json")
+        .body(data_body)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    let data_body = r#"
+[
+    {
+        "id1": "2436",
+        "id2": "2528",
+        "logger": "INTERACT.MANAGER",
+        "type": "db",
+        "time": "2024-05-25 20:16:37.217",
+        "log": "ClusterAdapter:enter sendTextDataToCluster\\n"
+    }
+]
+"#;
+    let res = client
+        .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root")
+        .header("Content-Type", "application/json")
+        .body(data_body)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    let data_body = r#"
+[
+    {
+        "id1": "2436",
+        "id2": "2528",
+        "logger": "INTERACT.MANAGER",
+        "type": "api",
+        "time": "2024-05-25 20:16:37.217",
+        "log": "ClusterAdapter:enter sendTextDataToCluster\\n"
+    }
+]
+"#;
+    let res = client
+        .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root")
+        .header("Content-Type", "application/json")
+        .body(data_body)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::OK);
+
+    let data_body = r#"
+[
+    {
+        "id1": "2436",
+        "id2": "2528",
+        "logger": "INTERACT.MANAGER",
+        "type": "not_found",
+        "time": "2024-05-25 20:16:37.217",
+        "log": "ClusterAdapter:enter sendTextDataToCluster\\n"
+    }
+]
+"#;
+    let res = client
+        .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root")
+        .header("Content-Type", "application/json")
+        .body(data_body)
+        .send()
+        .await;
+    assert_eq!(res.status(), StatusCode::BAD_REQUEST);
+
+    // 3. verify data
+    let expected = "[[2436]]";
+    validate_data(
+        "test_dispatcher_pipeline default table",
+        &client,
+        "select id1_root from logs1",
+        expected,
+    )
+    .await;
+
+    let expected = "[[2436]]";
+    validate_data(
+        "test_dispatcher_pipeline http table",
+        &client,
+        "select id1_http from logs1_http",
+        expected,
+    )
+    .await;
+
+    let expected = "[[\"2436\"]]";
+    validate_data(
+        "test_dispatcher_pipeline db table",
+        &client,
+        "select id1 from logs1_db",
+        expected,
+    )
+    .await;
+
+    guard.remove_all().await;
+}
+
 pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) {
     common_telemetry::init_default_ut_logging();
     let (app, mut guard) =
@@ -1586,8 +1778,8 @@ transform:
             .await;
         assert_eq!(res.status(), StatusCode::OK);
         let body: Value = res.json().await;
-        let schema = &body["schema"];
-        let rows = &body["rows"];
+        let schema = &body[0]["schema"];
+        let rows = &body[0]["rows"];
         assert_eq!(schema, &dryrun_schema);
         assert_eq!(rows, &dryrun_rows);
     }
@@ -1616,8 +1808,8 @@ transform:
             .await;
         assert_eq!(res.status(), StatusCode::OK);
         let body: Value = res.json().await;
-        let schema = &body["schema"];
-        let rows = &body["rows"];
+        let schema = &body[0]["schema"];
+        let rows = &body[0]["rows"];
         assert_eq!(schema, &dryrun_schema);
         assert_eq!(rows, &dryrun_rows);
     }
@@ -1644,8 +1836,8 @@ transform:
             .await;
         assert_eq!(res.status(), StatusCode::OK);
        let body: Value = res.json().await;
-        let schema = &body["schema"];
-        let rows = &body["rows"];
+        let schema = &body[0]["schema"];
+        let rows = &body[0]["rows"];
         assert_eq!(schema, &dryrun_schema);
         assert_eq!(rows, &dryrun_rows);
     }
@@ -1686,7 +1878,7 @@ pub async fn test_plain_text_ingestion(store_type: StorageType) {
 processors:
   - dissect:
       fields:
-        - line
+        - message
       patterns:
         - "%{+ts} %{+ts} %{content}"
   - date:
@@ -2271,7 +2463,7 @@ async fn validate_data(test_name: &str, client: &TestClient, sql: &str, expected
         .get(format!("/v1/sql?sql={sql}").as_str())
         .send()
         .await;
-    assert_eq!(res.status(), StatusCode::OK);
+    assert_eq!(res.status(), StatusCode::OK, "validate {test_name} failed");
     let resp = res.text().await;
     let v = get_rows_from_output(&resp);