From 0e4574452ce8b4c0a90b708ee49480956096367e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 14 Jan 2025 22:34:26 +0800 Subject: [PATCH 01/32] fmt: correct format --- src/pipeline/src/dispatcher.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index f16fd7e57fb2..f54531e802e3 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -15,7 +15,6 @@ use snafu::OptionExt; use yaml_rust::Yaml; -use crate::etl::error::{Error, Result}; use crate::etl_error::{ FieldRequiredForDispatcherSnafu, TablePartRequiredForDispatcherRuleSnafu, ValueRequiredForDispatcherRuleSnafu, From aa63b875873a9b0233db995ff70232abe5260b95 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 15 Jan 2025 10:40:11 +0800 Subject: [PATCH 02/32] test: add negative tests --- src/pipeline/src/dispatcher.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index f54531e802e3..7c4207f6b5c1 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -83,6 +83,7 @@ impl TryFrom<&Yaml> for Dispatcher { .as_str() .map(|s| s.to_string()) .context(TablePartRequiredForDispatcherRuleSnafu)?; + let pipeline = rule[PIPELINE].as_str().map(|s| s.to_string()); if rule[VALUE].is_badvalue() { From 6c226b52b7e3e9536d4bc12b7af3cbb6f4daa931 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 16 Jan 2025 19:17:57 +0800 Subject: [PATCH 03/32] feat: Add pipeline dispatching and execution output handling --- src/pipeline/src/dispatcher.rs | 24 ++++ src/pipeline/src/etl.rs | 74 +++++++++++-- src/pipeline/src/lib.rs | 5 +- src/servers/src/http/event.rs | 194 ++++++++++++++++++++++++--------- 4 files changed, 236 insertions(+), 61 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index 7c4207f6b5c1..a2a1e9fa1425 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
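For reference while reading the `Dispatcher::exec` implementation added below: a dispatcher is declared as a top-level `dispatcher` section of the pipeline YAML, naming the intermediate field to inspect plus a list of rules, each carrying a `value` to match against, a `table_part` used to derive the target table name, and an optional follow-up `pipeline`. A minimal parsing sketch, modelled on the `test_dispatch` case added later in this series (the `logger`, `http` and `access_log_pipeline` names are illustrative only):

use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

fn build_dispatching_pipeline() -> Pipeline<GreptimeTransformer> {
    // Rows whose `logger` field equals "http" are dispatched to a table
    // suffixed with "_http" and handed to `access_log_pipeline`; all other
    // rows fall through to this pipeline's own `transform` section.
    let pipeline_yaml = r#"
processors:
  - dissect:
      fields:
        - line
      patterns:
        - "%{+ts} %{+ts} [%{logger}] %{content}"
  - date:
      fields:
        - ts
      formats:
        - "%Y-%m-%d %H:%M:%S%.3f"

dispatcher:
  field: logger
  rules:
    - value: http
      table_part: http
      pipeline: access_log_pipeline

transform:
  - fields:
      - content
    type: string
  - field: ts
    type: time
    index: timestamp
"#;
    let yaml_content = Content::Yaml(pipeline_yaml);
    parse(&yaml_content).expect("dispatcher pipeline yaml should parse")
}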
+use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; @@ -105,3 +106,26 @@ impl TryFrom<&Yaml> for Dispatcher { Ok(Dispatcher { field, rules }) } } + +impl Dispatcher { + /// execute dispatcher and returns matched rule if any + pub(crate) fn exec(&self, keys: &Vec, val: &Vec) -> Option<&Rule> { + if let Some(index) = keys.iter().position(|key| key == &self.field) { + if let Some(value) = val.get(index) { + for rule in &self.rules { + if rule.value == *value { + return Some(rule); + } + } + + None + } else { + debug!("value at index {} is not found in {:?}", &index, val); + None + } + } else { + debug!("field {} not found in keys {:?}", &self.field, keys); + None + } + } +} diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index d55cf25d543d..a12b9d7b0478 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -30,7 +30,7 @@ use transform::{TransformBuilders, Transformer, Transforms}; use value::Value; use yaml_rust::YamlLoader; -use crate::dispatcher::Dispatcher; +use crate::dispatcher::{Dispatcher, Rule}; use crate::etl::error::Result; const DESCRIPTION: &str = "description"; @@ -192,16 +192,60 @@ where // pub on_failure: processor::Processors, } +/// Where the pipeline executed is dispatched to, with context information +#[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd, Ord)] +pub struct DispatchedTo { + pub table_part: String, + pub pipeline: Option, +} + +impl From<&Rule> for DispatchedTo { + fn from(value: &Rule) -> Self { + DispatchedTo { + table_part: value.table_part.clone(), + pipeline: value.pipeline.clone(), + } + } +} + +/// The result of pipeline execution +#[derive(Debug)] +pub enum PipelineExecOutput { + Transformed(O), + DispatchedTo(DispatchedTo), +} + +impl PipelineExecOutput { + pub(crate) fn into_transformed(self) -> Option { + if let Self::Transformed(o) = self { + Some(o) + } else { + None + } + } +} + impl Pipeline where T: Transformer, { - pub fn exec_mut(&self, val: &mut Vec) -> Result { + pub fn exec_mut(&self, val: &mut Vec) -> Result> { for processor in self.processors.iter() { processor.exec_mut(val)?; } - self.transformer.transform_mut(val) + let matched_rule = self + .dispatcher + .as_ref() + .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); + + match matched_rule { + None => self + .transformer + .transform_mut(val) + .map(PipelineExecOutput::Transformed), + Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + } } pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { @@ -379,7 +423,11 @@ transform: payload, vec![Value::String("1,2".to_string()), Value::Null, Value::Null] ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); @@ -428,7 +476,11 @@ transform: pipeline .prepare(serde_json::Value::String(message), &mut payload) .unwrap(); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); let sechema = pipeline.schemas(); assert_eq!(sechema.len(), result.values.len()); @@ -507,7 +559,11 @@ transform: payload, vec![Value::String("1,2".to_string()), Value::Null, Value::Null] ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + 
.into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); match &result.values[2].value_data { @@ -547,7 +603,11 @@ transform: let schema = pipeline.schemas().clone(); let mut result = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut result).unwrap(); - let row = pipeline.exec_mut(&mut result).unwrap(); + let row = pipeline + .exec_mut(&mut result) + .unwrap() + .into_transformed() + .unwrap(); let output = Rows { schema, rows: vec![row], diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 49ecea41c449..1cf5589b47f3 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -23,7 +23,10 @@ pub use etl::transform::transformer::greptime::SchemaInfo; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; -pub use etl::{error as etl_error, parse, Content, Pipeline, PipelineWay, SelectInfo}; +pub use etl::{ + error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineExecOutput, PipelineWay, + SelectInfo, +}; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, PipelineVersion, diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 17fc56f56135..685d67fc8f1a 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; use std::result::Result as StdResult; use std::str::FromStr; use std::sync::Arc; @@ -32,7 +33,7 @@ use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{GreptimeTransformer, PipelineVersion}; +use pipeline::{DispatchedTo, GreptimeTransformer, PipelineExecOutput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -551,72 +552,159 @@ fn extract_pipeline_value_by_content_type( }) } -pub(crate) async fn ingest_logs_inner( - state: PipelineHandlerRef, - pipeline_name: String, +enum PipelineInputValue { + // multiple row values as a value object + Original(Vec), + // 2-dimension row values by column + Intermediate(Vec>), +} + +async fn run_pipeline( + state: &PipelineHandlerRef, + pipeline_name: &str, version: PipelineVersion, - log_ingest_requests: Vec, - query_ctx: QueryContextRef, -) -> Result { - let db = query_ctx.get_db_string(); - let exec_timer = std::time::Instant::now(); + value: PipelineInputValue, + table_name: String, + query_ctx: &QueryContextRef, + db: &str, + is_top_level: bool, +) -> Result> { + if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + let table = state + .get_table(&table, &query_ctx) + .await + .context(CatalogSnafu)?; + pipeline::identity_pipeline(request.values, table) + .map(|rows| { + vec![RowInsertRequest { + rows: Some(rows), + table_name: table_name, + }] + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu) + } else { + let pipeline = state + .get_pipeline(&pipeline_name, version, query_ctx.clone()) + .await?; - let mut insert_requests = Vec::with_capacity(log_ingest_requests.len()); + let transform_timer = std::time::Instant::now(); + let mut intermediate_state = 
pipeline.init_intermediate_state(); - for request in log_ingest_requests { - let transformed_data: Rows = if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - let table = state - .get_table(&request.table, &query_ctx) - .await - .context(CatalogSnafu)?; - pipeline::identity_pipeline(request.values, table) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)? - } else { - let pipeline = state - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; + let mut transformed = Vec::with_capacity(request.values.len()); + let mut dispatched: BTreeMap> = BTreeMap::new(); - let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); - let mut results = Vec::with_capacity(request.values.len()); - for v in request.values { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { + for v in request.values { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .with_label_values(&[db, METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .inspect_err(|_| { + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + let r = pipeline + .exec_mut(&mut intermediate_state) + .inspect_err(|_| { + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .with_label_values(&[db, METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + // FIXME: can only push intermediate state + values.push(v.clone()); + } else { + dispatched.insert(dispatched_to, vec![v]); + } + } } + pipeline.reset_intermediate_state(&mut intermediate_state); + } + + let mut results = Vec::new(); + if !transformed.is_empty() { + results.push(RowInsertRequest { + rows: Some(Rows { + rows: transformed, + schema: pipeline.schemas().clone(), + }), + table_name, + }) + } + + for (dispatched_to, values) in dispatched { + let request = LogIngestRequest { + values, + table: format!("{}_{}", table_name, dispatched_to.table_part), + }; + let next_pipeline_name = dispatched_to + .pipeline + .as_deref() + .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + + let requests = Box::pin(run_pipeline( + state, + next_pipeline_name, + None, + request, + query_ctx, + db, + false, + )) + .await?; + + results.extend(requests); + } + + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) + .with_label_values(&[db, METRIC_SUCCESS_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); + } - Rows { - rows: results, - schema: pipeline.schemas().clone(), - } - }; + Ok(results) + } +} + +pub(crate) async fn ingest_logs_inner( + state: PipelineHandlerRef, + pipeline_name: String, + version: PipelineVersion, + log_ingest_requests: Vec, + query_ctx: QueryContextRef, +) -> Result { + let db = query_ctx.get_db_string(); + let 
exec_timer = std::time::Instant::now(); + + let mut insert_requests = Vec::with_capacity(log_ingest_requests.len()); + + for request in log_ingest_requests { + let requests = run_pipeline( + &state, + &pipeline_name, + version, + request, + &query_ctx, + db.as_str(), + true, + ) + .await?; - insert_requests.push(RowInsertRequest { - rows: Some(transformed_data), - table_name: request.table.clone(), - }); + insert_requests.extend(requests); } let output = state From 63f79097c963197e63b34eac670c19abd56e5b38 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 17 Jan 2025 18:53:59 +0800 Subject: [PATCH 04/32] refactor: Enhance ingest function to correctly process original data values custom table names during pipeline execution while optimizing the management of transformed rows and multiple dispatched pipelines --- .../src/etl/transform/transformer/greptime.rs | 46 ++++- src/pipeline/src/lib.rs | 2 +- src/servers/src/http/event.rs | 160 ++++++++++++------ 3 files changed, 145 insertions(+), 63 deletions(-) diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 0ae15d0d50b7..087f5bc97516 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -363,17 +363,28 @@ fn json_value_to_row( } fn identity_pipeline_inner<'a>( - array: Vec, + array: PipelineExecInput, tag_column_names: Option>, ) -> Result { let mut rows = Vec::with_capacity(array.len()); let mut schema_info = SchemaInfo::default(); - for value in array { - if let serde_json::Value::Object(map) = value { - let row = json_value_to_row(&mut schema_info, map)?; - rows.push(row); + + match array { + PipelineExecInput::Original(array) => { + for value in array { + if let serde_json::Value::Object(map) = value { + let row = json_value_to_row(&mut schema_info, map)?; + rows.push(row); + } + } + } + PipelineExecInput::Intermediate { keys, array } => { + for values in array { + todo!() + } } } + let greptime_timestamp_schema = ColumnSchema { column_name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), datatype: ColumnDataType::TimestampNanosecond as i32, @@ -409,6 +420,29 @@ fn identity_pipeline_inner<'a>( }) } +/// The input data format for pipeline +/// +/// It can either be raw input as in `serde_json::Value` or intermediate `Vec` +pub enum PipelineExecInput { + // multiple row values as a value object + Original(Vec), + // 2-dimension row values by column + Intermediate { + array: Vec>, + keys: Vec, + }, +} + +impl PipelineExecInput { + /// return the length of internal array + pub fn len(&self) -> usize { + match self { + PipelineExecInput::Original(array) => array.len(), + PipelineExecInput::Intermediate { array, .. } => array.len(), + } + } +} + /// Identity pipeline for Greptime /// This pipeline will convert the input JSON array to Greptime Rows /// params table is used to set the semantic type of the row key column to Tag @@ -418,7 +452,7 @@ fn identity_pipeline_inner<'a>( /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. 
pub fn identity_pipeline( - array: Vec, + array: PipelineExecInput, table: Option>, ) -> Result { match table { diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 1cf5589b47f3..8ebf9ab0b9f1 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -19,7 +19,7 @@ mod metrics; pub use etl::error::Result; pub use etl::processor::Processor; -pub use etl::transform::transformer::greptime::SchemaInfo; +pub use etl::transform::transformer::greptime::{PipelineExecInput, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 685d67fc8f1a..446306ef26a0 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -18,7 +18,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use api::v1::{RowInsertRequest, RowInsertRequests, Rows}; +use api::v1::{Row, RowInsertRequest, RowInsertRequests, Rows}; use axum::body::HttpBody; use axum::extract::{FromRequest, Multipart, Path, Query, State}; use axum::headers::ContentType; @@ -33,7 +33,10 @@ use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{DispatchedTo, GreptimeTransformer, PipelineExecOutput, PipelineVersion}; +use pipeline::{ + DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, + PipelineVersion, +}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -552,18 +555,49 @@ fn extract_pipeline_value_by_content_type( }) } -enum PipelineInputValue { - // multiple row values as a value object - Original(Vec), - // 2-dimension row values by column - Intermediate(Vec>), +#[inline] +fn pipline_exec_with_intermediate_state( + pipeline: &Arc>, + intermediate_state: &mut Vec, + transformed: &mut Vec, + dispatched: &mut BTreeMap>>, + db: &str, + transform_timer: &Instant, + is_top_level: bool, +) -> Result<()> { + let r = pipeline + .exec_mut(intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + values.push(intermediate_state.clone()); + } else { + dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); + } + } + } + + Ok(()) } async fn run_pipeline( state: &PipelineHandlerRef, pipeline_name: &str, version: PipelineVersion, - value: PipelineInputValue, + values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, db: &str, @@ -571,10 +605,10 @@ async fn run_pipeline( ) -> Result> { if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { let table = state - .get_table(&table, &query_ctx) + .get_table(&table_name, &query_ctx) .await .context(CatalogSnafu)?; - pipeline::identity_pipeline(request.values, table) + pipeline::identity_pipeline(values, table) .map(|rows| { vec![RowInsertRequest { rows: Some(rows), @@ -591,76 +625,89 @@ async fn run_pipeline( let transform_timer = std::time::Instant::now(); let 
mut intermediate_state = pipeline.init_intermediate_state(); - let mut transformed = Vec::with_capacity(request.values.len()); - let mut dispatched: BTreeMap> = BTreeMap::new(); - - for v in request.values { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); + let mut transformed = Vec::with_capacity(values.len()); + let mut dispatched: BTreeMap>> = BTreeMap::new(); + + match values { + PipelineExecInput::Original(array) => { + for v in array { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + pipline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + + pipeline.reset_intermediate_state(&mut intermediate_state); } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - // FIXME: can only push intermediate state - values.push(v.clone()); - } else { - dispatched.insert(dispatched_to, vec![v]); - } + } + PipelineExecInput::Intermediate { array, .. } => { + for mut intermediate_state in array { + pipline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; } } - - pipeline.reset_intermediate_state(&mut intermediate_state); } let mut results = Vec::new(); + // if current pipeline generates some transformed results, build it as + // `RowInsertRequest` and append to results. If the pipeline doesn't + // have dispatch, this will be only output of the pipeline. if !transformed.is_empty() { results.push(RowInsertRequest { rows: Some(Rows { rows: transformed, schema: pipeline.schemas().clone(), }), - table_name, + table_name: table_name.clone(), }) } + // if current pipeline contains dispatcher and has several rules, we may + // already accumulated several dispatched rules and rows. for (dispatched_to, values) in dispatched { - let request = LogIngestRequest { - values, - table: format!("{}_{}", table_name, dispatched_to.table_part), - }; + // we generate the new table name according to `table_part` and + // current custom table name. + let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); let next_pipeline_name = dispatched_to .pipeline .as_deref() .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + // run pipeline recursively. Note that the values we are going to + // process is now intermediate version. It's in form of + // `Vec>`. 
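            // A concrete illustration of the recursion below (table and rule
            // names are hypothetical): a request targeting table "app_logs"
            // whose row matches the rule { value: "http", table_part: "http",
            // pipeline: None } is re-run through the built-in
            // "greptime_identity" pipeline (the default when the rule names no
            // pipeline) and written to the derived table "app_logs_http"; had
            // the rule named "access_log_pipeline", that pipeline would be
            // fetched by name (with version `None`) and applied to the
            // already-prepared intermediate rows.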
let requests = Box::pin(run_pipeline( state, next_pipeline_name, None, - request, + PipelineExecInput::Intermediate { + array: values, + keys: pipeline.intermediate_keys().clone(), + }, + table_name, query_ctx, db, false, @@ -697,7 +744,8 @@ pub(crate) async fn ingest_logs_inner( &state, &pipeline_name, version, - request, + PipelineExecInput::Original(request.values), + request.table, &query_ctx, db.as_str(), true, From 9fd85359e65a91fcc76a9ed548a511b938687cee Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 15:35:57 +0800 Subject: [PATCH 05/32] refactor: call greptime_identity with intermediate values --- src/pipeline/src/dispatcher.rs | 4 +- src/pipeline/src/etl/error.rs | 11 +- .../src/etl/transform/transformer/greptime.rs | 211 +++++++++++++++++- src/pipeline/src/etl/value.rs | 23 ++ src/servers/src/http/event.rs | 4 +- 5 files changed, 245 insertions(+), 8 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index a2a1e9fa1425..45bd6b47cbfb 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -16,8 +16,8 @@ use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; -use crate::etl_error::{ - FieldRequiredForDispatcherSnafu, TablePartRequiredForDispatcherRuleSnafu, +use crate::etl::error::{ + Error, FieldRequiredForDispatcherSnafu, Result, TablePartRequiredForDispatcherRuleSnafu, ValueRequiredForDispatcherRuleSnafu, }; use crate::Value; diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index e19aaad8396e..526ed61ce4a9 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -590,10 +590,17 @@ pub enum Error { }, #[snafu(display("Field is required for dispatcher"))] FieldRequiredForDispatcher, - #[snafu(display("table_part is required for dispatcher rule"))] + #[snafu(display("Table_part is required for dispatcher rule"))] TablePartRequiredForDispatcherRule, - #[snafu(display("value is required for dispatcher rule"))] + #[snafu(display("Value is required for dispatcher rule"))] ValueRequiredForDispatcherRule, + #[snafu(display("Keys and values length mismatch, values: {values}, keys: {keys}"))] + KeyValueLengthMismatch { + #[snafu(implicit)] + location: Location, + keys: usize, + values: usize, + }, } pub type Result = std::result::Result; diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 087f5bc97516..94e922ab7845 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -26,10 +26,12 @@ use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number}; +use snafu::ensure; use crate::etl::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, Result, TransformColumnNameMustBeUniqueSnafu, - TransformEmptySnafu, TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, + IdentifyPipelineColumnTypeMismatchSnafu, KeyValueLengthMismatchSnafu, Result, + TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, + TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; @@ -290,6 +292,201 @@ fn resolve_number_schema( ) } +fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[String]) -> Result { + ensure!( + values.len() == keys.len(), + 
KeyValueLengthMismatchSnafu { + keys: keys.len(), + values: values.len(), + } + ); + + let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); + for _ in 0..schema_info.schema.len() { + row.push(GreptimeValue { value_data: None }); + } + + for (idx, value) in values.into_iter().enumerate() { + // ensured by previous check + let column_name = keys[idx].clone(); + if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { + continue; + } + + let index = schema_info.index.get(&column_name).copied(); + + match value { + Value::Null => {} + + Value::Int8(_) | Value::Int16(_) | Value::Int32(_) | Value::Int64(_) => { + // safe unwrap after type matched + let v = value.as_i64().unwrap(); + resolve_schema( + index, + ValueData::I64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Int64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Uint8(_) | Value::Uint16(_) | Value::Uint32(_) | Value::Uint64(_) => { + // safe unwrap after type matched + let v = value.as_u64().unwrap(); + resolve_schema( + index, + ValueData::U64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Uint64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Float32(_) | Value::Float64(_) => { + // safe unwrap after type matched + let v = value.as_f64().unwrap(); + resolve_schema( + index, + ValueData::F64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Boolean(v) => { + resolve_schema( + index, + ValueData::BoolValue(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Boolean as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::String(v) => { + resolve_schema( + index, + ValueData::StringValue(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Timestamp(Timestamp::Nanosecond(ns)) => { + resolve_schema( + index, + ValueData::TimestampNanosecondValue(ns), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampNanosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Microsecond(us)) => { + resolve_schema( + index, + ValueData::TimestampMicrosecondValue(us), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMicrosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Millisecond(ms)) => { + resolve_schema( + index, + ValueData::TimestampMillisecondValue(ms), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Second(s)) => { + resolve_schema( + index, + ValueData::TimestampSecondValue(s), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampSecond as i32, + 
semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Array(_) | Value::Map(_) => { + let data: jsonb::Value = value.into(); + resolve_schema( + index, + ValueData::BinaryValue(data.to_vec()), + ColumnSchema { + column_name, + datatype: ColumnDataType::Binary as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), + }), + options: None, + }, + &mut row, + schema_info, + )?; + } + } + } + Ok(Row { values: row }) +} + fn json_value_to_row( schema_info: &mut SchemaInfo, map: Map, @@ -380,7 +577,8 @@ fn identity_pipeline_inner<'a>( } PipelineExecInput::Intermediate { keys, array } => { for values in array { - todo!() + let row = values_to_row(&mut schema_info, values, &keys)?; + rows.push(row); } } } @@ -441,6 +639,13 @@ impl PipelineExecInput { PipelineExecInput::Intermediate { array, .. } => array.len(), } } + + pub fn is_empty(&self) -> bool { + match self { + PipelineExecInput::Original(array) => array.is_empty(), + PipelineExecInput::Intermediate { array, .. } => array.is_empty(), + } + } } /// Identity pipeline for Greptime diff --git a/src/pipeline/src/etl/value.rs b/src/pipeline/src/etl/value.rs index fee9a2c52742..5d97c0cbd913 100644 --- a/src/pipeline/src/etl/value.rs +++ b/src/pipeline/src/etl/value.rs @@ -249,6 +249,29 @@ impl Value { } } + pub fn as_i64(&self) -> Option { + match self { + Value::Uint32(v) => Some(*v as i64), + Value::Uint16(v) => Some(*v as i64), + Value::Uint8(v) => Some(*v as i64), + Value::Int64(v) => Some(*v), + Value::Int32(v) => Some(*v as i64), + Value::Int16(v) => Some(*v as i64), + Value::Int8(v) => Some(*v as i64), + _ => None, + } + } + + pub fn as_u64(&self) -> Option { + match self { + Value::Uint64(v) => Some(*v), + Value::Uint32(v) => Some(*v as u64), + Value::Uint16(v) => Some(*v as u64), + Value::Uint8(v) => Some(*v as u64), + _ => None, + } + } + pub fn as_f64(&self) -> Option { match self { Value::Float32(v) => Some(*v as f64), diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 446306ef26a0..83038dcea78a 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -612,7 +612,7 @@ async fn run_pipeline( .map(|rows| { vec![RowInsertRequest { rows: Some(rows), - table_name: table_name, + table_name, }] }) .context(PipelineTransformSnafu) @@ -705,6 +705,8 @@ async fn run_pipeline( None, PipelineExecInput::Intermediate { array: values, + // FIXME(sunng87): this intermediate_keys is incorrect. 
what + // we will need is the keys that generated after processors keys: pipeline.intermediate_keys().clone(), }, table_name, From a9bc720b724e60100b0a8bfb0ce33558d152a6b5 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 16:00:27 +0800 Subject: [PATCH 06/32] fix: typo --- src/servers/src/http/event.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 83038dcea78a..6cea3123626d 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -556,7 +556,7 @@ fn extract_pipeline_value_by_content_type( } #[inline] -fn pipline_exec_with_intermediate_state( +fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, intermediate_state: &mut Vec, transformed: &mut Vec, @@ -643,7 +643,7 @@ async fn run_pipeline( .context(PipelineTransformSnafu) .context(PipelineSnafu)?; - pipline_exec_with_intermediate_state( + pipeline_exec_with_intermediate_state( &pipeline, &mut intermediate_state, &mut transformed, @@ -658,7 +658,7 @@ async fn run_pipeline( } PipelineExecInput::Intermediate { array, .. } => { for mut intermediate_state in array { - pipline_exec_with_intermediate_state( + pipeline_exec_with_intermediate_state( &pipeline, &mut intermediate_state, &mut transformed, From 81e57a32660fba09c683aced87457e51bc67213f Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 16:37:13 +0800 Subject: [PATCH 07/32] test: port tests to refactored apis --- src/pipeline/benches/processor.rs | 5 +- src/pipeline/src/etl.rs | 10 +- .../src/etl/transform/transformer/greptime.rs | 13 +- src/pipeline/tests/common.rs | 8 +- src/pipeline/tests/pipeline.rs | 117 ++++++++++++++++-- 5 files changed, 136 insertions(+), 17 deletions(-) diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 8cf221af5b10..01d1a293d66e 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -25,7 +25,10 @@ fn processor_mut( for v in input_values { pipeline.prepare(v, &mut payload)?; - let r = pipeline.exec_mut(&mut payload)?; + let r = pipeline + .exec_mut(&mut payload)? 
+ .into_transformed() + .expect("expect transformed result "); result.push(r); pipeline.reset_intermediate_state(&mut payload); } diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index a12b9d7b0478..50889cb37fad 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -216,13 +216,21 @@ pub enum PipelineExecOutput { } impl PipelineExecOutput { - pub(crate) fn into_transformed(self) -> Option { + pub fn into_transformed(self) -> Option { if let Self::Transformed(o) = self { Some(o) } else { None } } + + pub fn into_dispatched(self) -> Option { + if let Self::DispatchedTo(d) = self { + Some(d) + } else { + None + } + } } impl Pipeline diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 94e922ab7845..7d3752ef2880 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -675,7 +675,7 @@ mod tests { use api::v1::SemanticType; use crate::etl::transform::transformer::greptime::identity_pipeline_inner; - use crate::identity_pipeline; + use crate::{identity_pipeline, PipelineExecInput}; #[test] fn test_identify_pipeline() { @@ -700,7 +700,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -728,7 +728,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -756,7 +756,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); @@ -786,7 +786,10 @@ mod tests { }), ]; let tag_column_names = ["name".to_string(), "address".to_string()]; - let rows = identity_pipeline_inner(array, Some(tag_column_names.iter())); + let rows = identity_pipeline_inner( + PipelineExecInput::Original(array), + Some(tag_column_names.iter()), + ); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index d825c91e4cb3..781c3a30fe0f 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -34,7 +34,9 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { pipeline.prepare(value, &mut result).unwrap(); let row = pipeline .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); pipeline.reset_intermediate_state(&mut result); } @@ -43,7 +45,9 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { pipeline.prepare(input_value, &mut result).unwrap(); let row = pipeline .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); } _ => { diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index cb84e9ad0c8e..f0fa3992e4bf 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -427,7 +427,9 @@ transform: let row = pipeline .exec_mut(&mut stats) - .expect("failed to exec pipeline"); + 
.expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); let output = Rows { schema: pipeline.schemas().clone(), @@ -492,7 +494,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -598,7 +604,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -638,10 +648,10 @@ processors: - dissect: fields: - line - patterns: + patterns: - "%{+ts} %{+ts} %{content}" - date: - fields: + fields: - ts formats: - "%Y-%m-%d %H:%M:%S%.3f" @@ -660,7 +670,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -696,7 +710,12 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -751,7 +770,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let mut r = row .values @@ -770,3 +793,81 @@ transform: assert_eq!(expected, r); } + +#[test] +fn test_dispatch() { + let input_value_str1 = r#" +{ + "line": "2024-05-25 20:16:37.217 [http] hello world" +} +"#; + let input_value1 = serde_json::from_str::(input_value_str1).unwrap(); + let input_value_str2 = r#" +{ + "line": "2024-05-25 20:16:37.217 [database] hello world" +} +"#; + let input_value2 = serde_json::from_str::(input_value_str2).unwrap(); + + let pipeline_yaml = r#" +processors: + - dissect: + fields: + - line + patterns: + - "%{+ts} %{+ts} [%{logger}] %{content}" + - date: + fields: + - ts + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + +dispatcher: + field: logger + rules: + - value: http + table_part: http + pipeline: access_log_pipeline + +transform: + - fields: + - content + type: string + - field: ts + type: time + index: timestamp +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).unwrap(); + + let mut status = pipeline.init_intermediate_state(); + pipeline.prepare(input_value1, &mut status).unwrap(); + let dispatched_to = pipeline + .exec_mut(&mut status) + .unwrap() + .into_dispatched() + .expect("expect dispatched result "); + assert_eq!(dispatched_to.table_part, "http"); + assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); + + let mut status = pipeline.init_intermediate_state(); + pipeline.prepare(input_value2, &mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect 
transformed result "); + let r = row + .values + .into_iter() + .map(|v| v.value_data.unwrap()) + .collect::>(); + + let expected = vec![ + StringValue("hello world".into()), + TimestampNanosecondValue(1716668197217000000), + ]; + + assert_eq!(expected, r); +} From d37b59dcec6ad567cabf8398bbd33169771367e3 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 19:27:06 +0800 Subject: [PATCH 08/32] refactor: adapt dryrun api call --- src/servers/src/http/event.rs | 249 +++++++++++++++++++++------------- 1 file changed, 155 insertions(+), 94 deletions(-) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 6cea3123626d..1c341e1610d6 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -276,86 +276,105 @@ fn transform_ndjson_array_factory( } /// Dryrun pipeline with given data -fn dryrun_pipeline_inner( +async fn dryrun_pipeline_inner( value: Vec, - pipeline: &pipeline::Pipeline, + pipeline: Arc>, + pipeline_handler: PipelineHandlerRef, + query_ctx: &QueryContextRef, ) -> Result { - let mut intermediate_state = pipeline.init_intermediate_state(); + let db = query_ctx.get_db_string(); - let mut results = Vec::with_capacity(value.len()); - for v in value { - pipeline - .prepare(v, &mut intermediate_state) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); - } + let results = run_pipeline( + &pipeline_handler, + PipelineDefinition::Resolved(pipeline), + PipelineExecInput::Original(value), + "dry_run".to_owned(), + query_ctx, + db.as_ref(), + true, + ) + .await?; let colume_type_key = "colume_type"; let data_type_key = "data_type"; let name_key = "name"; - let schema = pipeline - .schemas() - .iter() - .map(|cs| { - let mut map = Map::new(); - map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); - map.insert( - data_type_key.to_string(), - Value::String(cs.datatype().as_str_name().to_string()), - ); - map.insert( - colume_type_key.to_string(), - Value::String(cs.semantic_type().as_str_name().to_string()), - ); - map.insert( - "fulltext".to_string(), - Value::Bool( - cs.options - .clone() - .is_some_and(|x| x.options.contains_key("fulltext")), - ), - ); - Value::Object(map) - }) - .collect::>(); - let rows = results + let results = results .into_iter() - .map(|row| { - let row = row - .values - .into_iter() - .enumerate() - .map(|(idx, v)| { - v.value_data - .map(|d| { - let mut map = Map::new(); - map.insert("value".to_string(), column_data_to_json(d)); - map.insert("key".to_string(), schema[idx][name_key].clone()); - map.insert( - "semantic_type".to_string(), - schema[idx][colume_type_key].clone(), - ); - map.insert("data_type".to_string(), schema[idx][data_type_key].clone()); - Value::Object(map) - }) - .unwrap_or(Value::Null) - }) - .collect(); - Value::Array(row) + .filter_map(|row| { + if let Some(rows) = row.rows { + let table_name = row.table_name; + let schema = rows.schema; + + let schema = schema + .iter() + .map(|cs| { + let mut map = Map::new(); + map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); + map.insert( + data_type_key.to_string(), + Value::String(cs.datatype().as_str_name().to_string()), + ); + map.insert( + colume_type_key.to_string(), + Value::String(cs.semantic_type().as_str_name().to_string()), + ); + map.insert( + "fulltext".to_string(), + Value::Bool( + 
cs.options + .clone() + .is_some_and(|x| x.options.contains_key("fulltext")), + ), + ); + Value::Object(map) + }) + .collect::>(); + + let rows = rows + .rows + .into_iter() + .map(|row| { + row.values + .into_iter() + .enumerate() + .map(|(idx, v)| { + v.value_data + .map(|d| { + let mut map = Map::new(); + map.insert("value".to_string(), column_data_to_json(d)); + map.insert( + "key".to_string(), + schema[idx][name_key].clone(), + ); + map.insert( + "semantic_type".to_string(), + schema[idx][colume_type_key].clone(), + ); + map.insert( + "data_type".to_string(), + schema[idx][data_type_key].clone(), + ); + Value::Object(map) + }) + .unwrap_or(Value::Null) + }) + .collect() + }) + .collect(); + + let mut result = Map::new(); + result.insert("schema".to_string(), Value::Array(schema)); + result.insert("rows".to_string(), Value::Array(rows)); + result.insert("table_name".to_string(), Value::String(table_name)); + let result = Value::Object(result); + Some(result) + } else { + None + } }) - .collect::>(); - let mut result = Map::new(); - result.insert("schema".to_string(), Value::Array(schema)); - result.insert("rows".to_string(), Value::Array(rows)); - let result = Value::Object(result); - Ok(Json(result).into_response()) + .collect(); + Ok(Json(Value::Array(results)).into_response()) } /// Dryrun pipeline with given data @@ -421,6 +440,9 @@ pub async fn pipeline_dryrun( ) -> Result { let handler = log_state.log_handler; + query_ctx.set_channel(Channel::Http); + let query_ctx = Arc::new(query_ctx); + match check_pipeline_dryrun_params_valid(&payload) { Some(params) => { let data = params.data; @@ -433,20 +455,29 @@ pub async fn pipeline_dryrun( to_pipeline_version(params.pipeline_version).context(PipelineSnafu)?; let pipeline_name = check_pipeline_name_exists(params.pipeline_name)?; let pipeline = handler - .get_pipeline(&pipeline_name, version, Arc::new(query_ctx)) + .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(data, &pipeline) + dryrun_pipeline_inner(data, pipeline, handler, &query_ctx).await } Some(pipeline) => { let pipeline = handler.build_pipeline(&pipeline); match pipeline { - Ok(pipeline) => match dryrun_pipeline_inner(data, &pipeline) { - Ok(response) => Ok(response), - Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( - "Failed to exec pipeline", - e, - )), - }, + Ok(pipeline) => { + match dryrun_pipeline_inner( + data, + Arc::new(pipeline), + handler, + &query_ctx, + ) + .await + { + Ok(response) => Ok(response), + Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( + "Failed to exec pipeline", + e, + )), + } + } Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( "Failed to build pipeline", e, @@ -470,14 +501,11 @@ pub async fn pipeline_dryrun( check_data_valid(value.len())?; - query_ctx.set_channel(Channel::Http); - let query_ctx = Arc::new(query_ctx); - let pipeline = handler .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(value, &pipeline) + dryrun_pipeline_inner(value, pipeline, handler, &query_ctx).await } } } @@ -593,17 +621,54 @@ fn pipeline_exec_with_intermediate_state( Ok(()) } -async fn run_pipeline( +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +enum PipelineDefinition<'a> { + Resolved(Arc>), + ByNameAndValue((&'a str, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl<'a> PipelineDefinition<'a> { + pub fn from_name(name: &'a str, version: 
PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name, version)) + } + } + + /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline + pub async fn get_pipeline( + self, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, + ) -> Result>> { + match self { + Self::Resolved(pipeline) => Ok(pipeline), + Self::ByNameAndValue((name, version)) => { + handler.get_pipeline(name, version, query_ctx.clone()).await + } + _ => { + unreachable!("Never call get_pipeline on identity.") + } + } + } +} + +async fn run_pipeline<'a>( state: &PipelineHandlerRef, - pipeline_name: &str, - version: PipelineVersion, + pipeline_definition: PipelineDefinition<'a>, values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, db: &str, is_top_level: bool, ) -> Result> { - if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + if matches!( + pipeline_definition, + PipelineDefinition::GreptimeIdentityPipeline + ) { let table = state .get_table(&table_name, &query_ctx) .await @@ -618,9 +683,7 @@ async fn run_pipeline( .context(PipelineTransformSnafu) .context(PipelineSnafu) } else { - let pipeline = state - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; + let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; let transform_timer = std::time::Instant::now(); let mut intermediate_state = pipeline.init_intermediate_state(); @@ -701,8 +764,7 @@ async fn run_pipeline( // `Vec>`. let requests = Box::pin(run_pipeline( state, - next_pipeline_name, - None, + PipelineDefinition::from_name(next_pipeline_name, None), PipelineExecInput::Intermediate { array: values, // FIXME(sunng87): this intermediate_keys is incorrect. what @@ -744,8 +806,7 @@ pub(crate) async fn ingest_logs_inner( for request in log_ingest_requests { let requests = run_pipeline( &state, - &pipeline_name, - version, + PipelineDefinition::from_name(&pipeline_name, version), PipelineExecInput::Original(request.values), request.table, &query_ctx, From dd40c090f0c9dd10402a8495848da4cdad4ca0d6 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 14:56:04 +0800 Subject: [PATCH 09/32] refactor: move pipeline execution code to a separated module --- src/servers/src/elasticsearch.rs | 6 +- src/servers/src/http/event.rs | 225 +--------------------------- src/servers/src/lib.rs | 1 + src/servers/src/pipeline.rs | 243 +++++++++++++++++++++++++++++++ 4 files changed, 252 insertions(+), 223 deletions(-) create mode 100644 src/servers/src/pipeline.rs diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index 58c6aa520a61..41bb9cbc9f76 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -31,13 +31,11 @@ use crate::error::{ status_code_to_http_status, InvalidElasticsearchInputSnafu, ParseJsonSnafu, Result as ServersResult, }; -use crate::http::event::{ - ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, -}; +use crate::http::event::{ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState}; use crate::metrics::{ METRIC_ELASTICSEARCH_LOGS_DOCS_COUNT, METRIC_ELASTICSEARCH_LOGS_INGESTION_ELAPSED, }; +use crate::pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; // The headers for every response of Elasticsearch API. 
static ELASTICSEARCH_HEADERS: Lazy = Lazy::new(|| { diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 1c341e1610d6..c2998a396671 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::BTreeMap; use std::result::Result as StdResult; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use api::v1::{Row, RowInsertRequest, RowInsertRequests, Rows}; +use api::v1::RowInsertRequests; use axum::body::HttpBody; use axum::extract::{FromRequest, Multipart, Path, Query, State}; use axum::headers::ContentType; @@ -31,20 +30,16 @@ use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; use datatypes::value::column_data_to_json; use lazy_static::lazy_static; -use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{ - DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, - PipelineVersion, -}; +use pipeline::{GreptimeTransformer, PipelineExecInput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; use crate::error::{ - status_code_to_http_status, CatalogSnafu, Error, InvalidParameterSnafu, ParseJsonSnafu, - PipelineSnafu, Result, UnsupportedContentTypeSnafu, + status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, + Result, UnsupportedContentTypeSnafu, }; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; use crate::http::result::greptime_manage_resp::GreptimedbManageResponse; @@ -53,11 +48,11 @@ use crate::http::HttpResponse; use crate::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef}; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_INGESTION_COUNTER, METRIC_HTTP_LOGS_INGESTION_ELAPSED, - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, + METRIC_SUCCESS_VALUE, }; +use crate::pipeline::{run_pipeline, PipelineDefinition}; use crate::query_handler::PipelineHandlerRef; -pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; const GREPTIME_INTERNAL_PIPELINE_NAME_PREFIX: &str = "greptime_"; lazy_static! 
{ @@ -583,214 +578,6 @@ fn extract_pipeline_value_by_content_type( }) } -#[inline] -fn pipeline_exec_with_intermediate_state( - pipeline: &Arc>, - intermediate_state: &mut Vec, - transformed: &mut Vec, - dispatched: &mut BTreeMap>>, - db: &str, - transform_timer: &Instant, - is_top_level: bool, -) -> Result<()> { - let r = pipeline - .exec_mut(intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); - } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - values.push(intermediate_state.clone()); - } else { - dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); - } - } - } - - Ok(()) -} - -/// Enum for holding information of a pipeline, which is either pipeline itself, -/// or information that be used to retrieve a pipeline from `PipelineHandler` -enum PipelineDefinition<'a> { - Resolved(Arc>), - ByNameAndValue((&'a str, PipelineVersion)), - GreptimeIdentityPipeline, -} - -impl<'a> PipelineDefinition<'a> { - pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { - if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - Self::GreptimeIdentityPipeline - } else { - Self::ByNameAndValue((name, version)) - } - } - - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline - pub async fn get_pipeline( - self, - handler: &PipelineHandlerRef, - query_ctx: &QueryContextRef, - ) -> Result>> { - match self { - Self::Resolved(pipeline) => Ok(pipeline), - Self::ByNameAndValue((name, version)) => { - handler.get_pipeline(name, version, query_ctx.clone()).await - } - _ => { - unreachable!("Never call get_pipeline on identity.") - } - } - } -} - -async fn run_pipeline<'a>( - state: &PipelineHandlerRef, - pipeline_definition: PipelineDefinition<'a>, - values: PipelineExecInput, - table_name: String, - query_ctx: &QueryContextRef, - db: &str, - is_top_level: bool, -) -> Result> { - if matches!( - pipeline_definition, - PipelineDefinition::GreptimeIdentityPipeline - ) { - let table = state - .get_table(&table_name, &query_ctx) - .await - .context(CatalogSnafu)?; - pipeline::identity_pipeline(values, table) - .map(|rows| { - vec![RowInsertRequest { - rows: Some(rows), - table_name, - }] - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu) - } else { - let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; - - let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); - - let mut transformed = Vec::with_capacity(values.len()); - let mut dispatched: BTreeMap>> = BTreeMap::new(); - - match values { - PipelineExecInput::Original(array) => { - for v in array { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - is_top_level, - )?; - - pipeline.reset_intermediate_state(&mut intermediate_state); - } - } - 
PipelineExecInput::Intermediate { array, .. } => { - for mut intermediate_state in array { - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - is_top_level, - )?; - } - } - } - - let mut results = Vec::new(); - // if current pipeline generates some transformed results, build it as - // `RowInsertRequest` and append to results. If the pipeline doesn't - // have dispatch, this will be only output of the pipeline. - if !transformed.is_empty() { - results.push(RowInsertRequest { - rows: Some(Rows { - rows: transformed, - schema: pipeline.schemas().clone(), - }), - table_name: table_name.clone(), - }) - } - - // if current pipeline contains dispatcher and has several rules, we may - // already accumulated several dispatched rules and rows. - for (dispatched_to, values) in dispatched { - // we generate the new table name according to `table_part` and - // current custom table name. - let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); - let next_pipeline_name = dispatched_to - .pipeline - .as_deref() - .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); - - // run pipeline recursively. Note that the values we are going to - // process is now intermediate version. It's in form of - // `Vec>`. - let requests = Box::pin(run_pipeline( - state, - PipelineDefinition::from_name(next_pipeline_name, None), - PipelineExecInput::Intermediate { - array: values, - // FIXME(sunng87): this intermediate_keys is incorrect. what - // we will need is the keys that generated after processors - keys: pipeline.intermediate_keys().clone(), - }, - table_name, - query_ctx, - db, - false, - )) - .await?; - - results.extend(requests); - } - - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_SUCCESS_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - - Ok(results) - } -} - pub(crate) async fn ingest_logs_inner( state: PipelineHandlerRef, pipeline_name: String, diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index 417d2646513b..a2f76a115583 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -37,6 +37,7 @@ pub mod metrics_handler; pub mod mysql; pub mod opentsdb; pub mod otlp; +mod pipeline; pub mod postgres; mod prom_row_builder; pub mod prom_store; diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs new file mode 100644 index 000000000000..ddebd4d37a6a --- /dev/null +++ b/src/servers/src/pipeline.rs @@ -0,0 +1,243 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
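(Editorial sketch, not part of the patch.) The new module below centralizes run_pipeline, which recursively re-runs dispatched rows through a follow-up pipeline. For orientation, here is a hypothetical pipeline definition with a dispatcher section; the key names mirror the constants read in src/pipeline/src/dispatcher.rs (field, rules, value, table_part, pipeline), while the top-level `dispatcher` key, the concrete field values and the constant name are assumptions made for illustration only.

// Illustrative only: rows whose `protocol` field equals "http" are dispatched to a
// table suffixed with "_http" and re-processed by the hypothetical `http_pipeline`;
// all other rows fall through to the transform section of this pipeline.
const _EXAMPLE_DISPATCHER_PIPELINE: &str = r#"
processors:
  - dissect:
      fields:
        - message
      patterns:
        - "%{protocol} %{content}"

dispatcher:
  field: protocol
  rules:
    - value: http
      table_part: http
      pipeline: http_pipeline

transform:
  - field: content
    type: string
"#;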
+ +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Instant; + +use session::context::QueryContextRef; +use snafu::ResultExt; + +use api::v1::{Row, RowInsertRequest, Rows}; +use pipeline::error::PipelineTransformSnafu; +use pipeline::{ + DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, + PipelineVersion, +}; + +use crate::error::{CatalogSnafu, PipelineSnafu, Result}; +use crate::metrics::{ + METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, +}; +use crate::query_handler::PipelineHandlerRef; + +pub(crate) const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; + +#[inline] +pub(crate) fn pipeline_exec_with_intermediate_state( + pipeline: &Arc>, + intermediate_state: &mut Vec, + transformed: &mut Vec, + dispatched: &mut BTreeMap>>, + db: &str, + transform_timer: &Instant, + is_top_level: bool, +) -> Result<()> { + let r = pipeline + .exec_mut(intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + values.push(intermediate_state.clone()); + } else { + dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); + } + } + } + + Ok(()) +} + +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +pub(crate) enum PipelineDefinition<'a> { + Resolved(Arc>), + ByNameAndValue((&'a str, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl<'a> PipelineDefinition<'a> { + pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name, version)) + } + } + + /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline + pub async fn get_pipeline( + self, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, + ) -> Result>> { + match self { + Self::Resolved(pipeline) => Ok(pipeline), + Self::ByNameAndValue((name, version)) => { + handler.get_pipeline(name, version, query_ctx.clone()).await + } + _ => { + unreachable!("Never call get_pipeline on identity.") + } + } + } +} + +pub(crate) async fn run_pipeline<'a>( + state: &PipelineHandlerRef, + pipeline_definition: PipelineDefinition<'a>, + values: PipelineExecInput, + table_name: String, + query_ctx: &QueryContextRef, + db: &str, + is_top_level: bool, +) -> Result> { + if matches!( + pipeline_definition, + PipelineDefinition::GreptimeIdentityPipeline + ) { + let table = state + .get_table(&table_name, &query_ctx) + .await + .context(CatalogSnafu)?; + pipeline::identity_pipeline(values, table) + .map(|rows| { + vec![RowInsertRequest { + rows: Some(rows), + table_name, + }] + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu) + } else { + let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; + + let transform_timer = std::time::Instant::now(); + let mut intermediate_state = pipeline.init_intermediate_state(); + + let mut transformed = Vec::with_capacity(values.len()); + let mut dispatched: BTreeMap>> = BTreeMap::new(); + + match values { 
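// (Editorial note, not part of the diff.) The two match arms below cover the two
// input shapes: `Original` carries raw serde_json values straight from the ingest
// request and must first be `prepare`d into the pipeline's intermediate state,
// while `Intermediate` carries rows that an earlier pipeline run already prepared
// and then dispatched here, so they are executed directly.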
+ PipelineExecInput::Original(array) => { + for v in array { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + pipeline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + + pipeline.reset_intermediate_state(&mut intermediate_state); + } + } + PipelineExecInput::Intermediate { array, .. } => { + for mut intermediate_state in array { + pipeline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + } + } + } + + let mut results = Vec::new(); + // if current pipeline generates some transformed results, build it as + // `RowInsertRequest` and append to results. If the pipeline doesn't + // have dispatch, this will be only output of the pipeline. + if !transformed.is_empty() { + results.push(RowInsertRequest { + rows: Some(Rows { + rows: transformed, + schema: pipeline.schemas().clone(), + }), + table_name: table_name.clone(), + }) + } + + // if current pipeline contains dispatcher and has several rules, we may + // already accumulated several dispatched rules and rows. + for (dispatched_to, values) in dispatched { + // we generate the new table name according to `table_part` and + // current custom table name. + let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); + let next_pipeline_name = dispatched_to + .pipeline + .as_deref() + .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + + // run pipeline recursively. Note that the values we are going to + // process is now intermediate version. It's in form of + // `Vec>`. + let requests = Box::pin(run_pipeline( + state, + PipelineDefinition::from_name(next_pipeline_name, None), + PipelineExecInput::Intermediate { + array: values, + // FIXME(sunng87): this intermediate_keys is incorrect. 
what + // we will need is the keys that generated after processors + keys: pipeline.intermediate_keys().clone(), + }, + table_name, + query_ctx, + db, + false, + )) + .await?; + + results.extend(requests); + } + + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_SUCCESS_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + + Ok(results) + } +} From 233d57f3691ced5018beeab8e3c272619b0ec966 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:09:11 +0800 Subject: [PATCH 10/32] refactor: update otlp pipeline execution path --- src/frontend/src/instance/otlp.rs | 12 ++- src/pipeline/src/etl.rs | 87 +++++++++------ src/pipeline/src/lib.rs | 4 +- src/servers/src/elasticsearch.rs | 2 +- src/servers/src/http/event.rs | 4 +- src/servers/src/http/otlp.rs | 29 +++-- src/servers/src/lib.rs | 1 + src/servers/src/otlp/logs.rs | 171 ++++++++++++++---------------- src/servers/src/pipeline.rs | 58 ++++------ src/servers/src/query_handler.rs | 1 + 10 files changed, 183 insertions(+), 186 deletions(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 989c6c4348fc..5b5a7fbfe10e 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -24,7 +24,7 @@ use pipeline::PipelineWay; use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult}; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; -use servers::query_handler::OpenTelemetryProtocolHandler; +use servers::query_handler::{OpenTelemetryProtocolHandler, PipelineHandlerRef}; use session::context::QueryContextRef; use snafu::ResultExt; @@ -112,6 +112,7 @@ impl OpenTelemetryProtocolHandler for Instance { #[tracing::instrument(skip_all)] async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, @@ -128,7 +129,14 @@ impl OpenTelemetryProtocolHandler for Instance { .get::>(); interceptor_ref.pre_execute(ctx.clone())?; - let (requests, rows) = otlp::logs::to_grpc_insert_requests(request, pipeline, table_name)?; + let (requests, rows) = otlp::logs::to_grpc_insert_requests( + request, + pipeline, + table_name, + &ctx, + &pipeline_handler, + ) + .await?; let _guard = if let Some(limiter) = &self.limiter { let result = limiter.limit_row_inserts(&requests); diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 50889cb37fad..275c1000f46a 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -20,6 +20,8 @@ pub mod processor; pub mod transform; pub mod value; +use std::sync::Arc; + use ahash::HashSet; use common_telemetry::debug; use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu}; @@ -32,6 +34,7 @@ use yaml_rust::YamlLoader; use crate::dispatcher::{Dispatcher, Rule}; use crate::etl::error::Result; +use crate::{GreptimeTransformer, PipelineVersion}; const DESCRIPTION: &str = "description"; const PROCESSORS: &str = "processors"; @@ -256,36 +259,36 @@ where } } - pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { - match val { - Value::Map(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.values.into_iter() { - if search_from >= self.required_keys.len() { - break; - } - - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = 
self.required_keys[search_from..] - .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value; - // next search from is always after the current key - search_from += pos; - } - } - } - Value::String(_) => { - result[0] = val; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); - } - } - Ok(()) - } + // pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { + // match val { + // Value::Map(map) => { + // let mut search_from = 0; + // // because of the key in the json map is ordered + // for (payload_key, payload_value) in map.values.into_iter() { + // if search_from >= self.required_keys.len() { + // break; + // } + + // // because of map key is ordered, required_keys is ordered too + // if let Some(pos) = self.required_keys[search_from..] + // .iter() + // .position(|k| k == &payload_key) + // { + // result[search_from + pos] = payload_value; + // // next search from is always after the current key + // search_from += pos; + // } + // } + // } + // Value::String(_) => { + // result[0] = val; + // } + // _ => { + // return PrepareValueMustBeObjectSnafu.fail(); + // } + // } + // Ok(()) + // } pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { match val { @@ -388,9 +391,29 @@ impl SelectInfo { } } +pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; + +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +pub enum PipelineDefinition { + Resolved(Arc>), + ByNameAndValue((String, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl PipelineDefinition { + pub fn from_name(name: &str, version: PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name.to_owned(), version)) + } + } +} + pub enum PipelineWay { - OtlpLog(Box), - Custom(std::sync::Arc>), + OtlpLogDirect(Box), + Pipeline(PipelineDefinition), } #[cfg(test)] diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 8ebf9ab0b9f1..9482f63a723a 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -24,8 +24,8 @@ pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; pub use etl::{ - error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineExecOutput, PipelineWay, - SelectInfo, + error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineDefinition, + PipelineExecOutput, PipelineWay, SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index 41bb9cbc9f76..c3b4eb54abc9 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -23,6 +23,7 @@ use axum::{Extension, TypedHeader}; use common_error::ext::ErrorExt; use common_telemetry::{debug, error}; use once_cell::sync::Lazy; +use pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; use serde_json::{json, Deserializer, Value}; use session::context::{Channel, QueryContext}; use snafu::{ensure, ResultExt}; @@ -35,7 +36,6 @@ use crate::http::event::{ingest_logs_inner, LogIngestRequest, LogIngesterQueryPa use crate::metrics::{ METRIC_ELASTICSEARCH_LOGS_DOCS_COUNT, 
METRIC_ELASTICSEARCH_LOGS_INGESTION_ELAPSED, }; -use crate::pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; // The headers for every response of Elasticsearch API. static ELASTICSEARCH_HEADERS: Lazy = Lazy::new(|| { diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index c2998a396671..2b2b1535cbc6 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -31,7 +31,7 @@ use common_telemetry::{error, warn}; use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::util::to_pipeline_version; -use pipeline::{GreptimeTransformer, PipelineExecInput, PipelineVersion}; +use pipeline::{GreptimeTransformer, PipelineDefinition, PipelineExecInput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -50,7 +50,7 @@ use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_INGESTION_COUNTER, METRIC_HTTP_LOGS_INGESTION_ELAPSED, METRIC_SUCCESS_VALUE, }; -use crate::pipeline::{run_pipeline, PipelineDefinition}; +use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; const GREPTIME_INTERNAL_PIPELINE_NAME_PREFIX: &str = "greptime_"; diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index b5c4607c29e3..6657bfc845a3 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -30,7 +30,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{ ExportTraceServiceRequest, ExportTraceServiceResponse, }; use pipeline::util::to_pipeline_version; -use pipeline::PipelineWay; +use pipeline::{PipelineDefinition, PipelineWay}; use prost::Message; use session::context::{Channel, QueryContext}; use snafu::prelude::*; @@ -39,7 +39,7 @@ use super::header::{write_cost_header_map, CONTENT_TYPE_PROTOBUF}; use crate::error::{self, PipelineSnafu, Result}; use crate::http::extractor::{LogTableName, PipelineInfo, SelectInfoWrapper, TraceTableName}; use crate::otlp::trace::TRACE_TABLE_NAME; -use crate::query_handler::OpenTelemetryProtocolHandlerRef; +use crate::query_handler::{OpenTelemetryProtocolHandlerRef, PipelineHandler}; #[axum_macros::debug_handler] #[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "metrics"))] @@ -117,25 +117,20 @@ pub async fn logs( .start_timer(); let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; - let pipeline_way = if let Some(pipeline_name) = &pipeline_info.pipeline_name { - let pipeline_version = - to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?; - let pipeline = match handler - .get_pipeline(pipeline_name, pipeline_version, query_ctx.clone()) - .await - { - Ok(p) => p, - Err(e) => { - return Err(e); - } - }; - PipelineWay::Custom(pipeline) + let pipeline = if let Some(pipeline_name) = pipeline_info.pipeline_name { + PipelineWay::Pipeline(PipelineDefinition::from_name( + &pipeline_name, + to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?, + )) } else { - PipelineWay::OtlpLog(Box::new(select_info)) + PipelineWay::OtlpLogDirect(Box::new(select_info)) }; + // here we use nightly feature `trait_upcasting` to convert handler to + // pipeline_handler + let pipeline_handler: Arc = handler.clone(); handler - .logs(request, pipeline_way, tablename, query_ctx) + .logs(pipeline_handler, request, pipeline, tablename, query_ctx) .await .map(|o| OtlpResponse { resp_body: ExportLogsServiceResponse { diff --git 
a/src/servers/src/lib.rs b/src/servers/src/lib.rs index a2f76a115583..423d640759f8 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -17,6 +17,7 @@ #![feature(exclusive_wrapper)] #![feature(let_chains)] #![feature(if_let_guard)] +#![feature(trait_upcasting)] use datafusion_expr::LogicalPlan; use datatypes::schema::Schema; diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index f11cd4ff3c68..348dcdd2d96a 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,14 +25,16 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::{Array, Map, PipelineWay, SchemaInfo, SelectInfo, Value as PipelineValue}; -use snafu::{ensure, ResultExt}; +use pipeline::{PipelineExecInput, PipelineWay, SchemaInfo, SelectInfo}; +use serde_json::{Map, Value}; +use session::context::QueryContextRef; +use snafu::ensure; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; -use crate::error::{ - IncompatibleSchemaSnafu, OpenTelemetryLogSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, -}; +use crate::error::{IncompatibleSchemaSnafu, Result, UnsupportedJsonDataTypeForTagSnafu}; +use crate::pipeline::run_pipeline; +use crate::query_handler::PipelineHandlerRef; pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; @@ -43,13 +45,15 @@ pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; /// for data structure of OTLP metrics. /// /// Returns `InsertRequests` and total number of rows to ingest -pub fn to_grpc_insert_requests( +pub async fn to_grpc_insert_requests( request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, + query_ctx: &QueryContextRef, + pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { match pipeline { - PipelineWay::OtlpLog(select_info) => { + PipelineWay::OtlpLogDirect(select_info) => { let rows = parse_export_logs_service_request_to_rows(request, select_info)?; let len = rows.rows.len(); let insert_request = RowInsertRequest { @@ -63,53 +67,48 @@ pub fn to_grpc_insert_requests( len, )) } - PipelineWay::Custom(p) => { - let request = parse_export_logs_service_request(request); - let mut result = Vec::new(); - let mut intermediate_state = p.init_intermediate_state(); - for v in request { - p.prepare_pipeline_value(v, &mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - let r = p - .exec_mut(&mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - result.push(r); - } - let len = result.len(); - let rows = Rows { - schema: p.schemas().clone(), - rows: result, - }; - let insert_request = RowInsertRequest { - rows: Some(rows), + PipelineWay::Pipeline(pipeline_def) => { + let data = parse_export_logs_service_request(request); + + let db_string = query_ctx.get_db_string(); + + let inserts = run_pipeline( + &pipeline_handler, + pipeline_def, + PipelineExecInput::Original(data), table_name, - }; - let insert_requests = RowInsertRequests { - inserts: vec![insert_request], - }; + query_ctx, + db_string.as_ref(), + true, + ) + .await?; + let len = inserts + .iter() + .map(|insert| { + insert + .rows + .as_ref() + .map(|rows| rows.rows.len()) + .unwrap_or(0) + }) + .sum(); + + let insert_requests = RowInsertRequests { inserts }; Ok((insert_requests, len)) } } } 
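(Editorial sketch, not part of the patch.) Because a dispatcher can fan rows out into several RowInsertRequests for derived tables, the row count returned above is summed across all of them. A standalone version of that aggregation, assuming only the api::v1 types already used in this diff, could look like:

use api::v1::RowInsertRequest;

// Counts rows across every insert request produced by one pipeline run,
// including requests generated for dispatched sub-tables.
fn total_rows(inserts: &[RowInsertRequest]) -> usize {
    inserts
        .iter()
        .map(|req| req.rows.as_ref().map(|rows| rows.rows.len()).unwrap_or(0))
        .sum()
}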
-fn scope_to_pipeline_value( - scope: Option, -) -> (PipelineValue, PipelineValue, PipelineValue) { +fn scope_to_pipeline_value(scope: Option) -> (Value, Value, Value) { scope .map(|x| { ( - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }), - PipelineValue::String(x.version), - PipelineValue::String(x.name), + Value::Object(key_value_to_map(x.attributes)), + Value::String(x.version), + Value::String(x.name), ) }) - .unwrap_or(( - PipelineValue::Null, - PipelineValue::Null, - PipelineValue::Null, - )) + .unwrap_or((Value::Null, Value::Null, Value::Null)) } fn scope_to_jsonb( @@ -128,51 +127,43 @@ fn scope_to_jsonb( fn log_to_pipeline_value( log: LogRecord, - resource_schema_url: PipelineValue, - resource_attr: PipelineValue, - scope_schema_url: PipelineValue, - scope_name: PipelineValue, - scope_version: PipelineValue, - scope_attrs: PipelineValue, -) -> PipelineValue { - let log_attrs = PipelineValue::Map(Map { - values: key_value_to_map(log.attributes), - }); - let mut map = BTreeMap::new(); - map.insert( - "Timestamp".to_string(), - PipelineValue::Uint64(log.time_unix_nano), - ); + resource_schema_url: Value, + resource_attr: Value, + scope_schema_url: Value, + scope_name: Value, + scope_version: Value, + scope_attrs: Value, +) -> Value { + let log_attrs = Value::Object(key_value_to_map(log.attributes)); + let mut map = Map::new(); + map.insert("Timestamp".to_string(), Value::from(log.time_unix_nano)); map.insert( "ObservedTimestamp".to_string(), - PipelineValue::Uint64(log.observed_time_unix_nano), + Value::from(log.observed_time_unix_nano), ); // need to be convert to string map.insert( "TraceId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.trace_id)), + Value::String(bytes_to_hex_string(&log.trace_id)), ); map.insert( "SpanId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.span_id)), - ); - map.insert("TraceFlags".to_string(), PipelineValue::Uint32(log.flags)); - map.insert( - "SeverityText".to_string(), - PipelineValue::String(log.severity_text), + Value::String(bytes_to_hex_string(&log.span_id)), ); + map.insert("TraceFlags".to_string(), Value::from(log.flags)); + map.insert("SeverityText".to_string(), Value::String(log.severity_text)); map.insert( "SeverityNumber".to_string(), - PipelineValue::Int32(log.severity_number), + Value::from(log.severity_number), ); // need to be convert to string map.insert( "Body".to_string(), log.body .as_ref() - .map(|x| PipelineValue::String(log_body_to_string(x))) - .unwrap_or(PipelineValue::Null), + .map(|x| Value::String(log_body_to_string(x))) + .unwrap_or(Value::Null), ); map.insert("ResourceSchemaUrl".to_string(), resource_schema_url); @@ -182,7 +173,7 @@ fn log_to_pipeline_value( map.insert("ScopeVersion".to_string(), scope_version); map.insert("ScopeAttributes".to_string(), scope_attrs); map.insert("LogAttributes".to_string(), log_attrs); - PipelineValue::Map(Map { values: map }) + Value::Object(map) } fn build_otlp_logs_identity_schema() -> Vec { @@ -699,22 +690,18 @@ struct ParseInfo { /// transform otlp logs request to pipeline value /// https://opentelemetry.io/docs/concepts/signals/logs/ -fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { +fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { let mut result = Vec::new(); for r in request.resource_logs { let resource_attr = r .resource - .map(|x| { - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }) - }) - .unwrap_or(PipelineValue::Null); - let 
resource_schema_url = PipelineValue::String(r.schema_url); + .map(|x| Value::Object(key_value_to_map(x.attributes))) + .unwrap_or(Value::Null); + let resource_schema_url = Value::String(r.schema_url); for scope_logs in r.scope_logs { let (scope_attrs, scope_version, scope_name) = scope_to_pipeline_value(scope_logs.scope); - let scope_schema_url = PipelineValue::String(scope_logs.schema_url); + let scope_schema_url = Value::String(scope_logs.schema_url); for log in scope_logs.log_records { let value = log_to_pipeline_value( log, @@ -733,41 +720,41 @@ fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec
<PipelineValue> {
-fn any_value_to_pipeline_value(value: any_value::Value) ->
PipelineValue { +fn any_value_to_pipeline_value(value: any_value::Value) -> Value { match value { - any_value::Value::StringValue(s) => PipelineValue::String(s), - any_value::Value::IntValue(i) => PipelineValue::Int64(i), - any_value::Value::DoubleValue(d) => PipelineValue::Float64(d), - any_value::Value::BoolValue(b) => PipelineValue::Boolean(b), + any_value::Value::StringValue(s) => Value::String(s), + any_value::Value::IntValue(i) => Value::from(i), + any_value::Value::DoubleValue(d) => Value::from(d), + any_value::Value::BoolValue(b) => Value::Bool(b), any_value::Value::ArrayValue(a) => { let values = a .values .into_iter() .map(|v| match v.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }) .collect(); - PipelineValue::Array(Array { values }) + Value::Array(values) } any_value::Value::KvlistValue(kv) => { let value = key_value_to_map(kv.values); - PipelineValue::Map(Map { values: value }) + Value::Object(value) } - any_value::Value::BytesValue(b) => PipelineValue::String(bytes_to_hex_string(&b)), + any_value::Value::BytesValue(b) => Value::String(bytes_to_hex_string(&b)), } } // convert otlp keyValue vec to map -fn key_value_to_map(key_values: Vec) -> BTreeMap { - let mut map = BTreeMap::new(); +fn key_value_to_map(key_values: Vec) -> Map { + let mut map = Map::new(); for kv in key_values { let value = match kv.value { Some(value) => match value.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }, - None => PipelineValue::Null, + None => Value::Null, }; map.insert(kv.key.clone(), value); } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index ddebd4d37a6a..25914b42df0d 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -22,8 +22,8 @@ use snafu::ResultExt; use api::v1::{Row, RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ - DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, - PipelineVersion, + DispatchedTo, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecInput, + PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; use crate::error::{CatalogSnafu, PipelineSnafu, Result}; @@ -32,8 +32,6 @@ use crate::metrics::{ }; use crate::query_handler::PipelineHandlerRef; -pub(crate) const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; - #[inline] pub(crate) fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, @@ -72,44 +70,28 @@ pub(crate) fn pipeline_exec_with_intermediate_state( Ok(()) } -/// Enum for holding information of a pipeline, which is either pipeline itself, -/// or information that be used to retrieve a pipeline from `PipelineHandler` -pub(crate) enum PipelineDefinition<'a> { - Resolved(Arc>), - ByNameAndValue((&'a str, PipelineVersion)), - GreptimeIdentityPipeline, -} - -impl<'a> PipelineDefinition<'a> { - pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { - if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - Self::GreptimeIdentityPipeline - } else { - Self::ByNameAndValue((name, version)) +/// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline +pub async fn get_pipeline( + pipeline_def: PipelineDefinition, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, +) -> Result>> { + match pipeline_def { + PipelineDefinition::Resolved(pipeline) => Ok(pipeline), + PipelineDefinition::ByNameAndValue((name, version)) => { + handler + 
.get_pipeline(&name, version, query_ctx.clone()) + .await } - } - - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline - pub async fn get_pipeline( - self, - handler: &PipelineHandlerRef, - query_ctx: &QueryContextRef, - ) -> Result>> { - match self { - Self::Resolved(pipeline) => Ok(pipeline), - Self::ByNameAndValue((name, version)) => { - handler.get_pipeline(name, version, query_ctx.clone()).await - } - _ => { - unreachable!("Never call get_pipeline on identity.") - } + _ => { + unreachable!("Never call get_pipeline on identity.") } } } -pub(crate) async fn run_pipeline<'a>( +pub(crate) async fn run_pipeline( state: &PipelineHandlerRef, - pipeline_definition: PipelineDefinition<'a>, + pipeline_definition: PipelineDefinition, values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, @@ -121,7 +103,7 @@ pub(crate) async fn run_pipeline<'a>( PipelineDefinition::GreptimeIdentityPipeline ) { let table = state - .get_table(&table_name, &query_ctx) + .get_table(&table_name, query_ctx) .await .context(CatalogSnafu)?; pipeline::identity_pipeline(values, table) @@ -134,7 +116,7 @@ pub(crate) async fn run_pipeline<'a>( .context(PipelineTransformSnafu) .context(PipelineSnafu) } else { - let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; + let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; let transform_timer = std::time::Instant::now(); let mut intermediate_state = pipeline.init_intermediate_state(); diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index d450815a4a0c..9029a8fc2a99 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -110,6 +110,7 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, From af64c069da5a0eff8d253deea78239ca6aa2434d Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:12:39 +0800 Subject: [PATCH 11/32] fmt: format imports --- src/servers/src/pipeline.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 25914b42df0d..c4b01b489ef6 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -16,15 +16,14 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; -use session::context::QueryContextRef; -use snafu::ResultExt; - use api::v1::{Row, RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecInput, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; +use session::context::QueryContextRef; +use snafu::ResultExt; use crate::error::{CatalogSnafu, PipelineSnafu, Result}; use crate::metrics::{ From a980314e3a8a488f202c1e9f2918cdc9f43446d1 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:16:26 +0800 Subject: [PATCH 12/32] fix: compilation --- src/frontend/src/instance/otlp.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 5b5a7fbfe10e..8c33f4dfdf4b 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -134,7 +134,7 @@ impl OpenTelemetryProtocolHandler for Instance { pipeline, table_name, &ctx, - &pipeline_handler, + pipeline_handler, ) .await?; From 
5bd87988964badb64112379722124e9849e59578 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 18:20:57 +0800 Subject: [PATCH 13/32] fix: resolve residual issues --- src/pipeline/src/etl.rs | 31 ------------------------------- src/servers/src/error.rs | 7 ------- tests-integration/tests/http.rs | 12 ++++++------ 3 files changed, 6 insertions(+), 44 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 275c1000f46a..f7d3a1c3bbe7 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -259,37 +259,6 @@ where } } - // pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { - // match val { - // Value::Map(map) => { - // let mut search_from = 0; - // // because of the key in the json map is ordered - // for (payload_key, payload_value) in map.values.into_iter() { - // if search_from >= self.required_keys.len() { - // break; - // } - - // // because of map key is ordered, required_keys is ordered too - // if let Some(pos) = self.required_keys[search_from..] - // .iter() - // .position(|k| k == &payload_key) - // { - // result[search_from + pos] = payload_value; - // // next search from is always after the current key - // search_from += pos; - // } - // } - // } - // Value::String(_) => { - // result[0] = val; - // } - // _ => { - // return PrepareValueMustBeObjectSnafu.fail(); - // } - // } - // Ok(()) - // } - pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { match val { serde_json::Value::Object(map) => { diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index b8882b0b7299..f285019a9d69 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -554,12 +554,6 @@ pub enum Error { location: Location, }, - #[snafu(display("OpenTelemetry log error"))] - OpenTelemetryLog { - source: pipeline::etl_error::Error, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Unsupported json data type for tag: {} {}", key, ty))] UnsupportedJsonDataTypeForTag { key: String, @@ -658,7 +652,6 @@ impl ErrorExt for Error { | InvalidLokiPayload { .. } | UnsupportedContentType { .. } | TimestampOverflow { .. } - | OpenTelemetryLog { .. } | UnsupportedJsonDataTypeForTag { .. } | InvalidTableName { .. } | PrepareStatementNotFound { .. 
} diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 2c912aa0af60..66fc21a6fd3d 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1440,8 +1440,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } @@ -1470,8 +1470,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } @@ -1498,8 +1498,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } From b43a6c83139a03e7589bf22c6ddbdb8b16f39c10 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 22 Jan 2025 14:48:55 +0800 Subject: [PATCH 14/32] refactor: address review comments --- src/servers/src/pipeline.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index c4b01b489ef6..5d11df2eff2b 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -32,23 +32,20 @@ use crate::metrics::{ use crate::query_handler::PipelineHandlerRef; #[inline] -pub(crate) fn pipeline_exec_with_intermediate_state( +fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, intermediate_state: &mut Vec, transformed: &mut Vec, dispatched: &mut BTreeMap>>, db: &str, transform_timer: &Instant, - is_top_level: bool, ) -> Result<()> { let r = pipeline .exec_mut(intermediate_state) .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); }) .context(PipelineTransformSnafu) .context(PipelineSnafu)?; @@ -118,22 +115,20 @@ pub(crate) async fn run_pipeline( let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); let mut transformed = Vec::with_capacity(values.len()); let mut dispatched: BTreeMap>> = BTreeMap::new(); match values { PipelineExecInput::Original(array) => { + let mut intermediate_state = pipeline.init_intermediate_state(); for v in array { pipeline .prepare(v, &mut intermediate_state) .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); }) .context(PipelineTransformSnafu) .context(PipelineSnafu)?; @@ -145,7 +140,6 @@ pub(crate) async fn run_pipeline( &mut dispatched, db, &transform_timer, - is_top_level, )?; pipeline.reset_intermediate_state(&mut intermediate_state); @@ -160,7 +154,6 @@ pub(crate) async fn 
run_pipeline( &mut dispatched, db, &transform_timer, - is_top_level, )?; } } From ad05a39ed70795296f5026b984e9b557067efe6a Mon Sep 17 00:00:00 2001 From: paomian Date: Thu, 23 Jan 2025 17:16:44 +0800 Subject: [PATCH 15/32] chore: use btreemap as pipeline intermediate status trait modify --- src/pipeline/src/etl.rs | 676 +++++++----------- src/pipeline/src/etl/processor.rs | 137 +--- src/pipeline/src/etl/transform.rs | 172 +---- .../src/etl/transform/transformer/greptime.rs | 112 +-- 4 files changed, 338 insertions(+), 759 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index f7d3a1c3bbe7..61b72efb470e 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -20,6 +20,7 @@ pub mod processor; pub mod transform; pub mod value; +use std::collections::BTreeMap; use std::sync::Arc; use ahash::HashSet; @@ -28,7 +29,7 @@ use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSn use itertools::Itertools; use processor::{Processor, ProcessorBuilder, Processors}; use snafu::{OptionExt, ResultExt}; -use transform::{TransformBuilders, Transformer, Transforms}; +use transform::{Transformer, Transforms}; use value::Value; use yaml_rust::YamlLoader; @@ -59,99 +60,17 @@ where let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); - let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() { + let processors = if let Some(v) = doc[PROCESSORS].as_vec() { v.try_into()? } else { - processor::ProcessorBuilderList::default() + Processors::default() }; - let transform_builders = - if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) { - v.try_into()? - } else { - TransformBuilders::default() - }; - - let processors_required_keys = &processor_builder_list.input_keys; - let processors_output_keys = &processor_builder_list.output_keys; - let processors_required_original_keys = &processor_builder_list.original_input_keys; - - debug!( - "processors_required_original_keys: {:?}", - processors_required_original_keys - ); - debug!("processors_required_keys: {:?}", processors_required_keys); - debug!("processors_output_keys: {:?}", processors_output_keys); - - let transforms_required_keys = &transform_builders.required_keys; - let mut tr_keys = Vec::with_capacity(50); - for key in transforms_required_keys.iter() { - if !processors_output_keys.contains(key) - && !processors_required_original_keys.contains(key) - { - tr_keys.push(key.clone()); - } - } - - let mut required_keys = processors_required_original_keys.clone(); - - required_keys.append(&mut tr_keys); - required_keys.sort(); - - debug!("required_keys: {:?}", required_keys); - - // intermediate keys are the keys that all processor and transformer required - let ordered_intermediate_keys: Vec = [ - processors_required_keys, - transforms_required_keys, - processors_output_keys, - ] - .iter() - .flat_map(|l| l.iter()) - .collect::>() - .into_iter() - .sorted() - .cloned() - .collect_vec(); - - let mut final_intermediate_keys = Vec::with_capacity(ordered_intermediate_keys.len()); - let mut intermediate_keys_exclude_original = - Vec::with_capacity(ordered_intermediate_keys.len()); - - for key_name in ordered_intermediate_keys.iter() { - if required_keys.contains(key_name) { - final_intermediate_keys.push(key_name.clone()); - } else { - intermediate_keys_exclude_original.push(key_name.clone()); - } - } - - final_intermediate_keys.extend(intermediate_keys_exclude_original); - - let output_keys = transform_builders.output_keys.clone(); - - let processors_kind_list = 
processor_builder_list - .processor_builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys)) - .collect::>>()?; - let processors = Processors { - processors: processors_kind_list, - required_keys: processors_required_keys.clone(), - output_keys: processors_output_keys.clone(), - required_original_keys: processors_required_original_keys.clone(), - }; - - let transfor_list = transform_builders - .builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys, &output_keys)) - .collect::>>()?; - - let transformers = Transforms { - transforms: transfor_list, - required_keys: transforms_required_keys.clone(), - output_keys: output_keys.clone(), + let transformers = if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) + { + v.try_into()? + } else { + Transforms::default() }; let transformer = T::new(transformers)?; @@ -167,9 +86,6 @@ where processors, transformer, dispatcher, - required_keys, - output_keys, - intermediate_keys: final_intermediate_keys, }) } Content::Json(_) => unimplemented!(), @@ -185,14 +101,6 @@ where processors: processor::Processors, dispatcher: Option, transformer: T, - /// required keys for the preprocessing from map data from user - /// include all processor required and transformer required keys - required_keys: Vec, - /// all output keys from the transformer - output_keys: Vec, - /// intermediate keys from the processors - intermediate_keys: Vec, - // pub on_failure: processor::Processors, } /// Where the pipeline executed is dispatched to, with context information @@ -240,64 +148,31 @@ impl Pipeline where T: Transformer, { - pub fn exec_mut(&self, val: &mut Vec) -> Result> { - for processor in self.processors.iter() { - processor.exec_mut(val)?; - } - - let matched_rule = self - .dispatcher - .as_ref() - .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); - - match matched_rule { - None => self - .transformer - .transform_mut(val) - .map(PipelineExecOutput::Transformed), - Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), - } - } - - pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { - match val { - serde_json::Value::Object(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.into_iter() { - if search_from >= self.required_keys.len() { - break; - } - - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = self.required_keys[search_from..] 
- .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value.try_into()?; - // next search from is always after the current key - search_from += pos; - } - } - } - serde_json::Value::String(_) => { - result[0] = val.try_into()?; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); - } - } - Ok(()) + pub fn exec_mut( + &self, + val: &mut BTreeMap, + ) -> Result> { + // for processor in self.processors.iter() { + // processor.exec_mut(val)?; + // } + + // let matched_rule = self + // .dispatcher + // .as_ref() + // .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); + + // match matched_rule { + // None => self + // .transformer + // .transform_mut(val) + // .map(PipelineExecOutput::Transformed), + // Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + // } + todo!() } - pub fn init_intermediate_state(&self) -> Vec { - vec![Value::Null; self.intermediate_keys.len()] - } - - pub fn reset_intermediate_state(&self, result: &mut [Value]) { - for i in result { - *i = Value::Null; - } + pub fn prepare(&self, val: serde_json::Value) -> Result> { + todo!() } pub fn processors(&self) -> &processor::Processors { @@ -308,21 +183,6 @@ where &self.transformer } - /// Required fields in user-supplied data - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// All output keys from the pipeline - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// intermediate keys from the processors - pub fn intermediate_keys(&self) -> &Vec { - &self.intermediate_keys - } - pub fn schemas(&self) -> &Vec { self.transformer.schemas() } @@ -394,242 +254,242 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; - #[test] - fn test_pipeline_prepare() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' -processors: - - csv: - field: my_field - target_fields: field1, field2 -transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - - assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.values[2].value_data { - Some(ValueData::TimestampNanosecondValue(v)) => { - assert_ne!(*v, 0); - } - _ => panic!("expect null value"), - } - } - - #[test] - fn test_dissect_pipeline() { - let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); - let pipeline_str = r#"processors: - - dissect: - fields: - - message - patterns: - - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - - timestamp: - fields: - - ts - formats: - - "%d/%b/%Y:%H:%M:%S %z" - -transform: - - fields: - - ip - - username - - method - - path - - proto - type: string - - fields: - - status - type: uint16 - - fields: - - bytes - type: uint32 - - field: ts - type: timestamp, ns 
- index: time"#; - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline - .prepare(serde_json::Value::String(message), &mut payload) - .unwrap(); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - let sechema = pipeline.schemas(); - - assert_eq!(sechema.len(), result.values.len()); - let test = vec![ - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("129.37.245.88".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("meln1ks".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("PATCH".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue( - "/observability/metrics/production".into(), - )), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("HTTP/1.0".into())), - ), - ( - ColumnDataType::Uint16 as i32, - Some(ValueData::U16Value(501)), - ), - ( - ColumnDataType::Uint32 as i32, - Some(ValueData::U32Value(33085)), - ), - ( - ColumnDataType::TimestampNanosecond as i32, - Some(ValueData::TimestampNanosecondValue(1722493367000000000)), - ), - ]; - for i in 0..sechema.len() { - let schema = &sechema[i]; - let value = &result.values[i]; - assert_eq!(schema.datatype, test[i].0); - assert_eq!(value.value_data, test[i].1); - } - } - - #[test] - fn test_csv_pipeline() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#" -description: Pipeline for Apache Tomcat -processors: - - csv: - field: my_field - target_fields: field1, field2 -transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; - - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.values[2].value_data { - Some(ValueData::TimestampNanosecondValue(v)) => { - assert_ne!(*v, 0); - } - _ => panic!("expect null value"), - } - } - - #[test] - fn test_date_pipeline() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar", - "test_time": "2014-5-17T04:34:56+00:00" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#" ---- -description: Pipeline for Apache Tomcat - -processors: - - timestamp: - field: test_time - -transform: - - field: test_time - type: timestamp, ns - index: time -"#; - - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let schema = pipeline.schemas().clone(); - let mut result = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut result).unwrap(); - let row = pipeline - .exec_mut(&mut result) - .unwrap() - .into_transformed() - .unwrap(); - let output = Rows { - schema, - rows: vec![row], - }; - let schemas = output.schema; - - assert_eq!(schemas.len(), 1); - let schema = schemas[0].clone(); - assert_eq!("test_time", schema.column_name); - 
assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); - assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - - let row = output.rows[0].clone(); - assert_eq!(1, row.values.len()); - let value_data = row.values[0].clone().value_data; - assert_eq!( - Some(v1::value::ValueData::TimestampNanosecondValue( - 1400301296000000000 - )), - value_data - ); - } +// #[test] +// fn test_pipeline_prepare() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' +// processors: +// - csv: +// field: my_field +// target_fields: field1, field2 +// transform: +// - field: field1 +// type: uint32 +// - field: field2 +// type: uint32 +// "#; +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut payload).unwrap(); +// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); +// assert_eq!( +// payload, +// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] +// ); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); + +// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); +// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); +// match &result.values[2].value_data { +// Some(ValueData::TimestampNanosecondValue(v)) => { +// assert_ne!(*v, 0); +// } +// _ => panic!("expect null value"), +// } +// } + +// #[test] +// fn test_dissect_pipeline() { +// let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); +// let pipeline_str = r#"processors: +// - dissect: +// fields: +// - message +// patterns: +// - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" +// - timestamp: +// fields: +// - ts +// formats: +// - "%d/%b/%Y:%H:%M:%S %z" + +// transform: +// - fields: +// - ip +// - username +// - method +// - path +// - proto +// type: string +// - fields: +// - status +// type: uint16 +// - fields: +// - bytes +// type: uint32 +// - field: ts +// type: timestamp, ns +// index: time"#; +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline +// .prepare(serde_json::Value::String(message), &mut payload) +// .unwrap(); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); +// let sechema = pipeline.schemas(); + +// assert_eq!(sechema.len(), result.values.len()); +// let test = vec![ +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("129.37.245.88".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("meln1ks".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("PATCH".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue( +// "/observability/metrics/production".into(), +// )), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("HTTP/1.0".into())), +// ), +// ( +// ColumnDataType::Uint16 as i32, +// Some(ValueData::U16Value(501)), +// ), +// ( +// ColumnDataType::Uint32 as i32, +// Some(ValueData::U32Value(33085)), +// ), +// ( +// 
ColumnDataType::TimestampNanosecond as i32, +// Some(ValueData::TimestampNanosecondValue(1722493367000000000)), +// ), +// ]; +// for i in 0..sechema.len() { +// let schema = &sechema[i]; +// let value = &result.values[i]; +// assert_eq!(schema.datatype, test[i].0); +// assert_eq!(value.value_data, test[i].1); +// } +// } + +// #[test] +// fn test_csv_pipeline() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#" +// description: Pipeline for Apache Tomcat +// processors: +// - csv: +// field: my_field +// target_fields: field1, field2 +// transform: +// - field: field1 +// type: uint32 +// - field: field2 +// type: uint32 +// "#; + +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut payload).unwrap(); +// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); +// assert_eq!( +// payload, +// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] +// ); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); +// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); +// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); +// match &result.values[2].value_data { +// Some(ValueData::TimestampNanosecondValue(v)) => { +// assert_ne!(*v, 0); +// } +// _ => panic!("expect null value"), +// } +// } + +// #[test] +// fn test_date_pipeline() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar", +// "test_time": "2014-5-17T04:34:56+00:00" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#" +// --- +// description: Pipeline for Apache Tomcat + +// processors: +// - timestamp: +// field: test_time + +// transform: +// - field: test_time +// type: timestamp, ns +// index: time +// "#; + +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let schema = pipeline.schemas().clone(); +// let mut result = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut result).unwrap(); +// let row = pipeline +// .exec_mut(&mut result) +// .unwrap() +// .into_transformed() +// .unwrap(); +// let output = Rows { +// schema, +// rows: vec![row], +// }; +// let schemas = output.schema; + +// assert_eq!(schemas.len(), 1); +// let schema = schemas[0].clone(); +// assert_eq!("test_time", schema.column_name); +// assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); +// assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + +// let row = output.rows[0].clone(); +// assert_eq!(1, row.values.len()); +// let value_data = row.values[0].clone().value_data; +// assert_eq!( +// Some(v1::value::ValueData::TimestampNanosecondValue( +// 1400301296000000000 +// )), +// value_data +// ); +// } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index bf37f1f8ce7f..b6df91204c39 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -27,6 +27,8 @@ pub mod regex; pub mod timestamp; pub mod urlencoding; +use std::collections::BTreeMap; + use ahash::{HashSet, HashSetExt}; use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; use csv::{CsvProcessor, CsvProcessorBuilder}; @@ -80,7 +82,7 @@ pub 
trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: &mut Vec) -> Result<()>; + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()>; } #[derive(Debug)] @@ -114,45 +116,12 @@ pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { fn build(self, intermediate_keys: &[String]) -> Result; } -#[derive(Debug)] -#[enum_dispatch] -pub enum ProcessorBuilders { - Cmcd(CmcdProcessorBuilder), - Csv(CsvProcessorBuilder), - Dissect(DissectProcessorBuilder), - Gsub(GsubProcessorBuilder), - Join(JoinProcessorBuilder), - Letter(LetterProcessorBuilder), - Regex(RegexProcessorBuilder), - Timestamp(TimestampProcessorBuilder), - UrlEncoding(UrlEncodingProcessorBuilder), - Epoch(EpochProcessorBuilder), - Date(DateProcessorBuilder), - JsonPath(JsonPathProcessorBuilder), - Decolorize(DecolorizeProcessorBuilder), - Digest(DigestProcessorBuilder), -} - -#[derive(Debug, Default)] -pub struct ProcessorBuilderList { - pub(crate) processor_builders: Vec, - pub(crate) input_keys: Vec, - pub(crate) output_keys: Vec, - pub(crate) original_input_keys: Vec, -} - #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors /// The order of processors is important /// The output of the first processor will be the input of the second processor pub processors: Vec, - /// all required keys in all processors - pub required_keys: Vec, - /// all required keys in user-supplied data, not pipeline output fields - pub required_original_keys: Vec, - /// all output keys in all processors - pub output_keys: Vec, } impl std::ops::Deref for Processors { @@ -169,80 +138,22 @@ impl std::ops::DerefMut for Processors { } } -impl Processors { - /// A collection of all the processor's required input fields - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// A collection of all the processor's output fields - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// Required fields in user-supplied data, not pipeline output fields. 
- pub fn required_original_keys(&self) -> &Vec { - &self.required_original_keys - } -} - -impl TryFrom<&Vec> for ProcessorBuilderList { +impl TryFrom<&Vec> for Processors { type Error = Error; fn try_from(vec: &Vec) -> Result { let mut processors_builders = vec![]; - let mut all_output_keys = HashSet::with_capacity(50); - let mut all_required_keys = HashSet::with_capacity(50); - let mut all_required_original_keys = HashSet::with_capacity(50); for doc in vec { let processor = parse_processor(doc)?; processors_builders.push(processor); } - - for processor in processors_builders.iter() { - { - // get all required keys - let processor_required_keys = processor.input_keys(); - - for key in &processor_required_keys { - if !all_output_keys.contains(key) { - all_required_original_keys.insert(*key); - } - } - - all_required_keys.extend(processor_required_keys); - - let processor_output_keys = processor.output_keys().into_iter(); - all_output_keys.extend(processor_output_keys); - } - } - - let all_required_keys = all_required_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_output_keys = all_output_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_required_original_keys = all_required_original_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - - Ok(ProcessorBuilderList { - processor_builders: processors_builders, - input_keys: all_required_keys, - output_keys: all_output_keys, - original_input_keys: all_required_original_keys, + Ok(Processors { + processors: processors_builders, }) } } -fn parse_processor(doc: &yaml_rust::Yaml) -> Result { +fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let map = doc.as_hash().context(ProcessorMustBeMapSnafu)?; let key = map.keys().next().context(ProcessorMustHaveStringKeySnafu)?; @@ -255,39 +166,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; - let processor = match str_key { - cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?), - csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?), - dissect::PROCESSOR_DISSECT => { - ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?) - } - epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?), - date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?), - gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?), - join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?), - letter::PROCESSOR_LETTER => { - ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?) - } - regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?), - timestamp::PROCESSOR_TIMESTAMP => { - ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?) - } - urlencoding::PROCESSOR_URL_ENCODING => { - ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?) - } - json_path::PROCESSOR_JSON_PATH => { - ProcessorBuilders::JsonPath(json_path::JsonPathProcessorBuilder::try_from(value)?) - } - decolorize::PROCESSOR_DECOLORIZE => { - ProcessorBuilders::Decolorize(DecolorizeProcessorBuilder::try_from(value)?) - } - digest::PROCESSOR_DIGEST => { - ProcessorBuilders::Digest(DigestProcessorBuilder::try_from(value)?) 
- } - _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), - }; - - Ok(processor) + todo!() } pub(crate) fn yaml_string(v: &yaml_rust::Yaml, field: &str) -> Result { diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index be7fe35e5076..4daa3a4d8cf4 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -15,6 +15,8 @@ pub mod index; pub mod transformer; +use std::collections::BTreeMap; + use snafu::OptionExt; use crate::etl::error::{Error, Result}; @@ -47,7 +49,7 @@ pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { fn schemas(&self) -> &Vec; fn transforms(&self) -> &Transforms; fn transforms_mut(&mut self) -> &mut Transforms; - fn transform_mut(&self, val: &mut Vec) -> Result; + fn transform_mut(&self, val: &mut BTreeMap) -> Result; } /// On Failure behavior when transform fails @@ -73,37 +75,12 @@ impl std::str::FromStr for OnFailure { } } -#[derive(Debug, Default, Clone)] -pub struct TransformBuilders { - pub(crate) builders: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, -} - #[derive(Debug, Default, Clone)] pub struct Transforms { pub(crate) transforms: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, } impl Transforms { - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - pub fn output_keys_mut(&mut self) -> &mut Vec { - &mut self.output_keys - } - - pub fn required_keys_mut(&mut self) -> &mut Vec { - &mut self.required_keys - } - - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - pub fn transforms(&self) -> &Vec { &self.transforms } @@ -123,75 +100,11 @@ impl std::ops::DerefMut for Transforms { } } -impl TryFrom<&Vec> for TransformBuilders { +impl TryFrom<&Vec> for Transforms { type Error = Error; fn try_from(docs: &Vec) -> Result { - let mut transforms = Vec::with_capacity(100); - let mut all_output_keys: Vec = Vec::with_capacity(100); - let mut all_required_keys = Vec::with_capacity(100); - for doc in docs { - let transform_builder: TransformBuilder = doc - .as_hash() - .context(TransformElementMustBeMapSnafu)? 
- .try_into()?; - let mut transform_output_keys = transform_builder - .fields - .iter() - .map(|f| f.target_or_input_field().to_string()) - .collect(); - all_output_keys.append(&mut transform_output_keys); - - let mut transform_required_keys = transform_builder - .fields - .iter() - .map(|f| f.input_field().to_string()) - .collect(); - all_required_keys.append(&mut transform_required_keys); - - transforms.push(transform_builder); - } - - all_required_keys.sort(); - - Ok(TransformBuilders { - builders: transforms, - output_keys: all_output_keys, - required_keys: all_required_keys, - }) - } -} - -#[derive(Debug, Clone)] -pub struct TransformBuilder { - fields: Fields, - type_: Value, - default: Option, - index: Option, - on_failure: Option, -} - -impl TransformBuilder { - pub fn build(self, intermediate_keys: &[String], output_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - let output_index = - find_key_index(output_keys, field.target_or_input_field(), "transform")?; - let input = OneInputOneOutputField::new( - input_field_info, - (field.target_or_input_field().to_string(), output_index), - ); - real_fields.push(input); - } - Ok(Transform { - real_fields, - type_: self.type_, - default: self.default, - index: self.index, - on_failure: self.on_failure, - }) + todo!() } } @@ -230,78 +143,3 @@ impl Transform { &self.type_ } } - -impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder { - type Error = Error; - - fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut fields = Fields::default(); - let mut type_ = Value::Null; - let mut default = None; - let mut index = None; - let mut on_failure = None; - - for (k, v) in hash { - let key = k - .as_str() - .with_context(|| KeyMustBeStringSnafu { k: k.clone() })?; - match key { - TRANSFORM_FIELD => { - fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?); - } - - TRANSFORM_FIELDS => { - fields = yaml_new_fields(v, TRANSFORM_FIELDS)?; - } - - TRANSFORM_TYPE => { - let t = yaml_string(v, TRANSFORM_TYPE)?; - type_ = Value::parse_str_type(&t)?; - } - - TRANSFORM_INDEX => { - let index_str = yaml_string(v, TRANSFORM_INDEX)?; - index = Some(index_str.try_into()?); - } - - TRANSFORM_DEFAULT => { - default = Some(Value::try_from(v)?); - } - - TRANSFORM_ON_FAILURE => { - let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?; - on_failure = Some(on_failure_str.parse()?); - } - - _ => {} - } - } - let mut final_default = None; - - if let Some(default_value) = default { - match (&type_, &default_value) { - (Value::Null, _) => { - return TransformTypeMustBeSetSnafu { - fields: format!("{:?}", fields), - default: default_value.to_string(), - } - .fail(); - } - (_, Value::Null) => {} // if default is not set, then it will be regarded as default null - (_, _) => { - let target = type_.parse_str_value(default_value.to_str_value().as_str())?; - final_default = Some(target); - } - } - } - let builder = TransformBuilder { - fields, - type_, - default: final_default, - index, - on_failure, - }; - - Ok(builder) - } -} diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 7d3752ef2880..5ace3afccda7 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -14,7 +14,7 @@ pub mod coerce; -use 
std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; use ahash::HashMap; @@ -52,36 +52,37 @@ pub struct GreptimeTransformer { impl GreptimeTransformer { /// Add a default timestamp column to the transforms fn add_greptime_timestamp_column(transforms: &mut Transforms) { - let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); - let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); - let default = Some(type_.clone()); - - let transform = Transform { - real_fields: vec![OneInputOneOutputField::new( - InputFieldInfo { - name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - index: usize::MAX, - }, - ( - DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - transforms - .transforms - .iter() - .map(|x| x.real_fields.len()) - .sum(), - ), - )], - type_, - default, - index: Some(Index::Time), - on_failure: Some(crate::etl::transform::OnFailure::Default), - }; - let required_keys = transforms.required_keys_mut(); - required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - let output_keys = transforms.output_keys_mut(); - output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - transforms.push(transform); + // let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); + // let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); + // let default = Some(type_.clone()); + + // let transform = Transform { + // real_fields: vec![OneInputOneOutputField::new( + // InputFieldInfo { + // name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + // index: usize::MAX, + // }, + // ( + // DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + // transforms + // .transforms + // .iter() + // .map(|x| x.real_fields.len()) + // .sum(), + // ), + // )], + // type_, + // default, + // index: Some(Index::Time), + // on_failure: Some(crate::etl::transform::OnFailure::Default), + // }; + // let required_keys = transforms.required_keys_mut(); + // required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + + // let output_keys = transforms.output_keys_mut(); + // output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + // transforms.push(transform); + todo!() } /// Generate the schema for the GreptimeTransformer @@ -161,30 +162,31 @@ impl Transformer for GreptimeTransformer { } } - fn transform_mut(&self, val: &mut Vec) -> Result { - let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; - for transform in self.transforms.iter() { - for field in transform.real_fields.iter() { - let index = field.input_index(); - let output_index = field.output_index(); - match val.get(index) { - Some(v) => { - let value_data = coerce_value(v, transform)?; - // every transform fields has only one output field - values[output_index] = GreptimeValue { value_data }; - } - None => { - let default = transform.get_default(); - let value_data = match default { - Some(default) => coerce_value(default, transform)?, - None => None, - }; - values[output_index] = GreptimeValue { value_data }; - } - } - } - } - Ok(Row { values }) + fn transform_mut(&self, val: &mut BTreeMap) -> Result { + // let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; + // for transform in self.transforms.iter() { + // for field in transform.real_fields.iter() { + // let index = field.input_index(); + // let output_index = field.output_index(); + // match val.get(index) { + // Some(v) => { + // let value_data = coerce_value(v, transform)?; + // // every transform fields has only one output field + // values[output_index] = GreptimeValue { 
value_data }; + // } + // None => { + // let default = transform.get_default(); + // let value_data = match default { + // Some(default) => coerce_value(default, transform)?, + // None => None, + // }; + // values[output_index] = GreptimeValue { value_data }; + // } + // } + // } + // } + // Ok(Row { values }) + todo!() } fn transforms(&self) -> &Transforms { From 13268f975e84794685111d6bd0fe8e555961200f Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 18:13:47 +0800 Subject: [PATCH 16/32] refactor: update dispatcher to accept BTreeMap --- src/pipeline/src/dispatcher.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index 45bd6b47cbfb..fa9e54cf0f4a 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; + use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; @@ -109,22 +111,17 @@ impl TryFrom<&Yaml> for Dispatcher { impl Dispatcher { /// execute dispatcher and returns matched rule if any - pub(crate) fn exec(&self, keys: &Vec, val: &Vec) -> Option<&Rule> { - if let Some(index) = keys.iter().position(|key| key == &self.field) { - if let Some(value) = val.get(index) { - for rule in &self.rules { - if rule.value == *value { - return Some(rule); - } + pub(crate) fn exec(&self, data: &BTreeMap) -> Option<&Rule> { + if let Some(value) = data.get(&self.field) { + for rule in &self.rules { + if rule.value == *value { + return Some(rule); } - - None - } else { - debug!("value at index {} is not found in {:?}", &index, val); - None } + + None } else { - debug!("field {} not found in keys {:?}", &self.field, keys); + debug!("field {} not found in keys {:?}", &self.field, data.keys()); None } } From a2148121e5502e3569058253dc433f9ccaf7720e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 18:52:26 +0800 Subject: [PATCH 17/32] refactor: update identity pipeline --- src/pipeline/src/etl/error.rs | 7 - .../src/etl/transform/transformer/greptime.rs | 173 ++---------------- 2 files changed, 19 insertions(+), 161 deletions(-) diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index 2fd267ce9548..d1e0b56e6e9d 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -594,13 +594,6 @@ pub enum Error { TablePartRequiredForDispatcherRule, #[snafu(display("Value is required for dispatcher rule"))] ValueRequiredForDispatcherRule, - #[snafu(display("Keys and values length mismatch, values: {values}, keys: {keys}"))] - KeyValueLengthMismatch { - #[snafu(implicit)] - location: Location, - keys: usize, - values: usize, - }, #[snafu(display( "Reached max nested levels when flattening JSON object: {max_nested_levels}" ))] diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 27338ccfb1b5..f7e59904a313 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -22,21 +22,19 @@ use api::helper::proto_value_type; use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; -use coerce::{coerce_columns, coerce_value}; +use coerce::coerce_columns; use greptime_proto::v1::{ColumnSchema, Row, 
Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number, Value as JsonValue}; -use snafu::ensure; use crate::etl::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, KeyValueLengthMismatchSnafu, - ReachedMaxNestedLevelsSnafu, Result, TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, + IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, + TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; -use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; use crate::etl::transform::index::Index; -use crate::etl::transform::{Transform, Transformer, Transforms}; +use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; /// The header key that contains the pipeline params. @@ -329,23 +327,13 @@ fn resolve_number_schema( ) } -fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[String]) -> Result { - ensure!( - values.len() == keys.len(), - KeyValueLengthMismatchSnafu { - keys: keys.len(), - values: values.len(), - } - ); - +fn values_to_row(schema_info: &mut SchemaInfo, values: BTreeMap) -> Result { let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); for _ in 0..schema_info.schema.len() { row.push(GreptimeValue { value_data: None }); } - for (idx, value) in values.into_iter().enumerate() { - // ensured by previous check - let column_name = keys[idx].clone(); + for (column_name, value) in values.into_iter() { if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { continue; } @@ -524,107 +512,17 @@ fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[Strin Ok(Row { values: row }) } -fn json_value_to_row( - schema_info: &mut SchemaInfo, - map: Map, -) -> Result { - let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); - for _ in 0..schema_info.schema.len() { - row.push(GreptimeValue { value_data: None }); - } - for (column_name, value) in map { - if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { - continue; - } - let index = schema_info.index.get(&column_name).copied(); - match value { - serde_json::Value::Null => { - // do nothing - } - serde_json::Value::String(s) => { - resolve_schema( - index, - ValueData::StringValue(s), - ColumnSchema { - column_name, - datatype: ColumnDataType::String as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: None, - options: None, - }, - &mut row, - schema_info, - )?; - } - serde_json::Value::Bool(b) => { - resolve_schema( - index, - ValueData::BoolValue(b), - ColumnSchema { - column_name, - datatype: ColumnDataType::Boolean as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: None, - options: None, - }, - &mut row, - schema_info, - )?; - } - serde_json::Value::Number(n) => { - resolve_number_schema(n, column_name, index, &mut row, schema_info)?; - } - serde_json::Value::Array(_) | serde_json::Value::Object(_) => { - resolve_schema( - index, - ValueData::BinaryValue(jsonb::Value::from(value).to_vec()), - ColumnSchema { - column_name, - datatype: ColumnDataType::Binary as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: Some(ColumnDataTypeExtension { - type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), - }), - options: None, - }, - &mut row, - schema_info, - )?; - } - } - } - Ok(Row { values: row }) -} - fn identity_pipeline_inner<'a>( - array: PipelineExecInput, + array: Vec>, tag_column_names: Option>, - 
params: &GreptimePipelineParams, + _params: &GreptimePipelineParams, ) -> Result { let mut rows = Vec::with_capacity(array.len()); let mut schema_info = SchemaInfo::default(); - match array { - PipelineExecInput::Original(array) => { - for value in array { - if let serde_json::Value::Object(map) = value { - let object = if params.flatten_json_object() { - flatten_json_object(map, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)? - } else { - map - }; - - let row = json_value_to_row(&mut schema_info, object)?; - rows.push(row); - } - } - } - PipelineExecInput::Intermediate { keys, array } => { - for values in array { - let row = values_to_row(&mut schema_info, values, &keys)?; - rows.push(row); - } - } + for values in array { + let row = values_to_row(&mut schema_info, values)?; + rows.push(row); } let greptime_timestamp_schema = ColumnSchema { @@ -662,36 +560,6 @@ fn identity_pipeline_inner<'a>( }) } -/// The input data format for pipeline -/// -/// It can either be raw input as in `serde_json::Value` or intermediate `Vec` -pub enum PipelineExecInput { - // multiple row values as a value object - Original(Vec), - // 2-dimension row values by column - Intermediate { - array: Vec>, - keys: Vec, - }, -} - -impl PipelineExecInput { - /// return the length of internal array - pub fn len(&self) -> usize { - match self { - PipelineExecInput::Original(array) => array.len(), - PipelineExecInput::Intermediate { array, .. } => array.len(), - } - } - - pub fn is_empty(&self) -> bool { - match self { - PipelineExecInput::Original(array) => array.is_empty(), - PipelineExecInput::Intermediate { array, .. } => array.is_empty(), - } - } -} - /// Identity pipeline for Greptime /// This pipeline will convert the input JSON array to Greptime Rows /// params table is used to set the semantic type of the row key column to Tag @@ -701,7 +569,7 @@ impl PipelineExecInput { /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. 
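// A minimal calling sketch, mirroring the tests further below (the variable name
// `json_records` is illustrative only): raw JSON objects are first turned into the
// keyed intermediate rows this function now takes.
//
//     let rows = identity_pipeline(
//         Pipeline::prepare(json_records)?,      // Vec of BTreeMap<String, Value>
//         None,                                  // no table hint
//         &GreptimePipelineParams::default(),
//     )?;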
pub fn identity_pipeline( - array: PipelineExecInput, + array: Vec>, table: Option>, params: &GreptimePipelineParams, ) -> Result { @@ -773,9 +641,9 @@ mod tests { use api::v1::SemanticType; use crate::etl::transform::transformer::greptime::{ - flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, PipelineExecInput, + flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, }; - use crate::identity_pipeline; + use crate::{identity_pipeline, Pipeline}; #[test] fn test_identify_pipeline() { @@ -800,11 +668,8 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline( - PipelineExecInput::Original(array), - None, - &GreptimePipelineParams::default(), - ); + let array = Pipeline::prepare(array).unwrap(); + let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -833,7 +698,7 @@ mod tests { }), ]; let rows = identity_pipeline( - PipelineExecInput::Original(array), + Pipeline::prepare(array).unwrap(), None, &GreptimePipelineParams::default(), ); @@ -865,7 +730,7 @@ mod tests { }), ]; let rows = identity_pipeline( - PipelineExecInput::Original(array), + Pipeline::prepare(array).unwrap(), None, &GreptimePipelineParams::default(), ); @@ -899,7 +764,7 @@ mod tests { ]; let tag_column_names = ["name".to_string(), "address".to_string()]; let rows = identity_pipeline_inner( - PipelineExecInput::Original(array), + Pipeline::prepare(array).uwnrap(), Some(tag_column_names.iter()), &GreptimePipelineParams::default(), ); From c7e08eb3103e1217fc2f29c77ed514a58de2f565 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 19:38:16 +0800 Subject: [PATCH 18/32] refactor: use new input for pipeline --- src/frontend/src/instance/otlp.rs | 4 +- src/servers/src/http/event.rs | 6 +- src/servers/src/http/extractor.rs | 25 +++---- src/servers/src/http/otlp.rs | 11 ++- src/servers/src/otlp/logs.rs | 7 +- src/servers/src/pipeline.rs | 118 ++++++++---------------------- src/servers/src/query_handler.rs | 6 +- 7 files changed, 64 insertions(+), 113 deletions(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 8c33f4dfdf4b..fff075cac6a1 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -20,7 +20,7 @@ use common_telemetry::tracing; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::PipelineWay; +use pipeline::{GreptimePipelineParams, PipelineWay}; use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult}; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; @@ -115,6 +115,7 @@ impl OpenTelemetryProtocolHandler for Instance { pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> ServerResult { @@ -132,6 +133,7 @@ impl OpenTelemetryProtocolHandler for Instance { let (requests, rows) = otlp::logs::to_grpc_insert_requests( request, pipeline, + pipeline_params, table_name, &ctx, pipeline_handler, diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 6d17144740db..e8f0d749f873 100644 --- a/src/servers/src/http/event.rs +++ 
b/src/servers/src/http/event.rs @@ -282,10 +282,9 @@ async fn dryrun_pipeline_inner( &pipeline_handler, PipelineDefinition::Resolved(pipeline), ¶ms, - PipelineExecInput::Original(value), + Pipeline::prepare(value)?, "dry_run".to_owned(), query_ctx, - db.as_ref(), true, ) .await?; @@ -604,10 +603,9 @@ pub(crate) async fn ingest_logs_inner( &state, PipelineDefinition::from_name(&pipeline_name, version), &pipeline_params, - PipelineExecInput::Original(request.values), + Pipeline::prepare(request.values), request.table, &query_ctx, - db.as_str(), true, ) .await?; diff --git a/src/servers/src/http/extractor.rs b/src/servers/src/http/extractor.rs index f3ae606636c5..ee662f36f615 100644 --- a/src/servers/src/http/extractor.rs +++ b/src/servers/src/http/extractor.rs @@ -18,7 +18,7 @@ use axum::extract::FromRequestParts; use axum::http::request::Parts; use axum::http::StatusCode; use http::HeaderMap; -use pipeline::SelectInfo; +use pipeline::{GreptimePipelineParams, SelectInfo}; use crate::http::header::constants::{ GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, @@ -91,6 +91,7 @@ where pub struct PipelineInfo { pub pipeline_name: Option, pub pipeline_version: Option, + pub pipeline_params: Option, } impl FromRequestParts for PipelineInfo @@ -105,20 +106,14 @@ where string_value_from_header(headers, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME)?; let pipeline_version = string_value_from_header(headers, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME)?; - match (pipeline_name, pipeline_version) { - (Some(name), Some(version)) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: Some(version), - }), - (None, _) => Ok(PipelineInfo { - pipeline_name: None, - pipeline_version: None, - }), - (Some(name), None) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: None, - }), - } + let pipeline_parameters = + string_value_from_header(headers, GREPTIME_PIPELINE_PARAMS_HEADER)?; + + Ok(PipelineInfo { + pipeline_name, + pipeline_version, + pipeline_params: pipeline_parameters.map(|v| GreptimePipelineParams::from_params(v)), + }) } } diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index 6657bfc845a3..a7efa4b7d32b 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -126,11 +126,20 @@ pub async fn logs( PipelineWay::OtlpLogDirect(Box::new(select_info)) }; + let pipeline_params = pipeline_info.pipeline_params.unwrap_or_default(); + // here we use nightly feature `trait_upcasting` to convert handler to // pipeline_handler let pipeline_handler: Arc = handler.clone(); handler - .logs(pipeline_handler, request, pipeline, tablename, query_ctx) + .logs( + pipeline_handler, + request, + pipeline, + pipeline_params, + tablename, + query_ctx, + ) .await .map(|o| OtlpResponse { resp_body: ExportLogsServiceResponse { diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index ecf53988adf4..24232fcef01f 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -48,6 +48,7 @@ pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; pub async fn to_grpc_insert_requests( request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, query_ctx: &QueryContextRef, pipeline_handler: PipelineHandlerRef, @@ -69,19 +70,17 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); + let array = Pipeline::prepare(data)?; let db_string = 
query_ctx.get_db_string(); - let pipeline_params = GreptimePipelineParams::default(); - let inserts = run_pipeline( &pipeline_handler, pipeline_def, &pipeline_params, - PipelineExecInput::Original(data), + array, table_name, query_ctx, - db_string.as_ref(), true, ) .await?; diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 7ab1275fe0a3..bee45d476404 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -31,41 +31,6 @@ use crate::metrics::{ }; use crate::query_handler::PipelineHandlerRef; -#[inline] -fn pipeline_exec_with_intermediate_state( - pipeline: &Arc>, - intermediate_state: &mut Vec, - transformed: &mut Vec, - dispatched: &mut BTreeMap>>, - db: &str, - transform_timer: &Instant, -) -> Result<()> { - let r = pipeline - .exec_mut(intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); - } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - values.push(intermediate_state.clone()); - } else { - dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); - } - } - } - - Ok(()) -} - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline pub async fn get_pipeline( pipeline_def: PipelineDefinition, @@ -89,12 +54,13 @@ pub(crate) async fn run_pipeline( state: &PipelineHandlerRef, pipeline_definition: PipelineDefinition, pipeline_parameters: &GreptimePipelineParams, - values: PipelineExecInput, + array: Vec>, table_name: String, query_ctx: &QueryContextRef, - db: &str, is_top_level: bool, ) -> Result> { + let db = query_ctx.get_db_string(); + if matches!( pipeline_definition, PipelineDefinition::GreptimeIdentityPipeline @@ -103,7 +69,7 @@ pub(crate) async fn run_pipeline( .get_table(&table_name, query_ctx) .await .context(CatalogSnafu)?; - pipeline::identity_pipeline(values, table, pipeline_parameters) + pipeline::identity_pipeline(array, table, pipeline_parameters) .map(|rows| { vec![RowInsertRequest { rows: Some(rows), @@ -118,44 +84,30 @@ pub(crate) async fn run_pipeline( let transform_timer = std::time::Instant::now(); let mut transformed = Vec::with_capacity(values.len()); - let mut dispatched: BTreeMap>> = BTreeMap::new(); - - match values { - PipelineExecInput::Original(array) => { - let mut intermediate_state = pipeline.init_intermediate_state(); - for v in array { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - )?; - - pipeline.reset_intermediate_state(&mut intermediate_state); + let mut dispatched: BTreeMap>> = + BTreeMap::new(); + + for mut values in array { + let r = pipeline + .exec_mut(&mut values) + .inspect_err(|_| { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { 
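// A row the transformer produced is collected here for the current table; a row
// that matched a dispatcher rule is instead grouped under that rule below and
// re-run through the rule's target pipeline against a derived table name.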
+ transformed.push(row); } - } - PipelineExecInput::Intermediate { array, .. } => { - for mut intermediate_state in array { - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - )?; + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(coll) = dispatched.get_mut(&dispatched_to) { + coll.push(values); + } else { + dispatched.insert(dispatched_to, vec![values]); + } } } } @@ -176,7 +128,7 @@ pub(crate) async fn run_pipeline( // if current pipeline contains dispatcher and has several rules, we may // already accumulated several dispatched rules and rows. - for (dispatched_to, values) in dispatched { + for (dispatched_to, coll) in dispatched { // we generate the new table name according to `table_part` and // current custom table name. let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); @@ -185,22 +137,14 @@ pub(crate) async fn run_pipeline( .as_deref() .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); - // run pipeline recursively. Note that the values we are going to - // process is now intermediate version. It's in form of - // `Vec>`. + // run pipeline recursively. let requests = Box::pin(run_pipeline( state, PipelineDefinition::from_name(next_pipeline_name, None), pipeline_parameters, - PipelineExecInput::Intermediate { - array: values, - // FIXME(sunng87): this intermediate_keys is incorrect. what - // we will need is the keys that generated after processors - keys: pipeline.intermediate_keys().clone(), - }, + coll, table_name, query_ctx, - db, false, )) .await?; @@ -210,7 +154,7 @@ pub(crate) async fn run_pipeline( if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_SUCCESS_VALUE]) + .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); } diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 9029a8fc2a99..dd41305626b9 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -38,7 +38,10 @@ use log_query::LogQuery; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, PipelineWay}; +use pipeline::{ + GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, + PipelineWay, +}; use serde_json::Value; use session::context::{QueryContext, QueryContextRef}; @@ -113,6 +116,7 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> Result; From eb6e8d2cda9c03a79ed4f69f1220980b05d311dd Mon Sep 17 00:00:00 2001 From: paomian Date: Fri, 24 Jan 2025 17:48:17 +0800 Subject: [PATCH 19/32] chore: wip --- src/pipeline/src/etl.rs | 538 +++++++++--------- src/pipeline/src/etl/error.rs | 5 + src/pipeline/src/etl/field.rs | 129 +---- src/pipeline/src/etl/processor.rs | 93 +-- src/pipeline/src/etl/processor/cmcd.rs | 8 +- src/pipeline/src/etl/processor/csv.rs | 4 +- src/pipeline/src/etl/processor/date.rs | 72 +-- src/pipeline/src/etl/processor/decolorize.rs | 67 +-- src/pipeline/src/etl/processor/digest.rs | 79 +-- 
src/pipeline/src/etl/processor/dissect.rs | 4 +- src/pipeline/src/etl/processor/epoch.rs | 68 +-- src/pipeline/src/etl/processor/gsub.rs | 82 +-- src/pipeline/src/etl/processor/join.rs | 77 +-- src/pipeline/src/etl/processor/json_path.rs | 72 +-- src/pipeline/src/etl/processor/letter.rs | 69 +-- src/pipeline/src/etl/processor/regex.rs | 4 +- src/pipeline/src/etl/processor/timestamp.rs | 89 +-- src/pipeline/src/etl/processor/urlencoding.rs | 73 +-- src/pipeline/src/etl/transform.rs | 119 +++- .../src/etl/transform/transformer/greptime.rs | 77 +-- .../transform/transformer/greptime/coerce.rs | 13 +- src/pipeline/src/lib.rs | 2 +- 22 files changed, 642 insertions(+), 1102 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 61b72efb470e..bca33a607f6e 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -23,11 +23,11 @@ pub mod value; use std::collections::BTreeMap; use std::sync::Arc; -use ahash::HashSet; -use common_telemetry::debug; -use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu}; +use error::{ + IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu, YamlParseSnafu, +}; use itertools::Itertools; -use processor::{Processor, ProcessorBuilder, Processors}; +use processor::{IntermediateStatus, Processor, Processors}; use snafu::{OptionExt, ResultExt}; use transform::{Transformer, Transforms}; use value::Value; @@ -56,6 +56,10 @@ where Content::Yaml(str) => { let docs = YamlLoader::load_from_str(str).context(YamlLoadSnafu)?; + if docs.len() != 1 { + return YamlParseSnafu.fail(); + } + let doc = &docs[0]; let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); @@ -144,6 +148,25 @@ impl PipelineExecOutput { } } +pub fn json_to_intermediate_state(val: serde_json::Value) -> Result { + match val { + serde_json::Value::Object(map) => { + let mut intermediate_state = BTreeMap::new(); + for (k, v) in map { + intermediate_state.insert(k, Value::try_from(v)?); + } + Ok(intermediate_state) + } + _ => PrepareValueMustBeObjectSnafu.fail(), + } +} + +pub fn json_array_to_intermediate_state( + val: Vec, +) -> Result> { + val.into_iter().map(json_to_intermediate_state).collect() +} + impl Pipeline where T: Transformer, @@ -152,27 +175,22 @@ where &self, val: &mut BTreeMap, ) -> Result> { - // for processor in self.processors.iter() { - // processor.exec_mut(val)?; - // } - - // let matched_rule = self - // .dispatcher - // .as_ref() - // .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); - - // match matched_rule { - // None => self - // .transformer - // .transform_mut(val) - // .map(PipelineExecOutput::Transformed), - // Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), - // } - todo!() - } + for processor in self.processors.iter() { + processor.exec_mut(val)?; + } - pub fn prepare(&self, val: serde_json::Value) -> Result> { - todo!() + let matched_rule = self + .dispatcher + .as_ref() + .and_then(|dispatcher| dispatcher.exec(val)); + + match matched_rule { + None => self + .transformer + .transform_mut(val) + .map(PipelineExecOutput::Transformed), + Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + } } pub fn processors(&self) -> &processor::Processors { @@ -254,242 +272,242 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; -// #[test] -// fn test_pipeline_prepare() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar" -// } -// "#; -// let input_value: serde_json::Value = 
serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' -// processors: -// - csv: -// field: my_field -// target_fields: field1, field2 -// transform: -// - field: field1 -// type: uint32 -// - field: field2 -// type: uint32 -// "#; -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut payload).unwrap(); -// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); -// assert_eq!( -// payload, -// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] -// ); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); - -// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); -// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); -// match &result.values[2].value_data { -// Some(ValueData::TimestampNanosecondValue(v)) => { -// assert_ne!(*v, 0); -// } -// _ => panic!("expect null value"), -// } -// } - -// #[test] -// fn test_dissect_pipeline() { -// let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); -// let pipeline_str = r#"processors: -// - dissect: -// fields: -// - message -// patterns: -// - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" -// - timestamp: -// fields: -// - ts -// formats: -// - "%d/%b/%Y:%H:%M:%S %z" - -// transform: -// - fields: -// - ip -// - username -// - method -// - path -// - proto -// type: string -// - fields: -// - status -// type: uint16 -// - fields: -// - bytes -// type: uint32 -// - field: ts -// type: timestamp, ns -// index: time"#; -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline -// .prepare(serde_json::Value::String(message), &mut payload) -// .unwrap(); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); -// let sechema = pipeline.schemas(); - -// assert_eq!(sechema.len(), result.values.len()); -// let test = vec![ -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("129.37.245.88".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("meln1ks".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("PATCH".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue( -// "/observability/metrics/production".into(), -// )), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("HTTP/1.0".into())), -// ), -// ( -// ColumnDataType::Uint16 as i32, -// Some(ValueData::U16Value(501)), -// ), -// ( -// ColumnDataType::Uint32 as i32, -// Some(ValueData::U32Value(33085)), -// ), -// ( -// ColumnDataType::TimestampNanosecond as i32, -// Some(ValueData::TimestampNanosecondValue(1722493367000000000)), -// ), -// ]; -// for i in 0..sechema.len() { -// let schema = &sechema[i]; -// let value = &result.values[i]; -// assert_eq!(schema.datatype, test[i].0); -// assert_eq!(value.value_data, test[i].1); -// } -// } - -// #[test] -// fn test_csv_pipeline() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar" -// } -// "#; -// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#" 
-// description: Pipeline for Apache Tomcat -// processors: -// - csv: -// field: my_field -// target_fields: field1, field2 -// transform: -// - field: field1 -// type: uint32 -// - field: field2 -// type: uint32 -// "#; - -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut payload).unwrap(); -// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); -// assert_eq!( -// payload, -// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] -// ); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); -// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); -// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); -// match &result.values[2].value_data { -// Some(ValueData::TimestampNanosecondValue(v)) => { -// assert_ne!(*v, 0); -// } -// _ => panic!("expect null value"), -// } -// } - -// #[test] -// fn test_date_pipeline() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar", -// "test_time": "2014-5-17T04:34:56+00:00" -// } -// "#; -// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#" -// --- -// description: Pipeline for Apache Tomcat - -// processors: -// - timestamp: -// field: test_time - -// transform: -// - field: test_time -// type: timestamp, ns -// index: time -// "#; - -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let schema = pipeline.schemas().clone(); -// let mut result = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut result).unwrap(); -// let row = pipeline -// .exec_mut(&mut result) -// .unwrap() -// .into_transformed() -// .unwrap(); -// let output = Rows { -// schema, -// rows: vec![row], -// }; -// let schemas = output.schema; - -// assert_eq!(schemas.len(), 1); -// let schema = schemas[0].clone(); -// assert_eq!("test_time", schema.column_name); -// assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); -// assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - -// let row = output.rows[0].clone(); -// assert_eq!(1, row.values.len()); -// let value_data = row.values[0].clone().value_data; -// assert_eq!( -// Some(v1::value::ValueData::TimestampNanosecondValue( -// 1400301296000000000 -// )), -// value_data -// ); -// } + // #[test] + // fn test_pipeline_prepare() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' + // processors: + // - csv: + // field: my_field + // target_fields: field1, field2 + // transform: + // - field: field1 + // type: uint32 + // - field: field2 + // type: uint32 + // "#; + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut payload).unwrap(); + // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + // assert_eq!( + // payload, + // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + // ); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + + // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + // 
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + // match &result.values[2].value_data { + // Some(ValueData::TimestampNanosecondValue(v)) => { + // assert_ne!(*v, 0); + // } + // _ => panic!("expect null value"), + // } + // } + + // #[test] + // fn test_dissect_pipeline() { + // let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); + // let pipeline_str = r#"processors: + // - dissect: + // fields: + // - message + // patterns: + // - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + // - timestamp: + // fields: + // - ts + // formats: + // - "%d/%b/%Y:%H:%M:%S %z" + + // transform: + // - fields: + // - ip + // - username + // - method + // - path + // - proto + // type: string + // - fields: + // - status + // type: uint16 + // - fields: + // - bytes + // type: uint32 + // - field: ts + // type: timestamp, ns + // index: time"#; + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline + // .prepare(serde_json::Value::String(message), &mut payload) + // .unwrap(); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + // let sechema = pipeline.schemas(); + + // assert_eq!(sechema.len(), result.values.len()); + // let test = vec![ + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("129.37.245.88".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("meln1ks".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("PATCH".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue( + // "/observability/metrics/production".into(), + // )), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("HTTP/1.0".into())), + // ), + // ( + // ColumnDataType::Uint16 as i32, + // Some(ValueData::U16Value(501)), + // ), + // ( + // ColumnDataType::Uint32 as i32, + // Some(ValueData::U32Value(33085)), + // ), + // ( + // ColumnDataType::TimestampNanosecond as i32, + // Some(ValueData::TimestampNanosecondValue(1722493367000000000)), + // ), + // ]; + // for i in 0..sechema.len() { + // let schema = &sechema[i]; + // let value = &result.values[i]; + // assert_eq!(schema.datatype, test[i].0); + // assert_eq!(value.value_data, test[i].1); + // } + // } + + // #[test] + // fn test_csv_pipeline() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#" + // description: Pipeline for Apache Tomcat + // processors: + // - csv: + // field: my_field + // target_fields: field1, field2 + // transform: + // - field: field1 + // type: uint32 + // - field: field2 + // type: uint32 + // "#; + + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut payload).unwrap(); + // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + // assert_eq!( + // payload, + // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + // ); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + // 
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + // match &result.values[2].value_data { + // Some(ValueData::TimestampNanosecondValue(v)) => { + // assert_ne!(*v, 0); + // } + // _ => panic!("expect null value"), + // } + // } + + // #[test] + // fn test_date_pipeline() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar", + // "test_time": "2014-5-17T04:34:56+00:00" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#" + // --- + // description: Pipeline for Apache Tomcat + + // processors: + // - timestamp: + // field: test_time + + // transform: + // - field: test_time + // type: timestamp, ns + // index: time + // "#; + + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let schema = pipeline.schemas().clone(); + // let mut result = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut result).unwrap(); + // let row = pipeline + // .exec_mut(&mut result) + // .unwrap() + // .into_transformed() + // .unwrap(); + // let output = Rows { + // schema, + // rows: vec![row], + // }; + // let schemas = output.schema; + + // assert_eq!(schemas.len(), 1); + // let schema = schemas[0].clone(); + // assert_eq!("test_time", schema.column_name); + // assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); + // assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + + // let row = output.rows[0].clone(); + // assert_eq!(1, row.values.len()); + // let value_data = row.values[0].clone().value_data; + // assert_eq!( + // Some(v1::value::ValueData::TimestampNanosecondValue( + // 1400301296000000000 + // )), + // value_data + // ); + // } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index d1e0b56e6e9d..51080c86eebf 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -543,6 +543,11 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + #[snafu(display("Yaml parse error."))] + YamlParse { + #[snafu(implicit)] + location: Location, + }, #[snafu(display("Prepare value must be an object"))] PrepareValueMustBeObject { #[snafu(implicit)] diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs index 10fa681f236c..dd4835ec9279 100644 --- a/src/pipeline/src/etl/field.rs +++ b/src/pipeline/src/etl/field.rs @@ -19,133 +19,12 @@ use snafu::OptionExt; use super::error::{EmptyInputFieldSnafu, MissingInputFieldSnafu}; use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; - -/// Information about the input field including the name and index in intermediate keys. -#[derive(Debug, Default, Clone)] -pub struct InputFieldInfo { - pub(crate) name: String, - pub(crate) index: usize, -} - -impl InputFieldInfo { - /// Create a new input field info with the given field name and index. - pub(crate) fn new(field: impl Into, index: usize) -> Self { - InputFieldInfo { - name: field.into(), - index, - } - } -} - -/// Information about a field that has one input and one output. -#[derive(Debug, Default, Clone)] -pub struct OneInputOneOutputField { - input: InputFieldInfo, - output: Option<(String, usize)>, -} - -impl OneInputOneOutputField { - /// Create a new field with the given input and output. 
- pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self { - OneInputOneOutputField { - input, - output: Some(output), - } - } - - /// Build a new field with the given processor kind, intermediate keys, input field, and target field. - pub(crate) fn build( - processor_kind: &str, - intermediate_keys: &[String], - input_field: &str, - target_field: &str, - ) -> Result { - let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?; - - let input_field_info = InputFieldInfo::new(input_field, input_index); - let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?; - Ok(OneInputOneOutputField::new( - input_field_info, - (target_field.to_string(), output_index), - )) - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the index of the output field. - pub(crate) fn output_index(&self) -> usize { - *self.output().1 - } - - /// Get the name of the output field. - pub(crate) fn output_name(&self) -> &str { - self.output().0 - } - - /// Get the output field information. - pub(crate) fn output(&self) -> (&String, &usize) { - if let Some((name, index)) = &self.output { - (name, index) - } else { - (&self.input.name, &self.input.index) - } - } -} - -/// Information about a field that has one input and multiple outputs. -#[derive(Debug, Default, Clone)] -pub struct OneInputMultiOutputField { - input: InputFieldInfo, - /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together. - prefix: Option, -} - -impl OneInputMultiOutputField { - /// Create a new field with the given input and prefix. - pub(crate) fn new(input: InputFieldInfo, prefix: Option) -> Self { - OneInputMultiOutputField { input, prefix } - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the prefix for the output fields. - pub(crate) fn target_prefix(&self) -> &str { - self.prefix.as_deref().unwrap_or(&self.input.name) - } -} /// Raw processor-defined inputs and outputs #[derive(Debug, Default, Clone)] pub struct Field { - pub(crate) input_field: String, - pub(crate) target_field: Option, + input_field: String, + target_field: Option, } impl FromStr for Field { @@ -194,6 +73,10 @@ impl Field { pub(crate) fn target_or_input_field(&self) -> &str { self.target_field.as_deref().unwrap_or(&self.input_field) } + + pub(crate) fn set_target_field(&mut self, target_field: Option) { + self.target_field = target_field; + } } /// A collection of fields. diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index b6df91204c39..63854ad552a7 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -12,49 +12,48 @@ // See the License for the specific language governing permissions and // limitations under the License. 
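// The processors below now read and write a keyed intermediate state
// (`IntermediateStatus`, a BTreeMap from column name to `Value`) instead of a
// positional `Vec`. A rough sketch of the shape an implementation takes
// (`self.field` and `self.target` are illustrative names, not part of this patch):
//
//     fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> {
//         if let Some(v) = val.get(&self.field).cloned() {
//             val.insert(self.target.clone(), v);
//         }
//         Ok(())
//     }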
-pub mod cmcd; -pub mod csv; +// pub mod cmcd; +// pub mod csv; pub mod date; pub mod decolorize; pub mod digest; -pub mod dissect; +// pub mod dissect; pub mod epoch; pub mod gsub; pub mod join; pub mod json_path; pub mod letter; -pub mod regex; +// pub mod regex; pub mod timestamp; pub mod urlencoding; use std::collections::BTreeMap; -use ahash::{HashSet, HashSetExt}; -use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; -use csv::{CsvProcessor, CsvProcessorBuilder}; -use date::{DateProcessor, DateProcessorBuilder}; -use decolorize::{DecolorizeProcessor, DecolorizeProcessorBuilder}; -use digest::{DigestProcessor, DigestProcessorBuilder}; -use dissect::{DissectProcessor, DissectProcessorBuilder}; +// use cmcd::CmcdProcessor; +// use csv::CsvProcessor; +use date::DateProcessor; +use decolorize::DecolorizeProcessor; +use digest::DigestProcessor; +// use dissect::DissectProcessor; use enum_dispatch::enum_dispatch; -use epoch::{EpochProcessor, EpochProcessorBuilder}; -use gsub::{GsubProcessor, GsubProcessorBuilder}; -use itertools::Itertools; -use join::{JoinProcessor, JoinProcessorBuilder}; -use json_path::{JsonPathProcessor, JsonPathProcessorBuilder}; -use letter::{LetterProcessor, LetterProcessorBuilder}; -use regex::{RegexProcessor, RegexProcessorBuilder}; +use epoch::EpochProcessor; +use gsub::GsubProcessor; +use join::JoinProcessor; +use json_path::JsonPathProcessor; +use letter::LetterProcessor; +// use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; -use timestamp::{TimestampProcessor, TimestampProcessorBuilder}; -use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder}; +use timestamp::TimestampProcessor; +use urlencoding::UrlEncodingProcessor; use super::error::{ FailedParseFieldFromStringSnafu, FieldMustBeTypeSnafu, ProcessorKeyMustBeStringSnafu, - ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, UnsupportedProcessorSnafu, + ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, }; use super::field::{Field, Fields}; use crate::etl::error::{Error, Result}; use crate::etl::value::Value; +use crate::etl_error::UnsupportedProcessorSnafu; const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; @@ -67,6 +66,8 @@ const TARGET_FIELDS_NAME: &str = "target_fields"; const JSON_PATH_NAME: &str = "json_path"; const JSON_PATH_RESULT_INDEX_NAME: &str = "result_index"; +pub type IntermediateStatus = BTreeMap; + /// Processor trait defines the interface for all processors. 
/// /// A processor is a transformation that can be applied to a field in a document @@ -82,19 +83,19 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: &mut BTreeMap) -> Result<()>; + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()>; } #[derive(Debug)] #[enum_dispatch] pub enum ProcessorKind { - Cmcd(CmcdProcessor), - Csv(CsvProcessor), - Dissect(DissectProcessor), + // Cmcd(CmcdProcessor), + // Csv(CsvProcessor), + // Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), - Regex(RegexProcessor), + // Regex(RegexProcessor), Timestamp(TimestampProcessor), UrlEncoding(UrlEncodingProcessor), Epoch(EpochProcessor), @@ -104,18 +105,6 @@ pub enum ProcessorKind { Digest(DigestProcessor), } -/// ProcessorBuilder trait defines the interface for all processor builders -/// A processor builder is used to create a processor -#[enum_dispatch(ProcessorBuilders)] -pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { - /// Get the processor's output keys - fn output_keys(&self) -> HashSet<&str>; - /// Get the processor's input keys - fn input_keys(&self) -> HashSet<&str>; - /// Build the processor - fn build(self, intermediate_keys: &[String]) -> Result; -} - #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors @@ -166,7 +155,33 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; - todo!() + let processor = match str_key { + // cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), + // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), + // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), + epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), + date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), + gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), + join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), + letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), + // regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), + timestamp::PROCESSOR_TIMESTAMP => { + ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) + } + urlencoding::PROCESSOR_URL_ENCODING => { + ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?) + } + json_path::PROCESSOR_JSON_PATH => { + ProcessorKind::JsonPath(json_path::JsonPathProcessor::try_from(value)?) + } + decolorize::PROCESSOR_DECOLORIZE => { + ProcessorKind::Decolorize(DecolorizeProcessor::try_from(value)?) 
+ } + digest::PROCESSOR_DIGEST => ProcessorKind::Digest(DigestProcessor::try_from(value)?), + _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), + }; + + Ok(processor) } pub(crate) fn yaml_string(v: &yaml_rust::Yaml, field: &str) -> Result { diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 086fe8f3d610..944487472691 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -27,7 +27,7 @@ use crate::etl::error::{ FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Field, Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, @@ -35,6 +35,8 @@ use crate::etl::processor::{ }; use crate::etl::value::Value; +use super::IntermediateStatus; + pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; const CMCD_KEY_BR: &str = "br"; // Encoded bitrate, Integer kbps @@ -135,7 +137,7 @@ impl CmcdProcessorBuilder { for field in self.fields.into_iter() { let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; @@ -372,7 +374,7 @@ impl Processor for CmcdProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for (field_index, field) in self.fields.iter().enumerate() { let field_value_index = field.input_index(); match val.get(field_value_index) { diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index c9cb5f847db1..86f39fc89369 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -24,7 +24,7 @@ use crate::etl::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, @@ -64,7 +64,7 @@ impl CsvProcessorBuilder { for field in self.fields { let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let real_field = OneInputMultiOutputField::new(input_field_info, None); real_fields.push(real_field); } diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index fa202a0edff2..e080b795402c 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -14,21 +14,21 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ 
DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateParseSnafu, DateParseTimezoneSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::{Timestamp, Value}; @@ -88,55 +88,7 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug, Default)] -pub struct DateProcessorBuilder { - fields: Fields, - formats: Formats, - timezone: Option>, - locale: Option>, - ignore_missing: bool, -} - -impl ProcessorBuilder for DateProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Date) - } -} - -impl DateProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "date", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DateProcessor { - fields: real_fields, - formats: self.formats, - timezone: self.timezone, - locale: self.locale, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -181,7 +133,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { } } - let builder = DateProcessorBuilder { + let builder = DateProcessor { fields, formats, timezone, @@ -197,7 +149,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { /// Reserved for compatibility only #[derive(Debug, Default)] pub struct DateProcessor { - fields: Vec, + fields: Fields, formats: Formats, timezone: Option>, locale: Option>, // to support locale @@ -242,20 +194,20 @@ impl Processor for DateProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let timestamp = self.parse(s)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(timestamp)); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: field.input_field().to_string(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/decolorize.rs b/src/pipeline/src/etl/processor/decolorize.rs index e72bc28a1e66..2547b99d6824 100644 --- a/src/pipeline/src/etl/processor/decolorize.rs +++ 
b/src/pipeline/src/etl/processor/decolorize.rs @@ -18,18 +18,17 @@ //! from Grafana Loki and [`strip_ansi_escape_codes`](https://vector.dev/docs/reference/vrl/functions/#strip_ansi_escape_codes) //! from Vector VRL. -use ahash::HashSet; use once_cell::sync::Lazy; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -37,52 +36,10 @@ pub(crate) const PROCESSOR_DECOLORIZE: &str = "decolorize"; static RE: Lazy = Lazy::new(|| Regex::new(r"\x1b\[[0-9;]*m").unwrap()); -#[derive(Debug, Default)] -pub struct DecolorizeProcessorBuilder { - fields: Fields, - ignore_missing: bool, -} - -impl ProcessorBuilder for DecolorizeProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Decolorize) - } -} - -impl DecolorizeProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "decolorize", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DecolorizeProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - }) - } -} - /// Remove ANSI color control codes from the input text. 
#[derive(Debug, Default)] pub struct DecolorizeProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, } @@ -103,7 +60,7 @@ impl DecolorizeProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -129,7 +86,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { } } - Ok(DecolorizeProcessorBuilder { + Ok(DecolorizeProcessor { fields, ignore_missing, }) @@ -145,23 +102,23 @@ impl crate::etl::processor::Processor for DecolorizeProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -176,7 +133,7 @@ mod tests { #[test] fn test_decolorize_processor() { let processor = DecolorizeProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, }; diff --git a/src/pipeline/src/etl/processor/digest.rs b/src/pipeline/src/etl/processor/digest.rs index 29054365ad03..64bb2a2f6d8a 100644 --- a/src/pipeline/src/etl/processor/digest.rs +++ b/src/pipeline/src/etl/processor/digest.rs @@ -21,17 +21,16 @@ use std::borrow::Cow; -use ahash::HashSet; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; use crate::etl_error::DigestPatternInvalidSnafu; @@ -88,54 +87,10 @@ impl PresetPattern { } } -#[derive(Debug, Default)] -pub struct DigestProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, -} - -impl ProcessorBuilder for DigestProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Digest) - } -} - -impl DigestProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = Vec::with_capacity(self.fields.len()); - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "digest", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DigestProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - patterns: self.patterns, - }) - } -} - /// Computes a digest (hash) of the input string. 
#[derive(Debug, Default)] pub struct DigestProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, patterns: Vec, } @@ -169,7 +124,7 @@ impl DigestProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,10 +181,10 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { } for field in fields.iter_mut() { - field.target_field = Some(format!("{}_digest", field.input_field())); + field.set_target_field(Some(format!("{}_digest", field.input_field()))); } - Ok(DigestProcessorBuilder { + Ok(DigestProcessor { fields, patterns, ignore_missing, @@ -246,23 +201,23 @@ impl crate::etl::processor::Processor for DigestProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -278,7 +233,7 @@ mod tests { #[test] fn test_digest_processor_ip() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Ip.regex()], }; @@ -306,7 +261,7 @@ mod tests { #[test] fn test_digest_processor_uuid() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Uuid.regex()], }; @@ -339,7 +294,7 @@ mod tests { #[test] fn test_digest_processor_brackets() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Bracketed.regex()], }; @@ -389,7 +344,7 @@ mod tests { #[test] fn test_digest_processor_quotes() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Quoted.regex()], }; @@ -409,7 +364,7 @@ mod tests { #[test] fn test_digest_processor_custom_regex() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![Regex::new(r"\d+").unwrap()], }; diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index a9ccf5e8735e..13ad9175e7df 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -25,7 +25,7 @@ use crate::etl::error::{ DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, @@ -612,7 +612,7 @@ impl ProcessorBuilder for DissectProcessorBuilder { for field in self.fields.into_iter() { let input_index = 
find_key_index(intermediate_keys, field.input_field(), "dissect")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); real_fields.push(real_field); diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index f2c03fd120de..29ad6bd3d97d 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ EpochInvalidResolutionSnafu, Error, FailedToParseIntSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, @@ -57,56 +57,12 @@ impl TryFrom<&str> for Resolution { } } -#[derive(Debug, Default)] -pub struct EpochProcessorBuilder { - fields: Fields, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for EpochProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Epoch) - } -} - -impl EpochProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "epoch", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(EpochProcessor { - fields: real_fields, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch /// deprecated it should be removed in the future /// Reserved for compatibility only #[derive(Debug, Default)] pub struct EpochProcessor { - fields: Vec, + fields: Fields, resolution: Resolution, ignore_missing: bool, // description @@ -157,7 +113,7 @@ impl EpochProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -188,7 +144,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { _ => {} } } - let builder = EpochProcessorBuilder { + let builder = EpochProcessor { fields, resolution, ignore_missing, @@ -207,23 +163,23 @@ impl Processor for EpochProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = 
field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let timestamp = self.parse(v)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), Value::Timestamp(timestamp)); } } } diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index 54c8306ec4de..dbdb9c5c3047 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ Error, GsubPatternRequiredSnafu, GsubReplacementRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -31,68 +31,10 @@ pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; -#[derive(Debug, Default)] -pub struct GsubProcessorBuilder { - fields: Fields, - pattern: Option, - replacement: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for GsubProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Gsub) - } -} - -impl GsubProcessorBuilder { - fn check(self) -> Result { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "gsub", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(GsubProcessor { - fields: real_fields, - pattern: self.pattern, - replacement: self.replacement, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value #[derive(Debug, Default)] pub struct GsubProcessor { - fields: Vec, + fields: Fields, pattern: Option, replacement: Option, ignore_missing: bool, @@ -136,7 +78,7 @@ impl GsubProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -176,7 +118,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { } } - let builder = 
GsubProcessorBuilder { + let builder = GsubProcessor { fields, pattern, replacement, @@ -196,23 +138,23 @@ impl crate::etl::processor::Processor for GsubProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index ddbc086ab8da..6913a5428873 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -12,79 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, JoinSeparatorRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, SEPARATOR_NAME, }; use crate::etl::value::{Array, Value}; pub(crate) const PROCESSOR_JOIN: &str = "join"; -#[derive(Debug, Default)] -pub struct JoinProcessorBuilder { - fields: Fields, - separator: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for JoinProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Join) - } -} - -impl JoinProcessorBuilder { - fn check(self) -> Result { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } - - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "join", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JoinProcessor { - fields: real_fields, - separator: self.separator, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to join each element of an array into a single string using a separator string between each element #[derive(Debug, Default)] pub struct JoinProcessor { - fields: Vec, + fields: Fields, separator: Option, ignore_missing: bool, } @@ -110,7 +57,7 @@ impl JoinProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -140,7 +87,7 
@@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { } } - let builder = JoinProcessorBuilder { + let builder = JoinProcessor { fields, separator, ignore_missing, @@ -158,20 +105,20 @@ impl Processor for JoinProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Array(arr)) => { let result = self.process(arr)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index c09d338c637f..c7b4210e83f1 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use jsonpath_rust::JsonPath; use snafu::{OptionExt, ResultExt}; use super::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, IntermediateStatus, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, JSON_PATH_NAME, JSON_PATH_RESULT_INDEX_NAME, }; use crate::etl::error::{Error, Result}; -use crate::etl::field::{Fields, OneInputOneOutputField}; -use crate::etl::processor::ProcessorKind; +use crate::etl::field::Fields; use crate::etl_error::{ JsonPathParseResultIndexSnafu, JsonPathParseSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, @@ -31,54 +29,7 @@ use crate::Value; pub(crate) const PROCESSOR_JSON_PATH: &str = "json_path"; -#[derive(Debug)] -pub struct JsonPathProcessorBuilder { - fields: Fields, - json_path: JsonPath, - ignore_missing: bool, - result_idex: Option, -} - -impl JsonPathProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - JSON_PATH_NAME, - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JsonPathProcessor { - fields: real_fields, - json_path: self.json_path, - ignore_missing: self.ignore_missing, - result_idex: self.result_idex, - }) - } -} - -impl ProcessorBuilder for JsonPathProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::JsonPath) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> std::result::Result { @@ -117,7 +68,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { } } if let Some(json_path) = json_path { - let processor = JsonPathProcessorBuilder { + let processor = JsonPathProcessor 
{ fields, json_path, ignore_missing, @@ -137,7 +88,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { #[derive(Debug)] pub struct JsonPathProcessor { - fields: Vec, + fields: Fields, json_path: JsonPath, ignore_missing: bool, result_idex: Option, @@ -146,7 +97,7 @@ pub struct JsonPathProcessor { impl Default for JsonPathProcessor { fn default() -> Self { JsonPathProcessor { - fields: vec![], + fields: Fields::default(), json_path: JsonPath::try_from("$").unwrap(), ignore_missing: false, result_idex: None, @@ -179,21 +130,20 @@ impl Processor for JsonPathProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(v) => { let processed = self.process_field(v)?; - - let output_index = field.output_index(); - val[output_index] = processed; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), processed); } None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 8eb939918104..960521853e48 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, LetterInvalidMethodSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -59,55 +59,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct LetterProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for LetterProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Letter) - } -} - -impl LetterProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "letter", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(LetterProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct LetterProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -125,7 +80,7 @@ impl LetterProcessor { } } 
-impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -154,7 +109,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { } } - Ok(LetterProcessorBuilder { + Ok(LetterProcessor { fields, method, ignore_missing, @@ -171,20 +126,20 @@ impl Processor for LetterProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let (_, output_index) = field.output(); - val[*output_index] = result; + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index de25195f99ab..a6ffa86d1689 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -28,7 +28,7 @@ use crate::etl::error::{ RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, @@ -173,7 +173,7 @@ impl RegexProcessorBuilder { let mut real_fields = vec![]; for field in self.fields.into_iter() { let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let input = OneInputMultiOutputField::new(input_field_info, field.target_field); real_fields.push(input); diff --git a/src/pipeline/src/etl/processor/timestamp.rs b/src/pipeline/src/etl/processor/timestamp.rs index 18b6711c1d80..bf90e78f2165 100644 --- a/src/pipeline/src/etl/processor/timestamp.rs +++ b/src/pipeline/src/etl/processor/timestamp.rs @@ -14,22 +14,22 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateInvalidFormatSnafu, DateParseSnafu, DateParseTimezoneSnafu, EpochInvalidResolutionSnafu, Error, KeyMustBeStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, 
MILLI_RESOLUTION, @@ -114,56 +114,10 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug)] -pub struct TimestampProcessorBuilder { - fields: Fields, - formats: Formats, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for TimestampProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Timestamp) - } -} - -impl TimestampProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "timestamp", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(TimestampProcessor { - fields: real_fields, - formats: self.formats, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch #[derive(Debug, Default)] pub struct TimestampProcessor { - fields: Vec, + fields: Fields, formats: Formats, resolution: Resolution, ignore_missing: bool, @@ -289,7 +243,7 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result, Tz)>> } } -impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -324,7 +278,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { } } - let processor_builder = TimestampProcessorBuilder { + let processor_builder = TimestampProcessor { fields, formats, resolution, @@ -344,23 +298,23 @@ impl Processor for TimestampProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input().index; + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.parse(v)?; - let (_, index) = field.output(); - val[*index] = Value::Timestamp(result); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(result)); } } } @@ -372,18 +326,9 @@ impl Processor for TimestampProcessor { mod tests { use yaml_rust::YamlLoader; - use super::{TimestampProcessor, TimestampProcessorBuilder}; + use super::TimestampProcessor; use crate::etl::value::{Timestamp, Value}; - fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor { - TimestampProcessor { - fields: vec![], - formats: builder.formats, - resolution: builder.resolution, - ignore_missing: builder.ignore_missing, - } - } - #[test] fn test_parse_epoch() { let processor_yaml_str = r#"fields: @@ -397,9 +342,7 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values = [ ( @@ -451,9 +394,7 @@ 
formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values: Vec<&str> = vec![ "2014-5-17T12:34:56", diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index ca42aae23677..c14c7d87b11f 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; +use std::collections::BTreeMap; + use snafu::{OptionExt, ResultExt}; use urlencoding::{decode, encode}; @@ -20,10 +21,10 @@ use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, UrlEncodingDecodeSnafu, UrlEncodingInvalidMethodSnafu, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -57,55 +58,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct UrlEncodingProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for UrlEncodingProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys) - .map(ProcessorKind::UrlEncoding) - } -} - -impl UrlEncodingProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "urlencoding", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(UrlEncodingProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct UrlEncodingProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -120,7 +76,7 @@ impl UrlEncodingProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -152,7 +108,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { _ => {} } } - let processor = UrlEncodingProcessorBuilder { + let processor = UrlEncodingProcessor { fields, method, ignore_missing, @@ -171,20 +127,20 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match 
val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -205,6 +161,7 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { #[cfg(test)] mod tests { + use crate::etl::field::Fields; use crate::etl::processor::urlencoding::UrlEncodingProcessor; use crate::etl::value::Value; @@ -220,7 +177,7 @@ mod tests { } { let processor = UrlEncodingProcessor { - fields: vec![], + fields: Fields::default(), method: super::Method::Encode, ignore_missing: false, }; diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index 4daa3a4d8cf4..7191d272069c 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -17,11 +17,7 @@ pub mod transformer; use std::collections::BTreeMap; -use snafu::OptionExt; - use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; -use crate::etl::processor::yaml_string; use crate::etl::transform::index::Index; use crate::etl::value::Value; @@ -32,14 +28,15 @@ const TRANSFORM_INDEX: &str = "index"; const TRANSFORM_DEFAULT: &str = "default"; const TRANSFORM_ON_FAILURE: &str = "on_failure"; +use snafu::OptionExt; pub use transformer::greptime::GreptimeTransformer; use super::error::{ KeyMustBeStringSnafu, TransformElementMustBeMapSnafu, TransformOnFailureInvalidValueSnafu, TransformTypeMustBeSetSnafu, }; -use super::field::{Fields, InputFieldInfo, OneInputOneOutputField}; -use super::processor::{yaml_new_field, yaml_new_fields}; +use super::field::Fields; +use super::processor::{yaml_new_field, yaml_new_fields, yaml_string}; pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { type Output; @@ -104,14 +101,43 @@ impl TryFrom<&Vec> for Transforms { type Error = Error; fn try_from(docs: &Vec) -> Result { - todo!() + let mut transforms = Vec::with_capacity(100); + let mut all_output_keys: Vec = Vec::with_capacity(100); + let mut all_required_keys = Vec::with_capacity(100); + for doc in docs { + let transform_builder: Transform = doc + .as_hash() + .context(TransformElementMustBeMapSnafu)? 
+ .try_into()?; + let mut transform_output_keys = transform_builder + .fields + .iter() + .map(|f| f.target_or_input_field().to_string()) + .collect(); + all_output_keys.append(&mut transform_output_keys); + + let mut transform_required_keys = transform_builder + .fields + .iter() + .map(|f| f.input_field().to_string()) + .collect(); + all_required_keys.append(&mut transform_required_keys); + + transforms.push(transform_builder); + } + + all_required_keys.sort(); + + Ok(Transforms { + transforms: transforms, + }) } } /// only field is required #[derive(Debug, Clone)] pub struct Transform { - pub real_fields: Vec, + pub fields: Fields, pub type_: Value, @@ -125,7 +151,7 @@ pub struct Transform { impl Default for Transform { fn default() -> Self { Transform { - real_fields: Vec::new(), + fields: Fields::default(), type_: Value::Null, default: None, index: None, @@ -143,3 +169,78 @@ impl Transform { &self.type_ } } + +impl TryFrom<&yaml_rust::yaml::Hash> for Transform { + type Error = Error; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut fields = Fields::default(); + let mut type_ = Value::Null; + let mut default = None; + let mut index = None; + let mut on_failure = None; + + for (k, v) in hash { + let key = k + .as_str() + .with_context(|| KeyMustBeStringSnafu { k: k.clone() })?; + match key { + TRANSFORM_FIELD => { + fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?); + } + + TRANSFORM_FIELDS => { + fields = yaml_new_fields(v, TRANSFORM_FIELDS)?; + } + + TRANSFORM_TYPE => { + let t = yaml_string(v, TRANSFORM_TYPE)?; + type_ = Value::parse_str_type(&t)?; + } + + TRANSFORM_INDEX => { + let index_str = yaml_string(v, TRANSFORM_INDEX)?; + index = Some(index_str.try_into()?); + } + + TRANSFORM_DEFAULT => { + default = Some(Value::try_from(v)?); + } + + TRANSFORM_ON_FAILURE => { + let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?; + on_failure = Some(on_failure_str.parse()?); + } + + _ => {} + } + } + let mut final_default = None; + + if let Some(default_value) = default { + match (&type_, &default_value) { + (Value::Null, _) => { + return TransformTypeMustBeSetSnafu { + fields: format!("{:?}", fields), + default: default_value.to_string(), + } + .fail(); + } + (_, Value::Null) => {} // if default is not set, then it will be regarded as default null + (_, _) => { + let target = type_.parse_str_value(default_value.to_str_value().as_str())?; + final_default = Some(target); + } + } + } + let builder = Transform { + fields, + type_, + default: final_default, + index, + on_failure, + }; + + Ok(builder) + } +} diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index f7e59904a313..eeff061f755c 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -22,7 +22,7 @@ use api::helper::proto_value_type; use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; -use coerce::coerce_columns; +use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number, Value as JsonValue}; @@ -33,6 +33,7 @@ use crate::etl::error::{ TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; +use crate::etl::processor::IntermediateStatus; use crate::etl::transform::index::Index; 
use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; @@ -142,9 +143,9 @@ impl Transformer for GreptimeTransformer { for transform in transforms.iter() { let target_fields_set = transform - .real_fields + .fields .iter() - .map(|f| f.output_name()) + .map(|f| f.target_or_input_field()) .collect::>(); let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect(); @@ -157,16 +158,17 @@ impl Transformer for GreptimeTransformer { if let Some(idx) = transform.index { if idx == Index::Time { - match transform.real_fields.len() { + match transform.fields.len() { //Safety unwrap is fine here because we have checked the length of real_fields - 1 => timestamp_columns - .push(transform.real_fields.first().unwrap().input_name()), + 1 => { + timestamp_columns.push(transform.fields.first().unwrap().input_field()) + } _ => { return TransformMultipleTimestampIndexSnafu { columns: transform - .real_fields + .fields .iter() - .map(|x| x.input_name()) + .map(|x| x.input_field()) .join(", "), } .fail(); @@ -195,31 +197,31 @@ impl Transformer for GreptimeTransformer { } } - fn transform_mut(&self, val: &mut BTreeMap) -> Result { - // let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; - // for transform in self.transforms.iter() { - // for field in transform.real_fields.iter() { - // let index = field.input_index(); - // let output_index = field.output_index(); - // match val.get(index) { - // Some(v) => { - // let value_data = coerce_value(v, transform)?; - // // every transform fields has only one output field - // values[output_index] = GreptimeValue { value_data }; - // } - // None => { - // let default = transform.get_default(); - // let value_data = match default { - // Some(default) => coerce_value(default, transform)?, - // None => None, - // }; - // values[output_index] = GreptimeValue { value_data }; - // } - // } - // } - // } - // Ok(Row { values }) - todo!() + fn transform_mut(&self, val: &mut IntermediateStatus) -> Result { + let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; + let mut output_index = 0; + for transform in self.transforms.iter() { + for field in transform.fields.iter() { + let index = field.input_field(); + match val.get(index) { + Some(v) => { + let value_data = coerce_value(v, transform)?; + // every transform fields has only one output field + values[output_index] = GreptimeValue { value_data }; + } + None => { + let default = transform.get_default(); + let value_data = match default { + Some(default) => coerce_value(default, transform)?, + None => None, + }; + values[output_index] = GreptimeValue { value_data }; + } + } + output_index += 1; + } + } + Ok(Row { values }) } fn transforms(&self) -> &Transforms { @@ -643,6 +645,7 @@ mod tests { use crate::etl::transform::transformer::greptime::{ flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, }; + use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; use crate::{identity_pipeline, Pipeline}; #[test] @@ -668,7 +671,7 @@ mod tests { "gaga": "gaga" }), ]; - let array = Pipeline::prepare(array).unwrap(); + let array = json_array_to_intermediate_state(array).unwrap(); let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); assert!(rows.is_err()); assert_eq!( @@ -698,7 +701,7 @@ mod tests { }), ]; let rows = identity_pipeline( - Pipeline::prepare(array).unwrap(), + json_array_to_intermediate_state(array).unwrap(), None, 
            &GreptimePipelineParams::default(),
         );
@@ -730,7 +733,7 @@
             }),
         ];
         let rows = identity_pipeline(
-            Pipeline::prepare(array).unwrap(),
+            json_array_to_intermediate_state(array).unwrap(),
             None,
             &GreptimePipelineParams::default(),
         );
@@ -764,7 +767,7 @@
         ];
         let tag_column_names = ["name".to_string(), "address".to_string()];
         let rows = identity_pipeline_inner(
-            Pipeline::prepare(array).unwrap(),
+            json_array_to_intermediate_state(array).unwrap(),
             Some(tag_column_names.iter()),
             &GreptimePipelineParams::default(),
         );
diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
index 5f448b386cbd..da345b3bdeb3 100644
--- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
+++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
@@ -71,12 +71,11 @@ impl TryFrom for ValueData {
     }
 }
 
-// TODO(yuanbohan): add fulltext support in datatype_extension
 pub(crate) fn coerce_columns(transform: &Transform) -> Result> {
     let mut columns = Vec::new();
 
-    for field in transform.real_fields.iter() {
-        let column_name = field.output_name().to_string();
+    for field in transform.fields.iter() {
+        let column_name = field.target_or_input_field().to_string();
 
         let (datatype, datatype_extension) = coerce_type(transform)?;
 
@@ -477,12 +476,14 @@ fn coerce_json_value(v: &Value, transform: &Transform) -> Result
Date: Fri, 24 Jan 2025 22:39:47 +0800
Subject: [PATCH 20/32] refactor: use updated prepare api

---
 src/pipeline/src/lib.rs       |  5 +++--
 src/servers/src/http/event.rs | 14 +++++++++-----
 src/servers/src/otlp/logs.rs  | 15 +++++++++------
 src/servers/src/pipeline.rs   |  7 +++----
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs
index 529908145f45..c0003d3f4ea1 100644
--- a/src/pipeline/src/lib.rs
+++ b/src/pipeline/src/lib.rs
@@ -26,8 +26,9 @@ pub use etl::transform::transformer::identity_pipeline;
 pub use etl::transform::{GreptimeTransformer, Transformer};
 pub use etl::value::{Array, Map, Value};
 pub use etl::{
-    error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineDefinition,
-    PipelineExecOutput, PipelineWay, SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME,
+    error as etl_error, json_array_to_intermediate_state, parse, Content, DispatchedTo, Pipeline,
+    PipelineDefinition, PipelineExecOutput, PipelineWay, SelectInfo,
+    GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME,
 };
 pub use manager::{
     error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef,
diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs
index e8f0d749f873..fc4ca58b7543 100644
--- a/src/servers/src/http/event.rs
+++ b/src/servers/src/http/event.rs
@@ -30,10 +30,11 @@ use common_telemetry::{error, warn};
 use datatypes::value::column_data_to_json;
 use headers::ContentType;
 use lazy_static::lazy_static;
+use pipeline::error::PipelineTransformSnafu;
 use pipeline::util::to_pipeline_version;
 use pipeline::{
-    GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineExecInput,
-    PipelineVersion, GREPTIME_PIPELINE_PARAMS_HEADER,
+    GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion,
+    GREPTIME_PIPELINE_PARAMS_HEADER,
 };
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Deserializer, Map, Value};
@@ -275,14 +276,15 @@ async fn dryrun_pipeline_inner(
     pipeline_handler: PipelineHandlerRef,
     query_ctx: &QueryContextRef,
 ) -> Result {
-    let db = query_ctx.get_db_string();
     let 
params = GreptimePipelineParams::default(); let results = run_pipeline( &pipeline_handler, PipelineDefinition::Resolved(pipeline), ¶ms, - Pipeline::prepare(value)?, + pipeline::json_array_to_intermediate_state(value) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?, "dry_run".to_owned(), query_ctx, true, @@ -603,7 +605,9 @@ pub(crate) async fn ingest_logs_inner( &state, PipelineDefinition::from_name(&pipeline_name, version), &pipeline_params, - Pipeline::prepare(request.values), + pipeline::json_array_to_intermediate_state(request.values) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?, request.table, &query_ctx, true, diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index 24232fcef01f..54f29c291621 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,14 +25,17 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::{GreptimePipelineParams, PipelineExecInput, PipelineWay, SchemaInfo, SelectInfo}; +use pipeline::error::PipelineTransformSnafu; +use pipeline::{GreptimePipelineParams, PipelineWay, SchemaInfo, SelectInfo}; use serde_json::{Map, Value}; use session::context::QueryContextRef; -use snafu::ensure; +use snafu::{ensure, ResultExt}; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; -use crate::error::{IncompatibleSchemaSnafu, Result, UnsupportedJsonDataTypeForTagSnafu}; +use crate::error::{ + IncompatibleSchemaSnafu, PipelineSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, +}; use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; @@ -70,9 +73,9 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); - let array = Pipeline::prepare(data)?; - - let db_string = query_ctx.get_db_string(); + let array = pipeline::json_array_to_intermediate_state(data) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; let inserts = run_pipeline( &pipeline_handler, diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index bee45d476404..bf7e949db37b 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -14,13 +14,12 @@ use std::collections::BTreeMap; use std::sync::Arc; -use std::time::Instant; -use api::v1::{Row, RowInsertRequest, Rows}; +use api::v1::{RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineDefinition, - PipelineExecInput, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, + PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; use session::context::QueryContextRef; use snafu::ResultExt; @@ -83,7 +82,7 @@ pub(crate) async fn run_pipeline( let transform_timer = std::time::Instant::now(); - let mut transformed = Vec::with_capacity(values.len()); + let mut transformed = Vec::with_capacity(array.len()); let mut dispatched: BTreeMap>> = BTreeMap::new(); From daa9ec163a1411980e516512334252f19ca9aba3 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sat, 25 Jan 2025 22:10:53 +0800 Subject: [PATCH 21/32] refactor: improve error and header name --- src/pipeline/src/etl/transform/transformer/greptime.rs | 4 
+--- src/pipeline/src/lib.rs | 4 +--- src/servers/src/error.rs | 9 +++++++++ src/servers/src/http/event.rs | 6 ++---- src/servers/src/http/extractor.rs | 6 +++--- src/servers/src/http/header.rs | 2 ++ src/servers/src/http/otlp.rs | 2 +- src/servers/src/otlp/logs.rs | 8 +++----- src/servers/src/pipeline.rs | 7 ++----- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index eeff061f755c..1d17472b9737 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -38,9 +38,6 @@ use crate::etl::transform::index::Index; use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; -/// The header key that contains the pipeline params. -pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; - const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; @@ -575,6 +572,7 @@ pub fn identity_pipeline( table: Option>, params: &GreptimePipelineParams, ) -> Result { + // TODO: flatten match table { Some(table) => { let table_info = table.table_info(); diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index c0003d3f4ea1..b4e3405cf154 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -19,9 +19,7 @@ mod metrics; pub use etl::error::Result; pub use etl::processor::Processor; -pub use etl::transform::transformer::greptime::{ - GreptimePipelineParams, SchemaInfo, GREPTIME_PIPELINE_PARAMS_HEADER, -}; +pub use etl::transform::transformer::greptime::{GreptimePipelineParams, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 375151bfec1e..a6ab75a3bcc7 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -157,6 +157,14 @@ pub enum Error { location: Location, }, + #[snafu(display("Pipeline transform error"))] + PipelineTransform { + #[snafu(source)] + source: pipeline::etl_error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Not supported: {}", feat))] NotSupported { feat: String }, @@ -619,6 +627,7 @@ impl ErrorExt for Error { | CheckDatabaseValidity { source, .. } => source.status_code(), Pipeline { source, .. } => source.status_code(), + PipelineTransform { source, .. } => source.status_code(), NotSupported { .. } | InvalidParameter { .. 
} diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index fc4ca58b7543..978891078cce 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -32,10 +32,7 @@ use headers::ContentType; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{ - GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion, - GREPTIME_PIPELINE_PARAMS_HEADER, -}; +use pipeline::{GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -45,6 +42,7 @@ use crate::error::{ status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu, }; +use crate::http::header::constants::GREPTIME_PIPELINE_PARAMS_HEADER; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; use crate::http::result::greptime_manage_resp::GreptimedbManageResponse; use crate::http::result::greptime_result_v1::GreptimedbV1Response; diff --git a/src/servers/src/http/extractor.rs b/src/servers/src/http/extractor.rs index ee662f36f615..ae578f21d302 100644 --- a/src/servers/src/http/extractor.rs +++ b/src/servers/src/http/extractor.rs @@ -23,7 +23,7 @@ use pipeline::{GreptimePipelineParams, SelectInfo}; use crate::http::header::constants::{ GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME, GREPTIME_LOG_TABLE_NAME_HEADER_NAME, - GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, + GREPTIME_PIPELINE_PARAMS_HEADER, GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, }; /// Axum extractor for optional target log table name from HTTP header @@ -91,7 +91,7 @@ where pub struct PipelineInfo { pub pipeline_name: Option, pub pipeline_version: Option, - pub pipeline_params: Option, + pub pipeline_params: GreptimePipelineParams, } impl FromRequestParts for PipelineInfo @@ -112,7 +112,7 @@ where Ok(PipelineInfo { pipeline_name, pipeline_version, - pipeline_params: pipeline_parameters.map(|v| GreptimePipelineParams::from_params(v)), + pipeline_params: GreptimePipelineParams::from_params(pipeline_parameters.as_deref()), }) } } diff --git a/src/servers/src/http/header.rs b/src/servers/src/http/header.rs index 51a07ca01f0c..e14ce6172958 100644 --- a/src/servers/src/http/header.rs +++ b/src/servers/src/http/header.rs @@ -50,6 +50,8 @@ pub mod constants { pub const GREPTIME_LOG_TABLE_NAME_HEADER_NAME: &str = "x-greptime-log-table-name"; pub const GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME: &str = "x-greptime-log-extract-keys"; pub const GREPTIME_TRACE_TABLE_NAME_HEADER_NAME: &str = "x-greptime-trace-table-name"; + /// The header key that contains the pipeline params. 
+ pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; } pub static GREPTIME_DB_HEADER_FORMAT: HeaderName = diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index a7efa4b7d32b..d8579fc960b3 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -126,7 +126,7 @@ pub async fn logs( PipelineWay::OtlpLogDirect(Box::new(select_info)) }; - let pipeline_params = pipeline_info.pipeline_params.unwrap_or_default(); + let pipeline_params = pipeline_info.pipeline_params; // here we use nightly feature `trait_upcasting` to convert handler to // pipeline_handler diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index 54f29c291621..5936bd40ad60 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,7 +25,6 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::error::PipelineTransformSnafu; use pipeline::{GreptimePipelineParams, PipelineWay, SchemaInfo, SelectInfo}; use serde_json::{Map, Value}; use session::context::QueryContextRef; @@ -34,7 +33,7 @@ use snafu::{ensure, ResultExt}; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; use crate::error::{ - IncompatibleSchemaSnafu, PipelineSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, + IncompatibleSchemaSnafu, PipelineTransformSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, }; use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; @@ -73,9 +72,8 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); - let array = pipeline::json_array_to_intermediate_state(data) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; + let array = + pipeline::json_array_to_intermediate_state(data).context(PipelineTransformSnafu)?; let inserts = run_pipeline( &pipeline_handler, diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index bf7e949db37b..4d16cb6c351f 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -16,7 +16,6 @@ use std::collections::BTreeMap; use std::sync::Arc; use api::v1::{RowInsertRequest, Rows}; -use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, @@ -24,7 +23,7 @@ use pipeline::{ use session::context::QueryContextRef; use snafu::ResultExt; -use crate::error::{CatalogSnafu, PipelineSnafu, Result}; +use crate::error::{CatalogSnafu, PipelineTransformSnafu, Result}; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, }; @@ -76,7 +75,6 @@ pub(crate) async fn run_pipeline( }] }) .context(PipelineTransformSnafu) - .context(PipelineSnafu) } else { let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; @@ -94,8 +92,7 @@ pub(crate) async fn run_pipeline( .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; + .context(PipelineTransformSnafu)?; match r { PipelineExecOutput::Transformed(row) => { From 
df5c35de3c37f67e40e642705209bee720b78aef Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 11:53:26 +0800 Subject: [PATCH 22/32] feat: port flatten to new api --- Cargo.lock | 4 +- .../src/etl/transform/transformer/greptime.rs | 53 ++++++++++--------- src/pipeline/src/etl/value/map.rs | 6 +++ 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb37b729e684..c7dbc90df646 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13483,7 +13483,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -13494,7 +13494,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 1d17472b9737..67015e4d5252 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -572,14 +572,22 @@ pub fn identity_pipeline( table: Option>, params: &GreptimePipelineParams, ) -> Result { - // TODO: flatten + let input = if params.flatten_json_object() { + array + .into_iter() + .map(|item| flatten_object(item, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)) + .collect::>>>()? + } else { + array + }; + match table { Some(table) => { let table_info = table.table_info(); let tag_column_names = table_info.meta.row_key_column_names(); - identity_pipeline_inner(array, Some(tag_column_names), params) + identity_pipeline_inner(input, Some(tag_column_names), params) } - None => identity_pipeline_inner(array, None::>, params), + None => identity_pipeline_inner(input, None::>, params), } } @@ -587,24 +595,24 @@ pub fn identity_pipeline( /// /// The `max_nested_levels` parameter is used to limit the nested levels of the JSON object. /// The error will be returned if the nested levels is greater than the `max_nested_levels`. -pub fn flatten_json_object( - object: Map, +pub fn flatten_object( + object: BTreeMap, max_nested_levels: usize, -) -> Result> { - let mut flattened = Map::new(); +) -> Result> { + let mut flattened = BTreeMap::new(); if !object.is_empty() { // it will use recursion to flatten the object. 
- do_flatten_json_object(&mut flattened, None, object, 1, max_nested_levels)?; + do_flatten_object(&mut flattened, None, object, 1, max_nested_levels)?; } Ok(flattened) } -fn do_flatten_json_object( - dest: &mut Map, +fn do_flatten_object( + dest: &mut BTreeMap, base: Option<&str>, - object: Map, + object: BTreeMap, current_level: usize, max_nested_levels: usize, ) -> Result<()> { @@ -617,11 +625,11 @@ fn do_flatten_json_object( let new_key = base.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); match value { - JsonValue::Object(object) => { - do_flatten_json_object( + Value::Map(object) => { + do_flatten_object( dest, Some(&new_key), - object, + object.values, current_level + 1, max_nested_levels, )?; @@ -640,9 +648,7 @@ fn do_flatten_json_object( mod tests { use api::v1::SemanticType; - use crate::etl::transform::transformer::greptime::{ - flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, - }; + use super::*; use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; use crate::{identity_pipeline, Pipeline}; @@ -864,14 +870,11 @@ mod tests { ]; for (input, max_depth, expected) in test_cases { - let flattened_object = - flatten_json_object(input.as_object().unwrap().clone(), max_depth); - match flattened_object { - Ok(flattened_object) => { - assert_eq!(&flattened_object, expected.unwrap().as_object().unwrap()) - } - Err(_) => assert_eq!(None, expected), - } + let input = json_to_intermediate_state(input).unwrap(); + let expected = expected.map(|e| json_to_intermediate_state(e).unwrap()); + + let flattened_object = flatten_object(input, max_depth).ok(); + assert_eq!(flattened_object, expected); } } diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs index 004a617b0f9c..9e730ef532d8 100644 --- a/src/pipeline/src/etl/value/map.rs +++ b/src/pipeline/src/etl/value/map.rs @@ -49,6 +49,12 @@ impl From> for Map { } } +impl From> for Map { + fn from(values: BTreeMap) -> Self { + Self { values } + } +} + impl std::ops::Deref for Map { type Target = BTreeMap; From 8e1b6e920c1d98f9f975575925dfe64329d0fbfb Mon Sep 17 00:00:00 2001 From: paomian Date: Sun, 26 Jan 2025 11:56:07 +0800 Subject: [PATCH 23/32] chore: update pipeline api --- src/pipeline/benches/processor.rs | 6 ++---- src/pipeline/tests/common.rs | 12 +++++------- src/pipeline/tests/dissect.rs | 4 ++-- src/pipeline/tests/pipeline.rs | 29 +++++++++-------------------- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 01d1a293d66e..ba7240b9d527 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -13,24 +13,22 @@ // limitations under the License. use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Result}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline, Result}; use serde_json::{Deserializer, Value}; fn processor_mut( pipeline: &Pipeline, input_values: Vec, ) -> Result> { - let mut payload = pipeline.init_intermediate_state(); let mut result = Vec::with_capacity(input_values.len()); for v in input_values { - pipeline.prepare(v, &mut payload)?; + let mut payload = json_to_intermediate_state(v).unwrap(); let r = pipeline .exec_mut(&mut payload)? 
.into_transformed() .expect("expect transformed result "); result.push(r); - pipeline.reset_intermediate_state(&mut payload); } Ok(result) diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index 781c3a30fe0f..89bebbf85bb9 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -13,7 +13,7 @@ // limitations under the License. use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; /// test util function to parse and execute pipeline pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { @@ -22,7 +22,6 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); let schema = pipeline.schemas().clone(); @@ -31,20 +30,19 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { match input_value { serde_json::Value::Array(array) => { for value in array { - pipeline.prepare(value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(value).unwrap(); let row = pipeline - .exec_mut(&mut result) + .exec_mut(&mut intermediate_status) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); rows.push(row); - pipeline.reset_intermediate_state(&mut result); } } serde_json::Value::Object(_) => { - pipeline.prepare(input_value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline - .exec_mut(&mut result) + .exec_mut(&mut intermediate_status) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 56386d0e860a..a93112d68945 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -16,6 +16,7 @@ mod common; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; +use pipeline::json_to_intermediate_state; fn make_string_column_schema(name: String) -> greptime_proto::v1::ColumnSchema { common::make_column_schema(name, ColumnDataType::String, SemanticType::Field) @@ -273,9 +274,8 @@ transform: let yaml_content = pipeline::Content::Yaml(pipeline_yaml); let pipeline: pipeline::Pipeline = pipeline::parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); + let mut result = json_to_intermediate_state(input_value).unwrap(); - pipeline.prepare(input_value, &mut result).unwrap(); let row = pipeline.exec_mut(&mut result); assert!(row.is_err()); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index f0fa3992e4bf..c34187c80c91 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -20,7 +20,7 @@ use greptime_proto::v1::value::ValueData::{ U32Value, U64Value, U8Value, }; use greptime_proto::v1::Value as GreptimeValue; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; #[test] fn test_complex_data() { @@ -420,10 +420,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = 
parse(&yaml_content).expect("failed to parse pipeline"); - let mut stats = pipeline.init_intermediate_state(); - pipeline - .prepare(input_value, &mut stats) - .expect("failed to prepare pipeline"); + let mut stats = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut stats) @@ -492,8 +489,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -601,9 +597,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -668,8 +662,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -708,8 +701,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) @@ -768,8 +760,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -841,8 +832,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value1, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value1).unwrap(); let dispatched_to = pipeline .exec_mut(&mut status) .unwrap() @@ -851,8 +841,7 @@ transform: assert_eq!(dispatched_to.table_part, "http"); assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value2, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value2).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() From c803209e4e5028e11dc29a85bfb42cb43443148e Mon Sep 17 00:00:00 2001 From: paomian Date: Sun, 26 Jan 2025 15:22:41 +0800 Subject: [PATCH 24/32] chore: fix transform and some pipeline test --- src/pipeline/src/etl.rs | 458 +++++++++--------- .../src/etl/transform/transformer/greptime.rs | 53 +- src/pipeline/src/lib.rs | 6 +- 3 files changed, 244 insertions(+), 273 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index bca33a607f6e..f302e655a816 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -272,242 +272,228 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; - // #[test] - // fn 
test_pipeline_prepare() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' - // processors: - // - csv: - // field: my_field - // target_fields: field1, field2 - // transform: - // - field: field1 - // type: uint32 - // - field: field2 - // type: uint32 - // "#; - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut payload).unwrap(); - // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - // assert_eq!( - // payload, - // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - // ); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - - // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - // match &result.values[2].value_data { - // Some(ValueData::TimestampNanosecondValue(v)) => { - // assert_ne!(*v, 0); - // } - // _ => panic!("expect null value"), - // } - // } - - // #[test] - // fn test_dissect_pipeline() { - // let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); - // let pipeline_str = r#"processors: - // - dissect: - // fields: - // - message - // patterns: - // - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - // - timestamp: - // fields: - // - ts - // formats: - // - "%d/%b/%Y:%H:%M:%S %z" - - // transform: - // - fields: - // - ip - // - username - // - method - // - path - // - proto - // type: string - // - fields: - // - status - // type: uint16 - // - fields: - // - bytes - // type: uint32 - // - field: ts - // type: timestamp, ns - // index: time"#; - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline - // .prepare(serde_json::Value::String(message), &mut payload) - // .unwrap(); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - // let sechema = pipeline.schemas(); - - // assert_eq!(sechema.len(), result.values.len()); - // let test = vec![ - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("129.37.245.88".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("meln1ks".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("PATCH".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue( - // "/observability/metrics/production".into(), - // )), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("HTTP/1.0".into())), - // ), - // ( - // ColumnDataType::Uint16 as i32, - // Some(ValueData::U16Value(501)), - // ), - // ( - // ColumnDataType::Uint32 as i32, - // Some(ValueData::U32Value(33085)), - // ), - // ( - // ColumnDataType::TimestampNanosecond as i32, - // Some(ValueData::TimestampNanosecondValue(1722493367000000000)), - // ), - // ]; - // for i in 0..sechema.len() { - // let schema = &sechema[i]; - // let value = &result.values[i]; - // assert_eq!(schema.datatype, test[i].0); - // 
assert_eq!(value.value_data, test[i].1); - // } - // } - - // #[test] - // fn test_csv_pipeline() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#" - // description: Pipeline for Apache Tomcat - // processors: - // - csv: - // field: my_field - // target_fields: field1, field2 - // transform: - // - field: field1 - // type: uint32 - // - field: field2 - // type: uint32 - // "#; - - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut payload).unwrap(); - // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - // assert_eq!( - // payload, - // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - // ); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - // match &result.values[2].value_data { - // Some(ValueData::TimestampNanosecondValue(v)) => { - // assert_ne!(*v, 0); - // } - // _ => panic!("expect null value"), - // } - // } - - // #[test] - // fn test_date_pipeline() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar", - // "test_time": "2014-5-17T04:34:56+00:00" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#" - // --- - // description: Pipeline for Apache Tomcat - - // processors: - // - timestamp: - // field: test_time - - // transform: - // - field: test_time - // type: timestamp, ns - // index: time - // "#; - - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let schema = pipeline.schemas().clone(); - // let mut result = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut result).unwrap(); - // let row = pipeline - // .exec_mut(&mut result) - // .unwrap() - // .into_transformed() - // .unwrap(); - // let output = Rows { - // schema, - // rows: vec![row], - // }; - // let schemas = output.schema; - - // assert_eq!(schemas.len(), 1); - // let schema = schemas[0].clone(); - // assert_eq!("test_time", schema.column_name); - // assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); - // assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - - // let row = output.rows[0].clone(); - // assert_eq!(1, row.values.len()); - // let value_data = row.values[0].clone().value_data; - // assert_eq!( - // Some(v1::value::ValueData::TimestampNanosecondValue( - // 1400301296000000000 - // )), - // value_data - // ); - // } + #[test] + fn test_pipeline_prepare() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' + processors: + - csv: + field: my_field + target_fields: field1, field2 + transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + 
.unwrap(); + + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + assert_ne!(*v, 0); + } + _ => panic!("expect null value"), + } + } + + #[test] + fn test_dissect_pipeline() { + let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); + let pipeline_str = r#"processors: + - dissect: + fields: + - message + patterns: + - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + - timestamp: + fields: + - ts + formats: + - "%d/%b/%Y:%H:%M:%S %z" + + transform: + - fields: + - ip + - username + - method + - path + - proto + type: string + - fields: + - status + type: uint16 + - fields: + - bytes + type: uint32 + - field: ts + type: timestamp, ns + index: time"#; + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); + let mut payload = BTreeMap::new(); + payload.insert("message".to_string(), Value::String(message)); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); + let sechema = pipeline.schemas(); + + assert_eq!(sechema.len(), result.values.len()); + let test = vec![ + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("129.37.245.88".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("meln1ks".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("PATCH".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue( + "/observability/metrics/production".into(), + )), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("HTTP/1.0".into())), + ), + ( + ColumnDataType::Uint16 as i32, + Some(ValueData::U16Value(501)), + ), + ( + ColumnDataType::Uint32 as i32, + Some(ValueData::U32Value(33085)), + ), + ( + ColumnDataType::TimestampNanosecond as i32, + Some(ValueData::TimestampNanosecondValue(1722493367000000000)), + ), + ]; + for i in 0..sechema.len() { + let schema = &sechema[i]; + let value = &result.values[i]; + assert_eq!(schema.datatype, test[i].0); + assert_eq!(value.value_data, test[i].1); + } + } + + #[test] + fn test_csv_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#" + description: Pipeline for Apache Tomcat + processors: + - csv: + field: my_field + target_fields: field1, field2 + transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + assert_ne!(*v, 0); + } + _ => panic!("expect null value"), + } + } + + #[test] + fn test_date_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar", + "test_time": "2014-5-17T04:34:56+00:00" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let 
pipeline_yaml = r#" + --- + description: Pipeline for Apache Tomcat + + processors: + - timestamp: + field: test_time + + transform: + - field: test_time + type: timestamp, ns + index: time + "#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let schema = pipeline.schemas().clone(); + let mut result = json_to_intermediate_state(input_value).unwrap(); + + let row = pipeline + .exec_mut(&mut result) + .unwrap() + .into_transformed() + .unwrap(); + let output = Rows { + schema, + rows: vec![row], + }; + let schemas = output.schema; + + assert_eq!(schemas.len(), 1); + let schema = schemas[0].clone(); + assert_eq!("test_time", schema.column_name); + assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); + assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + + let row = output.rows[0].clone(); + assert_eq!(1, row.values.len()); + let value_data = row.values[0].clone().value_data; + assert_eq!( + Some(v1::value::ValueData::TimestampNanosecondValue( + 1400301296000000000 + )), + value_data + ); + } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 67015e4d5252..749806261a02 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -25,7 +25,7 @@ use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, Semant use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; -use serde_json::{Map, Number, Value as JsonValue}; +use serde_json::Number; use crate::etl::error::{ IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, @@ -33,9 +33,10 @@ use crate::etl::error::{ TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; +use crate::etl::field::{Field, Fields}; use crate::etl::processor::IntermediateStatus; use crate::etl::transform::index::Index; -use crate::etl::transform::{Transformer, Transforms}; +use crate::etl::transform::{Transform, Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; @@ -83,37 +84,21 @@ impl GreptimePipelineParams { impl GreptimeTransformer { /// Add a default timestamp column to the transforms fn add_greptime_timestamp_column(transforms: &mut Transforms) { - // let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); - // let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); - // let default = Some(type_.clone()); - - // let transform = Transform { - // real_fields: vec![OneInputOneOutputField::new( - // InputFieldInfo { - // name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - // index: usize::MAX, - // }, - // ( - // DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - // transforms - // .transforms - // .iter() - // .map(|x| x.real_fields.len()) - // .sum(), - // ), - // )], - // type_, - // default, - // index: Some(Index::Time), - // on_failure: Some(crate::etl::transform::OnFailure::Default), - // }; - // let required_keys = transforms.required_keys_mut(); - // required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - // let output_keys = transforms.output_keys_mut(); - // output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - // transforms.push(transform); - todo!() + let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); + 
let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); + let default = Some(type_.clone()); + + let transform = Transform { + fields: Fields::one(Field::new( + DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + None, + )), + type_, + default, + index: Some(Index::Time), + on_failure: Some(crate::etl::transform::OnFailure::Default), + }; + transforms.push(transform); } /// Generate the schema for the GreptimeTransformer @@ -650,7 +635,7 @@ mod tests { use super::*; use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; - use crate::{identity_pipeline, Pipeline}; + use crate::identity_pipeline; #[test] fn test_identify_pipeline() { diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index b4e3405cf154..a6c82f9353cf 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -24,9 +24,9 @@ pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; pub use etl::{ - error as etl_error, json_array_to_intermediate_state, parse, Content, DispatchedTo, Pipeline, - PipelineDefinition, PipelineExecOutput, PipelineWay, SelectInfo, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, + error as etl_error, json_array_to_intermediate_state, json_to_intermediate_state, parse, + Content, DispatchedTo, Pipeline, PipelineDefinition, PipelineExecOutput, PipelineWay, + SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, From 9de70d3910cf08989e61fedfa8b5af4b4ad84d5d Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 17:07:09 +0800 Subject: [PATCH 25/32] refactor: reimplement cmcd --- src/pipeline/src/etl/processor.rs | 8 +- src/pipeline/src/etl/processor/cmcd.rs | 251 ++++++------------------- 2 files changed, 60 insertions(+), 199 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 63854ad552a7..b20258f20818 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// pub mod cmcd; +pub mod cmcd; // pub mod csv; pub mod date; pub mod decolorize; @@ -29,7 +29,7 @@ pub mod urlencoding; use std::collections::BTreeMap; -// use cmcd::CmcdProcessor; +use cmcd::CmcdProcessor; // use csv::CsvProcessor; use date::DateProcessor; use decolorize::DecolorizeProcessor; @@ -89,7 +89,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { #[derive(Debug)] #[enum_dispatch] pub enum ProcessorKind { - // Cmcd(CmcdProcessor), + Cmcd(CmcdProcessor), // Csv(CsvProcessor), // Dissect(DissectProcessor), Gsub(GsubProcessor), @@ -156,7 +156,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; let processor = match str_key { - // cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), + cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 944487472691..37df6e8fbec1 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -18,7 +18,6 @@ use std::collections::BTreeMap; -use ahash::HashSet; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; @@ -27,11 +26,10 @@ use crate::etl::error::{ FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Field, Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -79,139 +77,6 @@ const CMCD_KEYS: [&str; 18] = [ CMCD_KEY_V, ]; -/// CmcdProcessorBuilder is a builder for CmcdProcessor -/// parse from raw yaml -#[derive(Debug, Default)] -pub struct CmcdProcessorBuilder { - fields: Fields, - output_keys: HashSet, - ignore_missing: bool, -} - -impl CmcdProcessorBuilder { - /// build_cmcd_outputs build cmcd output info - /// generate index and function for each output - pub(super) fn build_cmcd_outputs( - field: &Field, - intermediate_keys: &[String], - ) -> Result<(BTreeMap, Vec)> { - let mut output_index = BTreeMap::new(); - let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len()); - for cmcd in CMCD_KEYS { - let final_key = generate_key(field.target_or_input_field(), cmcd); - let index = find_key_index(intermediate_keys, &final_key, "cmcd")?; - output_index.insert(final_key.clone(), index); - match cmcd { - CMCD_KEY_BS | CMCD_KEY_SU => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP - | CMCD_KEY_RTP | CMCD_KEY_TB => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID - | CMCD_KEY_ST | CMCD_KEY_V => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v); - 
cmcd_field_outputs.push(output_info); - } - CMCD_KEY_NOR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_PR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr); - cmcd_field_outputs.push(output_info); - } - _ => {} - } - } - Ok((output_index, cmcd_field_outputs)) - } - - /// build CmcdProcessor from CmcdProcessorBuilder - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len()); - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - - let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; - - cmcd_outputs.push(cmcd_field_outputs); - - let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(real_field); - } - Ok(CmcdProcessor { - fields: real_fields, - cmcd_outputs, - ignore_missing: self.ignore_missing, - }) - } -} - -impl ProcessorBuilder for CmcdProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Cmcd) - } -} - -fn generate_key(prefix: &str, key: &str) -> String { - format!("{}_{}", prefix, key) -} - -/// CmcdOutputInfo is a struct to store output info -#[derive(Debug)] -pub(super) struct CmcdOutputInfo { - /// {input_field}_{cmcd_key} - final_key: String, - /// cmcd key - key: &'static str, - /// index in intermediate_keys - index: usize, - /// function to resolve value - f: fn(&str, &str, Option<&str>) -> Result, -} - -impl CmcdOutputInfo { - fn new( - final_key: String, - key: &'static str, - index: usize, - f: fn(&str, &str, Option<&str>) -> Result, - ) -> Self { - Self { - final_key, - key, - index, - f, - } - } -} - -impl Default for CmcdOutputInfo { - fn default() -> Self { - Self { - final_key: String::default(), - key: "", - index: 0, - f: |_, _, _| Ok(Value::Null), - } - } -} - /// function to resolve CMCD_KEY_BS | CMCD_KEY_SU fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { Ok(Value::Boolean(true)) @@ -288,9 +153,7 @@ fn pr(s: &str, k: &str, v: Option<&str>) -> Result { /// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data. 
#[derive(Debug, Default)] pub struct CmcdProcessor { - fields: Vec, - cmcd_outputs: Vec>, - + fields: Fields, ignore_missing: bool, } @@ -299,27 +162,52 @@ impl CmcdProcessor { format!("{}_{}", prefix, key) } - fn parse(&self, field_index: usize, s: &str) -> Result> { - let parts = s.split(','); - let mut result = Vec::new(); + fn parse(&self, name: &str, value: &str) -> Result> { + let mut working_set = BTreeMap::new(); + + let parts = value.split(','); + for part in parts { let mut kv = part.split('='); - let k = kv.next().context(CmcdMissingKeySnafu { part, s })?; + let k = kv.next().context(CmcdMissingKeySnafu { part, s: value })?; let v = kv.next(); - for cmcd_key in self.cmcd_outputs[field_index].iter() { - if cmcd_key.key == k { - let val = (cmcd_key.f)(s, k, v)?; - result.push((cmcd_key.index, val)); + for cmcd_key in CMCD_KEYS { + if cmcd_key == k { + match cmcd_key { + CMCD_KEY_BS | CMCD_KEY_SU => { + working_set + .insert(Self::generate_key(name, cmcd_key), bs_su(value, k, v)?); + } + CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP + | CMCD_KEY_RTP | CMCD_KEY_TB => { + working_set + .insert(Self::generate_key(name, cmcd_key), br_tb(value, k, v)?); + } + CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID + | CMCD_KEY_ST | CMCD_KEY_V => { + working_set + .insert(Self::generate_key(name, cmcd_key), cid_v(value, k, v)?); + } + CMCD_KEY_NOR => { + working_set + .insert(Self::generate_key(name, cmcd_key), nor(value, k, v)?); + } + CMCD_KEY_PR => { + working_set + .insert(Self::generate_key(name, cmcd_key), pr(value, k, v)?); + } + + _ => {} + } } } } - - Ok(result) + Ok(working_set) } } -impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -346,22 +234,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { } } - let output_keys = fields - .iter() - .flat_map(|f| { - CMCD_KEYS - .iter() - .map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key)) - }) - .collect(); - - let builder = CmcdProcessorBuilder { + let proc = CmcdProcessor { fields, - output_keys, ignore_missing, }; - Ok(builder) + Ok(proc) } } @@ -375,20 +253,19 @@ impl Processor for CmcdProcessor { } fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let field_value_index = field.input_index(); - match val.get(field_value_index) { - Some(Value::String(v)) => { - let result_list = self.parse(field_index, v)?; - for (output_index, v) in result_list { - val[output_index] = v; - } + for field in self.fields.iter() { + let name = field.input_field(); + + match val.get(name) { + Some(Value::String(s)) => { + let results = self.parse(name, s)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -402,6 +279,7 @@ impl Processor for CmcdProcessor { } } } + Ok(()) } } @@ -412,9 +290,9 @@ mod tests { use urlencoding::decode; - use super::{CmcdProcessorBuilder, CMCD_KEYS}; + use super::CmcdProcessor; use crate::etl::field::{Field, Fields}; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_cmcd() { @@ -548,37 +426,20 @@ mod tests { let field = Field::new("prefix", None); - let output_keys = CMCD_KEYS - .iter() - .map(|k| 
format!("prefix_{}", k)) - .collect::>(); - - let mut intermediate_keys = vec!["prefix".to_string()]; - intermediate_keys.append(&mut (output_keys.clone())); - - let builder = CmcdProcessorBuilder { + let processor = CmcdProcessor { fields: Fields::new(vec![field]), - output_keys: output_keys.iter().map(|s| s.to_string()).collect(), ignore_missing: false, }; - let processor = builder.build(&intermediate_keys).unwrap(); - for (s, vec) in ss.into_iter() { let decoded = decode(s).unwrap().to_string(); - let values = vec + let expected = vec .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect::>(); - let expected = Map { values }; - let actual = processor.parse(0, &decoded).unwrap(); - let actual = actual - .into_iter() - .map(|(index, value)| (intermediate_keys[index].clone(), value)) - .collect::>(); - let actual = Map { values: actual }; + let actual = processor.parse("prefix", &decoded).unwrap(); assert_eq!(actual, expected); } } From f289318ba84f86a3818167cd3900c6b7bd7ca492 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 17:30:18 +0800 Subject: [PATCH 26/32] refactor: update csv processor --- src/pipeline/src/etl/processor.rs | 8 +- src/pipeline/src/etl/processor/csv.rs | 171 ++++++-------------------- 2 files changed, 45 insertions(+), 134 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index b20258f20818..1e19f194abc3 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -13,7 +13,7 @@ // limitations under the License. pub mod cmcd; -// pub mod csv; +pub mod csv; pub mod date; pub mod decolorize; pub mod digest; @@ -30,7 +30,7 @@ pub mod urlencoding; use std::collections::BTreeMap; use cmcd::CmcdProcessor; -// use csv::CsvProcessor; +use csv::CsvProcessor; use date::DateProcessor; use decolorize::DecolorizeProcessor; use digest::DigestProcessor; @@ -90,7 +90,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { #[enum_dispatch] pub enum ProcessorKind { Cmcd(CmcdProcessor), - // Csv(CsvProcessor), + Csv(CsvProcessor), // Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), @@ -157,7 +157,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let processor = match str_key { cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), - // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), + csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index 86f39fc89369..a0fac70de15c 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -14,7 +14,8 @@ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html -use ahash::HashSet; +use std::collections::BTreeMap; + use csv::{ReaderBuilder, Trim}; use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::Itertools; @@ -24,11 +25,10 @@ use crate::etl::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; 
+use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -40,76 +40,17 @@ const TRIM_NAME: &str = "trim"; const EMPTY_VALUE_NAME: &str = "empty_value"; const TARGET_FIELDS: &str = "target_fields"; -#[derive(Debug, Default)] -pub struct CsvProcessorBuilder { - reader: ReaderBuilder, - - fields: Fields, - ignore_missing: bool, - - // Value used to fill empty fields, empty fields will be skipped if this is not provided. - empty_value: Option, - target_fields: Vec, - // description - // if - // ignore_failure - // on_failure - // tag -} - -impl CsvProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - let real_field = OneInputMultiOutputField::new(input_field_info, None); - real_fields.push(real_field); - } - - let output_index_info = self - .target_fields - .iter() - .map(|f| find_key_index(intermediate_keys, f, "csv")) - .collect::>>()?; - Ok(CsvProcessor { - reader: self.reader, - fields: real_fields, - ignore_missing: self.ignore_missing, - empty_value: self.empty_value, - output_index_info, - }) - } -} - -impl ProcessorBuilder for CsvProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.target_fields.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Csv) - } -} - /// only support string value -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CsvProcessor { reader: ReaderBuilder, - - fields: Vec, + fields: Fields, ignore_missing: bool, // Value used to fill empty fields, empty fields will be skipped if this is not provided. 
empty_value: Option, - output_index_info: Vec, + target_fields: Vec, // description // if // ignore_failure @@ -119,18 +60,20 @@ pub struct CsvProcessor { impl CsvProcessor { // process the csv format string to a map with target_fields as keys - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let mut reader = self.reader.from_reader(val.as_bytes()); if let Some(result) = reader.records().next() { let record: csv::StringRecord = result.context(CsvReadSnafu)?; - let values: Vec<(usize, Value)> = self - .output_index_info + let values = self + .target_fields .iter() .zip_longest(record.iter()) .filter_map(|zipped| match zipped { - Both(target_field, val) => Some((*target_field, Value::String(val.into()))), + Both(target_field, val) => { + Some((target_field.clone(), Value::String(val.into()))) + } // if target fields are more than extracted fields, fill the rest with empty value Left(target_field) => { let value = self @@ -138,7 +81,7 @@ impl CsvProcessor { .as_ref() .map(|s| Value::String(s.clone())) .unwrap_or(Value::Null); - Some((*target_field, value)) + Some((target_field.clone(), value)) } // if extracted fields are more than target fields, ignore the rest Right(_) => None, @@ -152,7 +95,7 @@ impl CsvProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -224,8 +167,8 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { _ => {} } } - let builder = { - CsvProcessorBuilder { + let proc = { + CsvProcessor { reader, fields, ignore_missing, @@ -234,7 +177,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { } }; - Ok(builder) + Ok(proc) } } @@ -247,21 +190,20 @@ impl Processor for CsvProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); - match val.get(index) { + let name = field.input_field(); + + match val.get(name) { Some(Value::String(v)) => { - let resule_list = self.process(v)?; - for (k, v) in resule_list { - val[k] = v; - } + let results = self.process(v)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -282,37 +224,28 @@ impl Processor for CsvProcessor { #[cfg(test)] mod tests { - use ahash::HashMap; - - use super::Value; - use crate::etl::processor::csv::CsvProcessorBuilder; + use super::*; + use crate::etl::field::Field; #[test] fn test_equal_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), ("b".into(), Value::String("2".into())), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -324,21 
+257,14 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -346,7 +272,7 @@ mod tests { ("c".into(), Value::Null), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -355,22 +281,15 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -389,22 +308,14 @@ mod tests { fn test_target_fields_has_less_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, target_fields: vec!["a".into(), "b".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), From 60a6421d16eb1bdcbdfde67dc0b3a89661125171 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 18:05:40 +0800 Subject: [PATCH 27/32] fmt: update format --- src/pipeline/src/etl/processor/cmcd.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 37df6e8fbec1..8d8b546f7216 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -21,6 +21,7 @@ use std::collections::BTreeMap; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; +use super::IntermediateStatus; use crate::etl::error::{ CmcdMissingKeySnafu, CmcdMissingValueSnafu, Error, FailedToParseFloatKeySnafu, FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, @@ -33,8 +34,6 @@ use crate::etl::processor::{ }; use crate::etl::value::Value; -use super::IntermediateStatus; - pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; const CMCD_KEY_BR: &str = "br"; // Encoded bitrate, Integer kbps From 1a29cfee5284c65c97fa380c4442c68a4bb0e5f7 Mon Sep 17 00:00:00 2001 From: paomian Date: Mon, 27 Jan 2025 11:13:23 +0800 Subject: [PATCH 28/32] chore: fix regex and dissect processor --- src/pipeline/src/etl.rs | 91 
++- src/pipeline/src/etl/processor.rs | 14 +- src/pipeline/src/etl/processor/dissect.rs | 642 +++++++++------------- src/pipeline/src/etl/processor/regex.rs | 301 ++-------- src/pipeline/tests/regex.rs | 2 + src/servers/src/http/event.rs | 2 +- tests-integration/tests/http.rs | 2 +- 7 files changed, 381 insertions(+), 673 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index f302e655a816..cac5c44c17be 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -283,15 +283,15 @@ mod tests { let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' - processors: - - csv: - field: my_field - target_fields: field1, field2 - transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 +processors: + - csv: + field: my_field + target_fields: field1, field2 +transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let mut payload = json_to_intermediate_state(input_value).unwrap(); @@ -315,34 +315,34 @@ mod tests { fn test_dissect_pipeline() { let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); let pipeline_str = r#"processors: - - dissect: - fields: - - message - patterns: - - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - - timestamp: - fields: - - ts - formats: - - "%d/%b/%Y:%H:%M:%S %z" + - dissect: + fields: + - message + patterns: + - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + - timestamp: + fields: + - ts + formats: + - "%d/%b/%Y:%H:%M:%S %z" - transform: - - fields: - - ip - - username - - method - - path - - proto - type: string - - fields: - - status - type: uint16 - - fields: - - bytes - type: uint32 - - field: ts - type: timestamp, ns - index: time"#; +transform: + - fields: + - ip + - username + - method + - path + - proto + type: string + - fields: + - status + type: uint16 + - fields: + - bytes + type: uint32 + - field: ts + type: timestamp, ns + index: time"#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); let mut payload = BTreeMap::new(); payload.insert("message".to_string(), Value::String(message)); @@ -449,18 +449,17 @@ mod tests { "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - let pipeline_yaml = r#" - --- - description: Pipeline for Apache Tomcat + let pipeline_yaml = r#"--- +description: Pipeline for Apache Tomcat - processors: - - timestamp: - field: test_time +processors: + - timestamp: + field: test_time - transform: - - field: test_time - type: timestamp, ns - index: time +transform: + - field: test_time + type: timestamp, ns + index: time "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 1e19f194abc3..376282afecef 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -17,13 +17,13 @@ pub mod csv; pub mod date; pub mod decolorize; pub mod digest; -// pub mod dissect; +pub mod dissect; pub mod epoch; pub mod gsub; pub mod join; pub mod json_path; pub mod letter; -// pub mod regex; +pub mod regex; pub mod timestamp; pub mod urlencoding; @@ -34,14 +34,14 @@ use csv::CsvProcessor; use date::DateProcessor; use 
decolorize::DecolorizeProcessor; use digest::DigestProcessor; -// use dissect::DissectProcessor; +use dissect::DissectProcessor; use enum_dispatch::enum_dispatch; use epoch::EpochProcessor; use gsub::GsubProcessor; use join::JoinProcessor; use json_path::JsonPathProcessor; use letter::LetterProcessor; -// use regex::RegexProcessor; +use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; use timestamp::TimestampProcessor; use urlencoding::UrlEncodingProcessor; @@ -95,7 +95,7 @@ pub enum ProcessorKind { Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), - // Regex(RegexProcessor), + Regex(RegexProcessor), Timestamp(TimestampProcessor), UrlEncoding(UrlEncodingProcessor), Epoch(EpochProcessor), @@ -158,13 +158,13 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let processor = match str_key { cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), - // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), + dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), - // regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), + regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), timestamp::PROCESSOR_TIMESTAMP => { ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) 
} diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 13ad9175e7df..5755a0aeb8a5 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -18,6 +18,7 @@ use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use itertools::Itertools; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ DissectAppendOrderAlreadySetSnafu, DissectConsecutiveNamesSnafu, DissectEmptyPatternSnafu, DissectEndModifierAlreadySetSnafu, DissectInvalidPatternSnafu, DissectModifierAlreadySetSnafu, @@ -25,12 +26,10 @@ use crate::etl::error::{ DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, - Processor, ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, - PATTERNS_NAME, PATTERN_NAME, + Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -69,14 +68,7 @@ impl std::fmt::Display for EndModifier { } } -#[derive(Debug, PartialEq, Default)] -struct NameInfo { - name: String, - start_modifier: Option, - end_modifier: Option, -} - -impl NameInfo { +impl Name { fn is_name_empty(&self) -> bool { self.name.is_empty() } @@ -140,26 +132,9 @@ impl NameInfo { } } -impl std::fmt::Display for NameInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name) - } -} - -impl From<&str> for NameInfo { - fn from(value: &str) -> Self { - NameInfo { - name: value.to_string(), - start_modifier: None, - end_modifier: None, - } - } -} - #[derive(Debug, PartialEq, Default)] struct Name { name: String, - index: usize, start_modifier: Option, end_modifier: Option, } @@ -170,57 +145,12 @@ impl std::fmt::Display for Name { } } -impl From for Name { - fn from(value: NameInfo) -> Self { +impl From<&str> for Name { + fn from(value: &str) -> Self { Name { - name: value.name, - index: 0, - start_modifier: value.start_modifier, - end_modifier: value.end_modifier, - } - } -} - -impl Name { - fn is_name_empty(&self) -> bool { - self.name.is_empty() - } - - fn is_empty(&self) -> bool { - self.name.is_empty() && self.start_modifier.is_none() && self.end_modifier.is_none() - } - - fn is_end_modifier_set(&self) -> bool { - self.end_modifier.is_some() - } -} - -#[derive(Debug, PartialEq)] -enum PartInfo { - Split(String), - Name(NameInfo), -} - -impl PartInfo { - fn is_empty(&self) -> bool { - match self { - PartInfo::Split(v) => v.is_empty(), - PartInfo::Name(v) => v.is_empty(), - } - } - - fn empty_split() -> Self { - PartInfo::Split(String::new()) - } - - fn empty_name() -> Self { - PartInfo::Name(NameInfo::default()) - } - - fn push(&mut self, ch: char) { - match self { - PartInfo::Split(v) => v.push(ch), - PartInfo::Name(v) => v.name.push(ch), + name: value.to_string(), + start_modifier: None, + end_modifier: None, } } } @@ -246,13 +176,11 @@ impl Part { fn empty_name() -> Self { Part::Name(Name::default()) } -} -impl From for Part { - fn from(value: PartInfo) -> Self { - match value { - PartInfo::Split(v) => Part::Split(v), - PartInfo::Name(v) => Part::Name(v.into()), + fn push(&mut self, ch: char) { + 
match self { + Part::Split(v) => v.push(ch), + Part::Name(v) => v.name.push(ch), } } } @@ -271,42 +199,12 @@ impl Deref for Pattern { } } -impl From for Pattern { - fn from(value: PatternInfo) -> Self { - let parts = value.parts.into_iter().map(|x| x.into()).collect(); - Pattern { - origin: value.origin, - parts, - } - } -} - -#[derive(Debug, Default)] -struct PatternInfo { - origin: String, - parts: Vec, -} - -impl std::ops::Deref for PatternInfo { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.parts - } -} - -impl std::ops::DerefMut for PatternInfo { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.parts - } -} - -impl std::str::FromStr for PatternInfo { +impl std::str::FromStr for Pattern { type Err = Error; fn from_str(s: &str) -> Result { let mut parts = vec![]; - let mut cursor = PartInfo::empty_split(); + let mut cursor = Part::empty_split(); let origin = s.to_string(); let chars: Vec = origin.chars().collect(); @@ -316,27 +214,27 @@ impl std::str::FromStr for PatternInfo { let ch = chars[pos]; match (ch, &mut cursor) { // if cursor is Split part, and found %{, then ready to start a Name part - ('%', PartInfo::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { + ('%', Part::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { if !cursor.is_empty() { parts.push(cursor); } - cursor = PartInfo::empty_name(); + cursor = Part::empty_name(); pos += 1; // skip '{' } // if cursor is Split part, and not found % or {, then continue the Split part - (_, PartInfo::Split(_)) => { + (_, Part::Split(_)) => { cursor.push(ch); } // if cursor is Name part, and found }, then end the Name part, start the next Split part - ('}', PartInfo::Name(_)) => { + ('}', Part::Name(_)) => { parts.push(cursor); - cursor = PartInfo::empty_split(); + cursor = Part::empty_split(); } - ('+', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('+', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::Append(None))?; } - ('/', PartInfo::Name(name)) if name.is_append_modifier_set() => { + ('/', Part::Name(name)) if name.is_append_modifier_set() => { let mut order = 0; let mut j = pos + 1; while j < chars.len() { @@ -360,16 +258,16 @@ impl std::str::FromStr for PatternInfo { name.try_append_order(order)?; pos = j - 1; // this will change the position to the last digit of the order } - ('?', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('?', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::NamedSkip)?; } - ('*', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('*', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapKey)?; } - ('&', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('&', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapVal)?; } - ('-', PartInfo::Name(name)) if !name.is_end_modifier_set() => { + ('-', Part::Name(name)) if !name.is_end_modifier_set() => { if let Some('>') = chars.get(pos + 1) { } else { return DissectInvalidPatternSnafu { @@ -391,7 +289,7 @@ impl std::str::FromStr for PatternInfo { name.try_end_modifier()?; pos += 1; // only skip '>', the next loop will skip '}' } - (_, PartInfo::Name(name)) if !is_valid_char(ch) => { + (_, Part::Name(name)) if !is_valid_char(ch) => { let tail: String = if name.is_name_empty() { format!("Invalid '{ch}'") } else { @@ -399,7 +297,7 @@ impl std::str::FromStr for PatternInfo { }; return 
DissectInvalidPatternSnafu { s, detail: tail }.fail(); } - (_, PartInfo::Name(_)) => { + (_, Part::Name(_)) => { cursor.push(ch); } } @@ -408,8 +306,8 @@ impl std::str::FromStr for PatternInfo { } match cursor { - PartInfo::Split(ref split) if !split.is_empty() => parts.push(cursor), - PartInfo::Name(name) if !name.is_empty() => { + Part::Split(ref split) if !split.is_empty() => parts.push(cursor), + Part::Name(name) if !name.is_empty() => { return DissectInvalidPatternSnafu { s, detail: format!("'{name}' is not closed"), @@ -425,7 +323,7 @@ impl std::str::FromStr for PatternInfo { } } -impl PatternInfo { +impl Pattern { fn check(&self) -> Result<()> { if self.len() == 0 { return DissectEmptyPatternSnafu.fail(); @@ -438,21 +336,21 @@ impl PatternInfo { let this_part = &self[i]; let next_part = self.get(i + 1); match (this_part, next_part) { - (PartInfo::Split(split), _) if split.is_empty() => { + (Part::Split(split), _) if split.is_empty() => { return DissectInvalidPatternSnafu { s: &self.origin, detail: "Empty split is not allowed", } .fail(); } - (PartInfo::Name(name1), Some(PartInfo::Name(name2))) => { + (Part::Name(name1), Some(Part::Name(name2))) => { return DissectInvalidPatternSnafu { s: &self.origin, detail: format!("consecutive names are not allowed: '{name1}' '{name2}'",), } .fail(); } - (PartInfo::Name(name), _) if name.is_name_empty() => { + (Part::Name(name), _) if name.is_name_empty() => { if let Some(ref m) = name.start_modifier { return DissectInvalidPatternSnafu { s: &self.origin, @@ -461,7 +359,7 @@ impl PatternInfo { .fail(); } } - (PartInfo::Name(name), _) => match name.start_modifier { + (Part::Name(name), _) => match name.start_modifier { Some(StartModifier::MapKey) => { if map_keys.contains(&name.name) { return DissectInvalidPatternSnafu { @@ -509,128 +407,128 @@ impl PatternInfo { } } -impl std::fmt::Display for PatternInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.origin) - } -} - -#[derive(Debug, Default)] -pub struct DissectProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - append_separator: Option, - output_keys: HashSet, -} - -impl DissectProcessorBuilder { - fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { - patterns - .iter() - .flat_map(|pattern| pattern.iter()) - .filter_map(|p| match p { - PartInfo::Name(name) => { - if !name.is_empty() - && (name.start_modifier.is_none() - || name - .start_modifier - .as_ref() - .is_some_and(|x| matches!(x, StartModifier::Append(_)))) - { - Some(name.to_string()) - } else { - None - } - } - _ => None, - }) - .collect() - } - - fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { - match part_info { - PartInfo::Split(s) => Ok(Part::Split(s)), - PartInfo::Name(n) => match n.start_modifier { - None | Some(StartModifier::Append(_)) => { - let index = find_key_index(intermediate_keys, &n.name, "dissect")?; - Ok(Part::Name(Name { - name: n.name, - index, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })) - } - _ => Ok(Part::Name(Name { - name: n.name, - index: usize::MAX, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })), - }, - } - } - - fn pattern_info_to_pattern( - pattern_info: PatternInfo, - intermediate_keys: &[String], - ) -> Result { - let original = pattern_info.origin; - let pattern = pattern_info - .parts - .into_iter() - .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) - .collect::>>()?; - Ok(Pattern { - origin: original, - 
parts: pattern, - }) - } - - fn build_patterns_from_pattern_infos( - patterns: Vec, - intermediate_keys: &[String], - ) -> Result> { - patterns - .into_iter() - .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) - .collect() - } -} - -impl ProcessorBuilder for DissectProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - - let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(real_field); - } - let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; - let processor = DissectProcessor { - fields: real_fields, - patterns, - ignore_missing: self.ignore_missing, - append_separator: self.append_separator, - }; - Ok(ProcessorKind::Dissect(processor)) - } -} +// impl std::fmt::Display for PatternInfo { +// fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +// write!(f, "{}", self.origin) +// } +// } + +// #[derive(Debug, Default)] +// pub struct DissectProcessorBuilder { +// fields: Fields, +// patterns: Vec, +// ignore_missing: bool, +// append_separator: Option, +// output_keys: HashSet, +// } + +// impl DissectProcessorBuilder { +// fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { +// patterns +// .iter() +// .flat_map(|pattern| pattern.iter()) +// .filter_map(|p| match p { +// PartInfo::Name(name) => { +// if !name.is_empty() +// && (name.start_modifier.is_none() +// || name +// .start_modifier +// .as_ref() +// .is_some_and(|x| matches!(x, StartModifier::Append(_)))) +// { +// Some(name.to_string()) +// } else { +// None +// } +// } +// _ => None, +// }) +// .collect() +// } + +// fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { +// match part_info { +// PartInfo::Split(s) => Ok(Part::Split(s)), +// PartInfo::Name(n) => match n.start_modifier { +// None | Some(StartModifier::Append(_)) => { +// let index = find_key_index(intermediate_keys, &n.name, "dissect")?; +// Ok(Part::Name(Name { +// name: n.name, +// index, +// start_modifier: n.start_modifier, +// end_modifier: n.end_modifier, +// })) +// } +// _ => Ok(Part::Name(Name { +// name: n.name, +// index: usize::MAX, +// start_modifier: n.start_modifier, +// end_modifier: n.end_modifier, +// })), +// }, +// } +// } + +// fn pattern_info_to_pattern( +// pattern_info: PatternInfo, +// intermediate_keys: &[String], +// ) -> Result { +// let original = pattern_info.origin; +// let pattern = pattern_info +// .parts +// .into_iter() +// .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) +// .collect::>>()?; +// Ok(Pattern { +// origin: original, +// parts: pattern, +// }) +// } + +// fn build_patterns_from_pattern_infos( +// patterns: Vec, +// intermediate_keys: &[String], +// ) -> Result> { +// patterns +// .into_iter() +// .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) +// .collect() +// } +// } + +// impl ProcessorBuilder for DissectProcessorBuilder { +// fn output_keys(&self) -> HashSet<&str> { +// self.output_keys.iter().map(|s| 
s.as_str()).collect() +// } + +// fn input_keys(&self) -> HashSet<&str> { +// self.fields.iter().map(|f| f.input_field()).collect() +// } + +// fn build(self, intermediate_keys: &[String]) -> Result { +// let mut real_fields = vec![]; +// for field in self.fields.into_iter() { +// let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; + +// let input_field_info = InputField::new(field.input_field(), input_index); + +// let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); +// real_fields.push(real_field); +// } +// let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; +// let processor = DissectProcessor { +// fields: real_fields, +// patterns, +// ignore_missing: self.ignore_missing, +// append_separator: self.append_separator, +// }; +// Ok(ProcessorKind::Dissect(processor)) +// } +// } #[derive(Debug, Default)] pub struct DissectProcessor { - fields: Vec, + fields: Fields, patterns: Vec, ignore_missing: bool, @@ -639,33 +537,37 @@ pub struct DissectProcessor { } impl DissectProcessor { - fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { + fn process_name_value<'a, 'b>( + name: &'a Name, + value: String, + appends: &'b mut HashMap<&'a String, Vec<(String, u32)>>, + map: &mut Vec<(&'a String, Value)>, + ) { + match name.start_modifier { + Some(StartModifier::NamedSkip) => { + // do nothing, ignore this match + } + Some(StartModifier::Append(order)) => { + appends + .entry(&name.name) + .or_default() + .push((value, order.unwrap_or_default())); + } + Some(_) => { + // do nothing, ignore MapKey and MapVal + // because transform can know the key name + } + None => { + map.push((&name.name, Value::String(value))); + } + } + } + + fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { let mut map = Vec::new(); let mut pos = 0; - let mut appends: HashMap> = HashMap::new(); - - let mut process_name_value = |name: &Name, value: String| { - let name_index = name.index; - match name.start_modifier { - Some(StartModifier::NamedSkip) => { - // do nothing, ignore this match - } - Some(StartModifier::Append(order)) => { - appends - .entry(name_index) - .or_default() - .push((value, order.unwrap_or_default())); - } - Some(_) => { - // do nothing, ignore MapKey and MapVal - // because transform can know the key name - } - None => { - map.push((name_index, Value::String(value))); - } - } - }; + let mut appends: HashMap<&String, Vec<(String, u32)>> = HashMap::new(); for i in 0..pattern.len() { let this_part = &pattern[i]; @@ -701,7 +603,7 @@ impl DissectProcessor { // if Name part is the last part, then the rest of the input is the value (Part::Name(name), None) => { let value = chs[pos..].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } // if Name part, and next part is Split, then find the matched value of the name @@ -717,7 +619,7 @@ impl DissectProcessor { if !name.is_name_empty() { let value = chs[pos..end].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } if name.is_end_modifier_set() { @@ -745,10 +647,10 @@ impl DissectProcessor { } } - Ok(map) + Ok(map.into_iter().map(|(k, v)| (k.to_string(), v)).collect()) } - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let chs = val.chars().collect::>(); for pattern in &self.patterns { @@ -760,7 +662,7 @@ impl DissectProcessor { } } -impl 
TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -782,7 +684,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { fields = yaml_new_fields(v, FIELDS_NAME)?; } PATTERN_NAME => { - let pattern: PatternInfo = yaml_parse_string(v, PATTERN_NAME)?; + let pattern: Pattern = yaml_parse_string(v, PATTERN_NAME)?; patterns = vec![pattern]; } PATTERNS_NAME => { @@ -797,13 +699,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { _ => {} } } - let output_keys = Self::build_output_keys(&patterns); - let builder = DissectProcessorBuilder { + // let output_keys = Self::build_output_keys(&patterns); + let builder = DissectProcessor { fields, patterns, ignore_missing, append_separator, - output_keys, }; Ok(builder) @@ -819,21 +720,21 @@ impl Processor for DissectProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(val_str)) => { let r = self.process(val_str)?; for (k, v) in r { - val[k] = v; + val.insert(k, v); } } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -859,26 +760,19 @@ fn is_valid_char(ch: char) -> bool { mod tests { use ahash::HashMap; - use super::{DissectProcessor, EndModifier, NameInfo, PartInfo, PatternInfo, StartModifier}; - use crate::etl::processor::dissect::DissectProcessorBuilder; + use super::{DissectProcessor, EndModifier, Name, Part, StartModifier}; + use crate::etl::processor::dissect::Pattern; use crate::etl::value::Value; fn assert(pattern_str: &str, input: &str, expected: HashMap) { let chs = input.chars().collect::>(); - let pattern_infos: Vec = vec![pattern_str.parse().unwrap()]; - let output_keys: Vec = DissectProcessorBuilder::build_output_keys(&pattern_infos) - .into_iter() - .collect(); - let pattern = - DissectProcessorBuilder::build_patterns_from_pattern_infos(pattern_infos, &output_keys) - .unwrap(); + let patterns: Vec = vec![pattern_str.parse().unwrap()]; let processor = DissectProcessor::default(); let result: HashMap = processor - .process_pattern(&chs, &pattern[0]) + .process_pattern(&chs, &patterns[0]) .unwrap() .into_iter() - .map(|(k, v)| (output_keys[k].to_string(), v)) .collect(); assert_eq!(result, expected, "pattern: {}", pattern_str); @@ -889,28 +783,28 @@ mod tests { let cases = [( "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}", vec![ - PartInfo::Name("clientip".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("ident".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("auth".into()), - PartInfo::Split(" [".into()), - PartInfo::Name("timestamp".into()), - PartInfo::Split("] \"".into()), - PartInfo::Name("verb".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("request".into()), - PartInfo::Split(" HTTP/".into()), - PartInfo::Name("httpversion".into()), - PartInfo::Split("\" ".into()), - PartInfo::Name("status".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("size".into()), + Part::Name("clientip".into()), + Part::Split(" ".into()), + Part::Name("ident".into()), + Part::Split(" ".into()), + 
Part::Name("auth".into()), + Part::Split(" [".into()), + Part::Name("timestamp".into()), + Part::Split("] \"".into()), + Part::Name("verb".into()), + Part::Split(" ".into()), + Part::Name("request".into()), + Part::Split(" HTTP/".into()), + Part::Name("httpversion".into()), + Part::Split("\" ".into()), + Part::Name("status".into()), + Part::Split(" ".into()), + Part::Name("size".into()), ], )]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -921,13 +815,13 @@ mod tests { ( "%{} %{}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, @@ -937,61 +831,61 @@ mod tests { ( "%{ts->} %{level}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split(" ".into()), - PartInfo::Name("level".into()), + Part::Split(" ".into()), + Part::Name("level".into()), ], ), ( "[%{ts}]%{->}[%{level}]", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), - PartInfo::Name(NameInfo { + Part::Split("]".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "%{+name} %{+name} %{+name} %{+name}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, @@ -1001,25 +895,25 @@ mod tests { ( "%{+name/2} %{+name/4} %{+name/3} %{+name/1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(2))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(4))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(3))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(1))), end_modifier: None, @@ -1029,67 +923,67 @@ mod tests { ( "%{clientip} %{?ident} %{?auth} 
[%{timestamp}]", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "clientip".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "ident".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "auth".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - PartInfo::Split(" [".into()), - PartInfo::Name(NameInfo { + Part::Split(" [".into()), + Part::Name(Name { name: "timestamp".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] [".into()), - PartInfo::Name(NameInfo { + Part::Split("] [".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] ".into()), - PartInfo::Name(NameInfo { + Part::Split("] ".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, @@ -1099,13 +993,13 @@ mod tests { ( "%{&p1}:%{*p1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, @@ -1115,7 +1009,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -1195,7 +1089,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let err = pattern.parse::().unwrap_err(); + let err = pattern.parse::().unwrap_err(); assert_eq!(err.to_string(), expected); } } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index a6ffa86d1689..e6988c773f52 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,21 +18,20 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; -use ahash::{HashSet, HashSetExt}; use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; 
-use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -83,113 +82,7 @@ impl std::str::FromStr for GroupRegex { } } -#[derive(Debug, Default)] -pub struct RegexProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - output_keys: HashSet, -} - -impl ProcessorBuilder for RegexProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|k| k.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Regex) - } -} - -impl RegexProcessorBuilder { - fn check(self) -> Result { - if self.fields.is_empty() { - return RegexNoValidFieldSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - if self.patterns.is_empty() { - return RegexNoValidPatternSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - Ok(self) - } - - fn build_group_output_info( - group_regex: &GroupRegex, - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result> { - group_regex - .groups - .iter() - .map(|g| { - let key = generate_key(om_field.target_prefix(), g); - let index = find_key_index(intermediate_keys, &key, "regex"); - index.map(|index| OutPutInfo { - final_key: key, - group_name: g.to_string(), - index, - }) - }) - .collect::>>() - } - - fn build_group_output_infos( - patterns: &[GroupRegex], - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result>> { - patterns - .iter() - .map(|group_regex| { - Self::build_group_output_info(group_regex, om_field, intermediate_keys) - }) - .collect::>>() - } - - fn build_output_info( - real_fields: &[OneInputMultiOutputField], - patterns: &[GroupRegex], - intermediate_keys: &[String], - ) -> Result { - let inner = real_fields - .iter() - .map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys)) - .collect::>>(); - inner.map(|inner| RegexProcessorOutputInfo { inner }) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; - let input_field_info = InputField::new(field.input_field(), input_index); - - let input = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(input); - } - let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?; - Ok(RegexProcessor { - // fields: Fields::one(Field::new("test".to_string())), - fields: real_fields, - patterns: self.patterns, - output_info, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,61 +119,44 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { } } - let pattern_output_keys = patterns - .iter() - .flat_map(|pattern| pattern.groups.iter()) - .collect::>(); - let mut 
output_keys = HashSet::new(); - for field in fields.iter() { - for x in pattern_output_keys.iter() { - output_keys.insert(generate_key(field.target_or_input_field(), x)); - } - } - - let processor_builder = RegexProcessorBuilder { + let processor_builder = RegexProcessor { fields, patterns, ignore_missing, - output_keys, }; processor_builder.check() } } -#[derive(Debug, Default)] -struct OutPutInfo { - final_key: String, - group_name: String, - index: usize, -} - -#[derive(Debug, Default)] -struct RegexProcessorOutputInfo { - pub inner: Vec>>, -} - -impl RegexProcessorOutputInfo { - fn get_output_index( - &self, - field_index: usize, - pattern_index: usize, - group_index: usize, - ) -> usize { - self.inner[field_index][pattern_index][group_index].index - } -} /// only support string value /// if no value found from a pattern, the target_field will be ignored #[derive(Debug, Default)] pub struct RegexProcessor { - fields: Vec, - output_info: RegexProcessorOutputInfo, + fields: Fields, patterns: Vec, ignore_missing: bool, } impl RegexProcessor { + fn check(self) -> Result { + if self.fields.is_empty() { + return RegexNoValidFieldSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + if self.patterns.is_empty() { + return RegexNoValidPatternSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + Ok(self) + } + fn try_with_patterns(&mut self, patterns: Vec) -> Result<()> { let mut rs = vec![]; for pattern in patterns { @@ -291,21 +167,13 @@ impl RegexProcessor { Ok(()) } - fn process( - &self, - val: &str, - gr: &GroupRegex, - index: (usize, usize), - ) -> Result> { + fn process<'a>(&self, val: &str, gr: &'a GroupRegex) -> Result> { let mut result = Vec::new(); if let Some(captures) = gr.regex.captures(val) { - for (group_index, group) in gr.groups.iter().enumerate() { + for group in gr.groups.iter() { if let Some(capture) = captures.name(group) { let value = capture.as_str().to_string(); - let index = self - .output_info - .get_output_index(index.0, index.1, group_index); - result.push((index, Value::String(value))); + result.push((group, Value::String(value))); } } } @@ -322,9 +190,9 @@ impl Processor for RegexProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let index = field.input_index(); + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { + for field in self.fields.iter() { + let index = field.input_field(); let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { @@ -336,8 +204,8 @@ impl Processor for RegexProcessor { // val[output_index] = result; // } // } - for (gr_index, gr) in self.patterns.iter().enumerate() { - let result = self.process(s.as_str(), gr, (field_index, gr_index))?; + for gr in self.patterns.iter() { + let result = self.process(s.as_str(), gr)?; if !result.is_empty() { match result_list.as_mut() { None => { @@ -354,7 +222,7 @@ impl Processor for RegexProcessor { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -372,7 +240,7 @@ impl Processor for RegexProcessor { None => {} Some(result_list) => { for (output_index, result) in result_list { - val[output_index] = result; + val.insert(generate_key(index, output_index), result); } } } @@ -388,7 +256,7 @@ mod tests { use ahash::{HashMap, HashMapExt}; use itertools::Itertools; - use crate::etl::processor::regex::RegexProcessorBuilder; + use 
crate::etl::processor::regex::RegexProcessor; use crate::etl::value::{Map, Value}; #[test] @@ -402,23 +270,21 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = ["a".to_string(), "a_ar".to_string()]; - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); // single field (with prefix), multiple patterns let result = processor - .process("123", &processor.patterns[0], (0, 0)) + .process("123", &processor.patterns[0]) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (k.to_string(), v)) .collect(); let map = Map { values: result }; let v = Map { - values: vec![("a_ar".to_string(), Value::String("1".to_string()))] + values: vec![("ar".to_string(), Value::String("1".to_string()))] .into_iter() .collect(), }; @@ -464,30 +330,23 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = [ - "breadcrumbs", - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); let mut result = BTreeMap::new(); - for (index, pattern) in processor.patterns.iter().enumerate() { + for pattern in processor.patterns.iter() { let r = processor - .process(&breadcrumbs_str, pattern, (0, index)) + .process(&breadcrumbs_str, pattern) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (k.to_string(), v)) .collect::>(); result.extend(r); } - let map = Map { values: result }; + let map = Map { + values: result + .into_iter() + .map(|(k, v)| (format!("breadcrumbs_{}", k), v)) + .collect(), + }; assert_eq!(temporary_map, map); } @@ -515,67 +374,21 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - - let intermediate_keys = [ - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - "edge_ip", - "edge_request_id", - "edge_request_end_time", - "edge_turn_around_time", - "edge_dns_lookup_time", - "edge_geo", - "edge_asn", - "origin_ip", - "origin_request_id", - "origin_request_end_time", - "origin_turn_around_time", - "origin_dns_lookup_time", - "origin_geo", - "origin_asn", - "peer_ip", - "peer_request_id", - "peer_request_end_time", - "peer_turn_around_time", - "peer_dns_lookup_time", - "peer_geo", - "peer_asn", - "parent_ip", - "parent_request_id", - "parent_request_end_time", - "parent_turn_around_time", - "parent_dns_lookup_time", - "parent_geo", - "parent_asn", - "wrapper_ip", - "wrapper_request_id", - "wrapper_request_end_time", - "wrapper_turn_around_time", - "wrapper_dns_lookup_time", - "wrapper_geo", - "wrapper_asn", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); let mut result = HashMap::new(); - for (field_index, field) in processor.fields.iter().enumerate() { - for 
(pattern_index, pattern) in processor.patterns.iter().enumerate() { + for field in processor.fields.iter() { + for pattern in processor.patterns.iter() { let s = temporary_map - .get(field.input_name()) + .get(field.input_field()) .unwrap() .to_str_value(); + let prefix = field.target_or_input_field(); let r = processor - .process(&s, pattern, (field_index, pattern_index)) + .process(&s, pattern) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (format!("{}_{}", prefix, k), v)) .collect::>(); result.extend(r); } diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index a8a7daaf5c6f..93b0897db0eb 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -93,6 +93,8 @@ transform: assert_eq!(output.schema, *EXPECTED_SCHEMA); + println!("{:?}", output.rows); + assert_eq!( output.rows[0].values[0].value_data, Some(StringValue("123".to_string())) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 978891078cce..d6d8e89a56ea 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -573,7 +573,7 @@ fn extract_pipeline_value_by_content_type( ct if ct == *TEXT_CONTENT_TYPE || ct == *TEXT_UTF8_CONTENT_TYPE => payload .lines() .filter(|line| !line.is_empty()) - .map(|line| Value::String(line.to_string())) + .map(|line| json!({"message": line})) .collect(), _ => UnsupportedContentTypeSnafu { content_type }.fail()?, }) diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index d5880b7eae08..4321e2a9d950 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1663,7 +1663,7 @@ pub async fn test_plain_text_ingestion(store_type: StorageType) { processors: - dissect: fields: - - line + - message patterns: - "%{+ts} %{+ts} %{content}" - date: From 448f94dfc99d5aaf5e2f31c3854e48d502601b55 Mon Sep 17 00:00:00 2001 From: paomian Date: Mon, 27 Jan 2025 11:51:46 +0800 Subject: [PATCH 29/32] chore: fix test --- src/pipeline/src/etl/processor.rs | 2 +- src/pipeline/src/etl/processor/cmcd.rs | 2 +- src/pipeline/src/etl/processor/dissect.rs | 4 ++-- src/pipeline/src/etl/processor/regex.rs | 5 +++-- src/pipeline/src/etl/transform.rs | 4 +--- src/pipeline/tests/regex.rs | 2 -- 6 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 376282afecef..005feca3794e 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -91,7 +91,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { pub enum ProcessorKind { Cmcd(CmcdProcessor), Csv(CsvProcessor), - // Dissect(DissectProcessor), + Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 8d8b546f7216..a5da69d0be42 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -257,7 +257,7 @@ impl Processor for CmcdProcessor { match val.get(name) { Some(Value::String(s)) => { - let results = self.parse(name, s)?; + let results = self.parse(field.target_or_input_field(), s)?; val.extend(results); } Some(Value::Null) | None => { diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 5755a0aeb8a5..b35884b82671 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -537,10 +537,10 @@ pub struct 
DissectProcessor { } impl DissectProcessor { - fn process_name_value<'a, 'b>( + fn process_name_value<'a>( name: &'a Name, value: String, - appends: &'b mut HashMap<&'a String, Vec<(String, u32)>>, + appends: &mut HashMap<&'a String, Vec<(String, u32)>>, map: &mut Vec<(&'a String, Value)>, ) { match name.start_modifier { diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index e6988c773f52..fad905479a83 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -193,6 +193,7 @@ impl Processor for RegexProcessor { fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { let index = field.input_field(); + let prefix = field.target_or_input_field(); let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { @@ -239,8 +240,8 @@ impl Processor for RegexProcessor { match result_list { None => {} Some(result_list) => { - for (output_index, result) in result_list { - val.insert(generate_key(index, output_index), result); + for (output_key, result) in result_list { + val.insert(generate_key(prefix, output_key), result); } } } diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index 7191d272069c..e3039d6c7ac4 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -128,9 +128,7 @@ impl TryFrom<&Vec> for Transforms { all_required_keys.sort(); - Ok(Transforms { - transforms: transforms, - }) + Ok(Transforms { transforms }) } } diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index 93b0897db0eb..a8a7daaf5c6f 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -93,8 +93,6 @@ transform: assert_eq!(output.schema, *EXPECTED_SCHEMA); - println!("{:?}", output.rows); - assert_eq!( output.rows[0].values[0].value_data, Some(StringValue("123".to_string())) From b5d2969be831eb66c75a53830e7d1a4330ec1a69 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 27 Jan 2025 16:56:21 +0800 Subject: [PATCH 30/32] test: add integration test for http pipeline --- tests-integration/tests/http.rs | 194 +++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 1 deletion(-) diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 4321e2a9d950..413a656e6004 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -93,6 +93,7 @@ macro_rules! 
http_tests { test_plain_text_ingestion, test_identify_pipeline, test_identify_pipeline_with_flatten, + test_pipeline_dispatcher, test_otlp_metrics, test_otlp_traces, @@ -1359,6 +1360,197 @@ pub async fn test_identify_pipeline(store_type: StorageType) { guard.remove_all().await; } +pub async fn test_pipeline_dispatcher(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_pipeline_dispatcher").await; + + // handshake + let client = TestClient::new(app).await; + + let root_pipeline = r#" +processors: + - date: + field: time + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + ignore_missing: true + +dispatcher: + field: type + rules: + - value: http + table_part: http + pipeline: http + - value: db + table_part: db + - value: not_found + table_part: not_found + pipeline: not_found + +transform: + - fields: + - id1, id1_root + - id2, id2_root + type: int32 + - fields: + - type + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + let http_pipeline = r#" +processors: + +transform: + - fields: + - id1, id1_http + - id2, id2_http + type: int32 + - fields: + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + // 1. create pipeline + let res = client + .post("/v1/events/pipelines/root") + .header("Content-Type", "application/x-yaml") + .body(root_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + let res = client + .post("/v1/events/pipelines/http") + .header("Content-Type", "application/x-yaml") + .body(http_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + // 2. write data + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "http", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "db", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "api", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "not_found", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + + // 3. 
verify data + let expected = "[[2436]]"; + validate_data( + "test_dispatcher_pipeline default table", + &client, + "select id1_root from logs1", + expected, + ) + .await; + + let expected = "[[2436]]"; + validate_data( + "test_dispatcher_pipeline http table", + &client, + "select id1_http from logs1_http", + expected, + ) + .await; + + let expected = "[[\"2436\"]]"; + validate_data( + "test_dispatcher_pipeline db table", + &client, + "select id1 from logs1_db", + expected, + ) + .await; + + guard.remove_all().await; +} + pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = @@ -2248,7 +2440,7 @@ async fn validate_data(test_name: &str, client: &TestClient, sql: &str, expected .get(format!("/v1/sql?sql={sql}").as_str()) .send() .await; - assert_eq!(res.status(), StatusCode::OK); + assert_eq!(res.status(), StatusCode::OK, "validate {test_name} fail"); let resp = res.text().await; let v = get_rows_from_output(&resp); From fc4b3f1f9801d262a483a6b8eb40bf110698f3f4 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 28 Jan 2025 17:56:41 +0800 Subject: [PATCH 31/32] refactor: improve regex pipeline --- src/pipeline/src/etl/processor/dissect.rs | 119 ---------------------- src/pipeline/src/etl/processor/regex.rs | 107 ++++++------------- 2 files changed, 29 insertions(+), 197 deletions(-) diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index b35884b82671..9ac28f7bf09e 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -407,125 +407,6 @@ impl Pattern { } } -// impl std::fmt::Display for PatternInfo { -// fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { -// write!(f, "{}", self.origin) -// } -// } - -// #[derive(Debug, Default)] -// pub struct DissectProcessorBuilder { -// fields: Fields, -// patterns: Vec, -// ignore_missing: bool, -// append_separator: Option, -// output_keys: HashSet, -// } - -// impl DissectProcessorBuilder { -// fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { -// patterns -// .iter() -// .flat_map(|pattern| pattern.iter()) -// .filter_map(|p| match p { -// PartInfo::Name(name) => { -// if !name.is_empty() -// && (name.start_modifier.is_none() -// || name -// .start_modifier -// .as_ref() -// .is_some_and(|x| matches!(x, StartModifier::Append(_)))) -// { -// Some(name.to_string()) -// } else { -// None -// } -// } -// _ => None, -// }) -// .collect() -// } - -// fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { -// match part_info { -// PartInfo::Split(s) => Ok(Part::Split(s)), -// PartInfo::Name(n) => match n.start_modifier { -// None | Some(StartModifier::Append(_)) => { -// let index = find_key_index(intermediate_keys, &n.name, "dissect")?; -// Ok(Part::Name(Name { -// name: n.name, -// index, -// start_modifier: n.start_modifier, -// end_modifier: n.end_modifier, -// })) -// } -// _ => Ok(Part::Name(Name { -// name: n.name, -// index: usize::MAX, -// start_modifier: n.start_modifier, -// end_modifier: n.end_modifier, -// })), -// }, -// } -// } - -// fn pattern_info_to_pattern( -// pattern_info: PatternInfo, -// intermediate_keys: &[String], -// ) -> Result { -// let original = pattern_info.origin; -// let pattern = pattern_info -// .parts -// .into_iter() -// .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) -// .collect::>>()?; -// Ok(Pattern { -// origin: original, -// parts: pattern, -// }) -// } - 
-// fn build_patterns_from_pattern_infos( -// patterns: Vec, -// intermediate_keys: &[String], -// ) -> Result> { -// patterns -// .into_iter() -// .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) -// .collect() -// } -// } - -// impl ProcessorBuilder for DissectProcessorBuilder { -// fn output_keys(&self) -> HashSet<&str> { -// self.output_keys.iter().map(|s| s.as_str()).collect() -// } - -// fn input_keys(&self) -> HashSet<&str> { -// self.fields.iter().map(|f| f.input_field()).collect() -// } - -// fn build(self, intermediate_keys: &[String]) -> Result { -// let mut real_fields = vec![]; -// for field in self.fields.into_iter() { -// let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; - -// let input_field_info = InputField::new(field.input_field(), input_index); - -// let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); -// real_fields.push(real_field); -// } -// let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; -// let processor = DissectProcessor { -// fields: real_fields, -// patterns, -// ignore_missing: self.ignore_missing, -// append_separator: self.append_separator, -// }; -// Ok(ProcessorKind::Dissect(processor)) -// } -// } - #[derive(Debug, Default)] pub struct DissectProcessor { fields: Fields, diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index fad905479a83..27f30f65d9ae 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,6 +18,8 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; +use std::collections::BTreeMap; + use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; @@ -167,13 +169,15 @@ impl RegexProcessor { Ok(()) } - fn process<'a>(&self, val: &str, gr: &'a GroupRegex) -> Result> { - let mut result = Vec::new(); - if let Some(captures) = gr.regex.captures(val) { - for group in gr.groups.iter() { - if let Some(capture) = captures.name(group) { - let value = capture.as_str().to_string(); - result.push((group, Value::String(value))); + fn process(&self, prefix: &str, val: &str) -> Result> { + let mut result = BTreeMap::new(); + for gr in self.patterns.iter() { + if let Some(captures) = gr.regex.captures(val) { + for group in gr.groups.iter() { + if let Some(capture) = captures.name(group) { + let value = capture.as_str().to_string(); + result.insert(generate_key(prefix, group), Value::String(value)); + } } } } @@ -194,30 +198,10 @@ impl Processor for RegexProcessor { for field in self.fields.iter() { let index = field.input_field(); let prefix = field.target_or_input_field(); - let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { - // we get rust borrow checker error here - // for (gr_index, gr) in self.patterns.iter().enumerate() { - // let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?; - // for (output_index, result) in result_list { - //cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here - // val[output_index] = result; - // } - // } - for gr in self.patterns.iter() { - let result = self.process(s.as_str(), gr)?; - if !result.is_empty() { - match result_list.as_mut() { - None => { - result_list = Some(result); - } - Some(result_list) => { - result_list.extend(result); - } - } - } - } + let result = self.process(prefix, s)?; + val.extend(result); } 
Some(Value::Null) | None => { if !self.ignore_missing { @@ -236,15 +220,6 @@ impl Processor for RegexProcessor { .fail(); } } - // safety here - match result_list { - None => {} - Some(result_list) => { - for (output_key, result) in result_list { - val.insert(generate_key(prefix, output_key), result); - } - } - } } Ok(()) @@ -275,17 +250,12 @@ ignore_missing: false"#; // single field (with prefix), multiple patterns - let result = processor - .process("123", &processor.patterns[0]) - .unwrap() - .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect(); + let result = processor.process("a", "123").unwrap(); let map = Map { values: result }; let v = Map { - values: vec![("ar".to_string(), Value::String("1".to_string()))] + values: vec![("a_ar".to_string(), Value::String("1".to_string()))] .into_iter() .collect(), }; @@ -302,7 +272,7 @@ ignore_missing: false"#; let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(","); - let values = [ + let temporary_map: BTreeMap = [ ("breadcrumbs_parent", Value::String(cc.to_string())), ("breadcrumbs_edge", Value::String(cg.to_string())), ("breadcrumbs_origin", Value::String(co.to_string())), @@ -312,7 +282,6 @@ ignore_missing: false"#; .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect(); - let temporary_map = Map { values }; { // single field (with prefix), multiple patterns @@ -332,23 +301,10 @@ ignore_missing: false"#; .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); - let mut result = BTreeMap::new(); - for pattern in processor.patterns.iter() { - let r = processor - .process(&breadcrumbs_str, pattern) - .unwrap() - .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect::>(); - result.extend(r); - } - let map = Map { - values: result - .into_iter() - .map(|(k, v)| (format!("breadcrumbs_{}", k), v)) - .collect(), - }; - assert_eq!(temporary_map, map); + + let result = processor.process("breadcrumbs", &breadcrumbs_str).unwrap(); + + assert_eq!(temporary_map, result); } { @@ -379,20 +335,15 @@ ignore_missing: false"#; let mut result = HashMap::new(); for field in processor.fields.iter() { - for pattern in processor.patterns.iter() { - let s = temporary_map - .get(field.input_field()) - .unwrap() - .to_str_value(); - let prefix = field.target_or_input_field(); - let r = processor - .process(&s, pattern) - .unwrap() - .into_iter() - .map(|(k, v)| (format!("{}_{}", prefix, k), v)) - .collect::>(); - result.extend(r); - } + let s = temporary_map + .get(field.input_field()) + .unwrap() + .to_str_value(); + let prefix = field.target_or_input_field(); + + let r = processor.process(prefix, &s).unwrap(); + + result.extend(r); } let new_values = vec![ From 592f2f452ba76e552d562b9ec3cd63c052191588 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 28 Jan 2025 23:35:16 +0800 Subject: [PATCH 32/32] refactor: improve required field check --- src/pipeline/src/etl/processor/gsub.rs | 44 ++++++--------------- src/pipeline/src/etl/processor/join.rs | 22 +++-------- src/pipeline/src/etl/processor/json_path.rs | 32 +++++++-------- 3 files changed, 33 insertions(+), 65 deletions(-) diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index dbdb9c5c3047..7f0f601f44f3 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -32,35 +32,17 @@ pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; /// A 
processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value -#[derive(Debug, Default)] +#[derive(Debug)] pub struct GsubProcessor { fields: Fields, - pattern: Option<Regex>, - replacement: Option<String>, + pattern: Regex, + replacement: String, ignore_missing: bool, } impl GsubProcessor { - fn check(self) -> Result<Self> { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - fn process_string(&self, val: &str) -> Result<Value> { - let replacement = self.replacement.as_ref().unwrap(); - let new_val = self - .pattern - .as_ref() - .unwrap() - .replace_all(val, replacement) - .to_string(); + fn process_string(&self, val: &str) -> Result<Value> { + let new_val = self.pattern.replace_all(val, &self.replacement).to_string(); let val = Value::String(new_val); Ok(val) @@ -118,14 +100,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { } } - let builder = GsubProcessor { + Ok(GsubProcessor { fields, - pattern, - replacement, + pattern: pattern.context(GsubPatternRequiredSnafu)?, + replacement: replacement.context(GsubReplacementRequiredSnafu)?, ignore_missing, - }; - - builder.check() + }) } } @@ -164,15 +144,17 @@ impl crate::etl::processor::Processor for GsubProcessor { #[cfg(test)] mod tests { + use super::*; use crate::etl::processor::gsub::GsubProcessor; use crate::etl::value::Value; #[test] fn test_string_value() { let processor = GsubProcessor { - pattern: Some(regex::Regex::new(r"\d+").unwrap()), - replacement: Some("xxx".to_string()), - ..Default::default() + fields: Fields::default(), + pattern: regex::Regex::new(r"\d+").unwrap(), + replacement: "xxx".to_string(), + ignore_missing: false, }; let val = Value::String("123".to_string()); diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index 6913a5428873..72fafdbf7dd1 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -32,29 +32,20 @@ pub(crate) const PROCESSOR_JOIN: &str = "join"; #[derive(Debug, Default)] pub struct JoinProcessor { fields: Fields, - separator: Option<String>, + separator: String, ignore_missing: bool, } impl JoinProcessor { fn process(&self, arr: &Array) -> Result<Value> { - let sep = self.separator.as_ref().unwrap(); let val = arr .iter() .map(|v| v.to_str_value()) .collect::<Vec<String>>() - .join(sep); + .join(&self.separator); Ok(Value::String(val)) } - - fn check(self) -> Result<Self> { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } } impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { @@ -87,12 +78,11 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { } } - let builder = JoinProcessor { + Ok(JoinProcessor { fields, - separator, + separator: separator.context(JoinSeparatorRequiredSnafu)?, ignore_missing, - }; - builder.check() + }) } } @@ -146,7 +136,7 @@ mod tests { #[test] fn test_join_processor() { let processor = JoinProcessor { - separator: Some("-".to_string()), + separator: "-".to_string(), ..Default::default() }; diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index c7b4210e83f1..92916263e4e9 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -67,22 +67,18 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { _ => {} } } - if let Some(json_path) = json_path { - let processor = JsonPathProcessor { - fields, - json_path, - ignore_missing, -
result_idex, - }; - - Ok(processor) - } else { - ProcessorMissingFieldSnafu { + + let processor = JsonPathProcessor { + fields, + json_path: json_path.context(ProcessorMissingFieldSnafu { processor: PROCESSOR_JSON_PATH, field: JSON_PATH_NAME, - } - .fail() - } + })?, + ignore_missing, + result_index: result_idex, + }; + + Ok(processor) } } @@ -91,7 +87,7 @@ pub struct JsonPathProcessor { fields: Fields, json_path: JsonPath, ignore_missing: bool, - result_idex: Option<usize>, + result_index: Option<usize>, } impl Default for JsonPathProcessor { @@ -100,7 +96,7 @@ impl Default for JsonPathProcessor { fields: Fields::default(), json_path: JsonPath::try_from("$").unwrap(), ignore_missing: false, - result_idex: None, + result_index: None, } } } @@ -110,7 +106,7 @@ impl JsonPathProcessor { let processed = self.json_path.find(val); match processed { Value::Array(arr) => { - if let Some(index) = self.result_idex { + if let Some(index) = self.result_index { Ok(arr.get(index).cloned().unwrap_or(Value::Null)) } else { Ok(Value::Array(arr)) @@ -166,15 +162,21 @@ mod test { let json_path = JsonPath::try_from("$.hello").unwrap(); let processor = JsonPathProcessor { json_path, - result_idex: Some(0), + result_index: Some(0), ..Default::default() };
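// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of the patch series above):
// the "improve required field check" commit replaces a late `check()` pass on
// `Option` fields with an early `snafu::OptionExt::context(...)` call while
// the processor is built from YAML, so a missing required key fails at parse
// time and no `Option` survives into the struct. The minimal, self-contained
// example below shows that pattern; `SeparatorRequiredError`,
// `SeparatorRequiredSnafu` and `JoinConfig` are hypothetical names used only
// for this sketch and are not identifiers from the GreptimeDB source tree.
use snafu::{OptionExt, Snafu};

#[derive(Debug, Snafu)]
#[snafu(display("separator is required for the join processor"))]
pub struct SeparatorRequiredError;

#[derive(Debug)]
pub struct JoinConfig {
    separator: String,
}

impl JoinConfig {
    // `context` converts `None` into the typed error right here, so the
    // constructed value always holds a concrete `String` and no separate
    // `check()` step is needed afterwards.
    pub fn from_parsed(separator: Option<String>) -> Result<Self, SeparatorRequiredError> {
        Ok(JoinConfig {
            separator: separator.context(SeparatorRequiredSnafu)?,
        })
    }
}

fn main() {
    // Missing key: construction fails immediately with the typed error.
    assert!(JoinConfig::from_parsed(None).is_err());
    // Present key: the field is stored directly, no Option left in the struct.
    assert_eq!(JoinConfig::from_parsed(Some(",".into())).unwrap().separator, ",");
}
// ---------------------------------------------------------------------------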