From 0e4574452ce8b4c0a90b708ee49480956096367e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 14 Jan 2025 22:34:26 +0800 Subject: [PATCH 01/32] fmt: correct format --- src/pipeline/src/dispatcher.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index f16fd7e57fb2..f54531e802e3 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -15,7 +15,6 @@ use snafu::OptionExt; use yaml_rust::Yaml; -use crate::etl::error::{Error, Result}; use crate::etl_error::{ FieldRequiredForDispatcherSnafu, TablePartRequiredForDispatcherRuleSnafu, ValueRequiredForDispatcherRuleSnafu, From aa63b875873a9b0233db995ff70232abe5260b95 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 15 Jan 2025 10:40:11 +0800 Subject: [PATCH 02/32] test: add negative tests --- src/pipeline/src/dispatcher.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index f54531e802e3..7c4207f6b5c1 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -83,6 +83,7 @@ impl TryFrom<&Yaml> for Dispatcher { .as_str() .map(|s| s.to_string()) .context(TablePartRequiredForDispatcherRuleSnafu)?; + let pipeline = rule[PIPELINE].as_str().map(|s| s.to_string()); if rule[VALUE].is_badvalue() { From 6c226b52b7e3e9536d4bc12b7af3cbb6f4daa931 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 16 Jan 2025 19:17:57 +0800 Subject: [PATCH 03/32] feat: Add pipeline dispatching and execution output handling --- src/pipeline/src/dispatcher.rs | 24 ++++ src/pipeline/src/etl.rs | 74 +++++++++++-- src/pipeline/src/lib.rs | 5 +- src/servers/src/http/event.rs | 194 ++++++++++++++++++++++++--------- 4 files changed, 236 insertions(+), 61 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index 7c4207f6b5c1..a2a1e9fa1425 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
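For reference while reading the `Dispatcher::exec` implementation added below: a dispatcher is declared as a top-level `dispatcher` section of the pipeline YAML, naming the intermediate field to inspect plus a list of rules, each carrying a `value` to match against, a `table_part` used to derive the target table name, and an optional follow-up `pipeline`. A minimal parsing sketch, modelled on the `test_dispatch` case added later in this series (the `logger`, `http` and `access_log_pipeline` names are illustrative only):

use pipeline::{parse, Content, GreptimeTransformer, Pipeline};

fn build_dispatching_pipeline() -> Pipeline<GreptimeTransformer> {
    // Rows whose `logger` field equals "http" are dispatched to a table
    // suffixed with "_http" and handed to `access_log_pipeline`; all other
    // rows fall through to this pipeline's own `transform` section.
    let pipeline_yaml = r#"
processors:
  - dissect:
      fields:
        - line
      patterns:
        - "%{+ts} %{+ts} [%{logger}] %{content}"
  - date:
      fields:
        - ts
      formats:
        - "%Y-%m-%d %H:%M:%S%.3f"

dispatcher:
  field: logger
  rules:
    - value: http
      table_part: http
      pipeline: access_log_pipeline

transform:
  - fields:
      - content
    type: string
  - field: ts
    type: time
    index: timestamp
"#;
    let yaml_content = Content::Yaml(pipeline_yaml);
    parse(&yaml_content).expect("dispatcher pipeline yaml should parse")
}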
+use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; @@ -105,3 +106,26 @@ impl TryFrom<&Yaml> for Dispatcher { Ok(Dispatcher { field, rules }) } } + +impl Dispatcher { + /// execute dispatcher and returns matched rule if any + pub(crate) fn exec(&self, keys: &Vec, val: &Vec) -> Option<&Rule> { + if let Some(index) = keys.iter().position(|key| key == &self.field) { + if let Some(value) = val.get(index) { + for rule in &self.rules { + if rule.value == *value { + return Some(rule); + } + } + + None + } else { + debug!("value at index {} is not found in {:?}", &index, val); + None + } + } else { + debug!("field {} not found in keys {:?}", &self.field, keys); + None + } + } +} diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index d55cf25d543d..a12b9d7b0478 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -30,7 +30,7 @@ use transform::{TransformBuilders, Transformer, Transforms}; use value::Value; use yaml_rust::YamlLoader; -use crate::dispatcher::Dispatcher; +use crate::dispatcher::{Dispatcher, Rule}; use crate::etl::error::Result; const DESCRIPTION: &str = "description"; @@ -192,16 +192,60 @@ where // pub on_failure: processor::Processors, } +/// Where the pipeline executed is dispatched to, with context information +#[derive(Debug, Hash, PartialEq, Eq, Clone, PartialOrd, Ord)] +pub struct DispatchedTo { + pub table_part: String, + pub pipeline: Option, +} + +impl From<&Rule> for DispatchedTo { + fn from(value: &Rule) -> Self { + DispatchedTo { + table_part: value.table_part.clone(), + pipeline: value.pipeline.clone(), + } + } +} + +/// The result of pipeline execution +#[derive(Debug)] +pub enum PipelineExecOutput { + Transformed(O), + DispatchedTo(DispatchedTo), +} + +impl PipelineExecOutput { + pub(crate) fn into_transformed(self) -> Option { + if let Self::Transformed(o) = self { + Some(o) + } else { + None + } + } +} + impl Pipeline where T: Transformer, { - pub fn exec_mut(&self, val: &mut Vec) -> Result { + pub fn exec_mut(&self, val: &mut Vec) -> Result> { for processor in self.processors.iter() { processor.exec_mut(val)?; } - self.transformer.transform_mut(val) + let matched_rule = self + .dispatcher + .as_ref() + .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); + + match matched_rule { + None => self + .transformer + .transform_mut(val) + .map(PipelineExecOutput::Transformed), + Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + } } pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { @@ -379,7 +423,11 @@ transform: payload, vec![Value::String("1,2".to_string()), Value::Null, Value::Null] ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); @@ -428,7 +476,11 @@ transform: pipeline .prepare(serde_json::Value::String(message), &mut payload) .unwrap(); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); let sechema = pipeline.schemas(); assert_eq!(sechema.len(), result.values.len()); @@ -507,7 +559,11 @@ transform: payload, vec![Value::String("1,2".to_string()), Value::Null, Value::Null] ); - let result = pipeline.exec_mut(&mut payload).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + 
.into_transformed() + .unwrap(); assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); match &result.values[2].value_data { @@ -547,7 +603,11 @@ transform: let schema = pipeline.schemas().clone(); let mut result = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut result).unwrap(); - let row = pipeline.exec_mut(&mut result).unwrap(); + let row = pipeline + .exec_mut(&mut result) + .unwrap() + .into_transformed() + .unwrap(); let output = Rows { schema, rows: vec![row], diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 49ecea41c449..1cf5589b47f3 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -23,7 +23,10 @@ pub use etl::transform::transformer::greptime::SchemaInfo; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; -pub use etl::{error as etl_error, parse, Content, Pipeline, PipelineWay, SelectInfo}; +pub use etl::{ + error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineExecOutput, PipelineWay, + SelectInfo, +}; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, PipelineVersion, diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 17fc56f56135..685d67fc8f1a 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; use std::result::Result as StdResult; use std::str::FromStr; use std::sync::Arc; @@ -32,7 +33,7 @@ use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{GreptimeTransformer, PipelineVersion}; +use pipeline::{DispatchedTo, GreptimeTransformer, PipelineExecOutput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -551,72 +552,159 @@ fn extract_pipeline_value_by_content_type( }) } -pub(crate) async fn ingest_logs_inner( - state: PipelineHandlerRef, - pipeline_name: String, +enum PipelineInputValue { + // multiple row values as a value object + Original(Vec), + // 2-dimension row values by column + Intermediate(Vec>), +} + +async fn run_pipeline( + state: &PipelineHandlerRef, + pipeline_name: &str, version: PipelineVersion, - log_ingest_requests: Vec, - query_ctx: QueryContextRef, -) -> Result { - let db = query_ctx.get_db_string(); - let exec_timer = std::time::Instant::now(); + value: PipelineInputValue, + table_name: String, + query_ctx: &QueryContextRef, + db: &str, + is_top_level: bool, +) -> Result> { + if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + let table = state + .get_table(&table, &query_ctx) + .await + .context(CatalogSnafu)?; + pipeline::identity_pipeline(request.values, table) + .map(|rows| { + vec![RowInsertRequest { + rows: Some(rows), + table_name: table_name, + }] + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu) + } else { + let pipeline = state + .get_pipeline(&pipeline_name, version, query_ctx.clone()) + .await?; - let mut insert_requests = Vec::with_capacity(log_ingest_requests.len()); + let transform_timer = std::time::Instant::now(); + let mut intermediate_state = 
pipeline.init_intermediate_state(); - for request in log_ingest_requests { - let transformed_data: Rows = if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - let table = state - .get_table(&request.table, &query_ctx) - .await - .context(CatalogSnafu)?; - pipeline::identity_pipeline(request.values, table) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)? - } else { - let pipeline = state - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; + let mut transformed = Vec::with_capacity(request.values.len()); + let mut dispatched: BTreeMap> = BTreeMap::new(); - let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); - let mut results = Vec::with_capacity(request.values.len()); - for v in request.values { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { + for v in request.values { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .with_label_values(&[db, METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .inspect_err(|_| { + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + let r = pipeline + .exec_mut(&mut intermediate_state) + .inspect_err(|_| { + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .with_label_values(&[db, METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + // FIXME: can only push intermediate state + values.push(v.clone()); + } else { + dispatched.insert(dispatched_to, vec![v]); + } + } } + pipeline.reset_intermediate_state(&mut intermediate_state); + } + + let mut results = Vec::new(); + if !transformed.is_empty() { + results.push(RowInsertRequest { + rows: Some(Rows { + rows: transformed, + schema: pipeline.schemas().clone(), + }), + table_name, + }) + } + + for (dispatched_to, values) in dispatched { + let request = LogIngestRequest { + values, + table: format!("{}_{}", table_name, dispatched_to.table_part), + }; + let next_pipeline_name = dispatched_to + .pipeline + .as_deref() + .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + + let requests = Box::pin(run_pipeline( + state, + next_pipeline_name, + None, + request, + query_ctx, + db, + false, + )) + .await?; + + results.extend(requests); + } + + if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) + .with_label_values(&[db, METRIC_SUCCESS_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); + } - Rows { - rows: results, - schema: pipeline.schemas().clone(), - } - }; + Ok(results) + } +} + +pub(crate) async fn ingest_logs_inner( + state: PipelineHandlerRef, + pipeline_name: String, + version: PipelineVersion, + log_ingest_requests: Vec, + query_ctx: QueryContextRef, +) -> Result { + let db = query_ctx.get_db_string(); + let 
exec_timer = std::time::Instant::now(); + + let mut insert_requests = Vec::with_capacity(log_ingest_requests.len()); + + for request in log_ingest_requests { + let requests = run_pipeline( + &state, + &pipeline_name, + version, + request, + &query_ctx, + db.as_str(), + true, + ) + .await?; - insert_requests.push(RowInsertRequest { - rows: Some(transformed_data), - table_name: request.table.clone(), - }); + insert_requests.extend(requests); } let output = state From 63f79097c963197e63b34eac670c19abd56e5b38 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Fri, 17 Jan 2025 18:53:59 +0800 Subject: [PATCH 04/32] refactor: Enhance ingest function to correctly process original data values custom table names during pipeline execution while optimizing the management of transformed rows and multiple dispatched pipelines --- .../src/etl/transform/transformer/greptime.rs | 46 ++++- src/pipeline/src/lib.rs | 2 +- src/servers/src/http/event.rs | 160 ++++++++++++------ 3 files changed, 145 insertions(+), 63 deletions(-) diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 0ae15d0d50b7..087f5bc97516 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -363,17 +363,28 @@ fn json_value_to_row( } fn identity_pipeline_inner<'a>( - array: Vec, + array: PipelineExecInput, tag_column_names: Option>, ) -> Result { let mut rows = Vec::with_capacity(array.len()); let mut schema_info = SchemaInfo::default(); - for value in array { - if let serde_json::Value::Object(map) = value { - let row = json_value_to_row(&mut schema_info, map)?; - rows.push(row); + + match array { + PipelineExecInput::Original(array) => { + for value in array { + if let serde_json::Value::Object(map) = value { + let row = json_value_to_row(&mut schema_info, map)?; + rows.push(row); + } + } + } + PipelineExecInput::Intermediate { keys, array } => { + for values in array { + todo!() + } } } + let greptime_timestamp_schema = ColumnSchema { column_name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), datatype: ColumnDataType::TimestampNanosecond as i32, @@ -409,6 +420,29 @@ fn identity_pipeline_inner<'a>( }) } +/// The input data format for pipeline +/// +/// It can either be raw input as in `serde_json::Value` or intermediate `Vec` +pub enum PipelineExecInput { + // multiple row values as a value object + Original(Vec), + // 2-dimension row values by column + Intermediate { + array: Vec>, + keys: Vec, + }, +} + +impl PipelineExecInput { + /// return the length of internal array + pub fn len(&self) -> usize { + match self { + PipelineExecInput::Original(array) => array.len(), + PipelineExecInput::Intermediate { array, .. } => array.len(), + } + } +} + /// Identity pipeline for Greptime /// This pipeline will convert the input JSON array to Greptime Rows /// params table is used to set the semantic type of the row key column to Tag @@ -418,7 +452,7 @@ fn identity_pipeline_inner<'a>( /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. 
pub fn identity_pipeline( - array: Vec, + array: PipelineExecInput, table: Option>, ) -> Result { match table { diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 1cf5589b47f3..8ebf9ab0b9f1 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -19,7 +19,7 @@ mod metrics; pub use etl::error::Result; pub use etl::processor::Processor; -pub use etl::transform::transformer::greptime::SchemaInfo; +pub use etl::transform::transformer::greptime::{PipelineExecInput, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 685d67fc8f1a..446306ef26a0 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -18,7 +18,7 @@ use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use api::v1::{RowInsertRequest, RowInsertRequests, Rows}; +use api::v1::{Row, RowInsertRequest, RowInsertRequests, Rows}; use axum::body::HttpBody; use axum::extract::{FromRequest, Multipart, Path, Query, State}; use axum::headers::ContentType; @@ -33,7 +33,10 @@ use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{DispatchedTo, GreptimeTransformer, PipelineExecOutput, PipelineVersion}; +use pipeline::{ + DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, + PipelineVersion, +}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -552,18 +555,49 @@ fn extract_pipeline_value_by_content_type( }) } -enum PipelineInputValue { - // multiple row values as a value object - Original(Vec), - // 2-dimension row values by column - Intermediate(Vec>), +#[inline] +fn pipline_exec_with_intermediate_state( + pipeline: &Arc>, + intermediate_state: &mut Vec, + transformed: &mut Vec, + dispatched: &mut BTreeMap>>, + db: &str, + transform_timer: &Instant, + is_top_level: bool, +) -> Result<()> { + let r = pipeline + .exec_mut(intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + values.push(intermediate_state.clone()); + } else { + dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); + } + } + } + + Ok(()) } async fn run_pipeline( state: &PipelineHandlerRef, pipeline_name: &str, version: PipelineVersion, - value: PipelineInputValue, + values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, db: &str, @@ -571,10 +605,10 @@ async fn run_pipeline( ) -> Result> { if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { let table = state - .get_table(&table, &query_ctx) + .get_table(&table_name, &query_ctx) .await .context(CatalogSnafu)?; - pipeline::identity_pipeline(request.values, table) + pipeline::identity_pipeline(values, table) .map(|rows| { vec![RowInsertRequest { rows: Some(rows), @@ -591,76 +625,89 @@ async fn run_pipeline( let transform_timer = std::time::Instant::now(); let 
mut intermediate_state = pipeline.init_intermediate_state(); - let mut transformed = Vec::with_capacity(request.values.len()); - let mut dispatched: BTreeMap> = BTreeMap::new(); - - for v in request.values { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); + let mut transformed = Vec::with_capacity(values.len()); + let mut dispatched: BTreeMap>> = BTreeMap::new(); + + match values { + PipelineExecInput::Original(array) => { + for v in array { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + pipline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + + pipeline.reset_intermediate_state(&mut intermediate_state); } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - // FIXME: can only push intermediate state - values.push(v.clone()); - } else { - dispatched.insert(dispatched_to, vec![v]); - } + } + PipelineExecInput::Intermediate { array, .. } => { + for mut intermediate_state in array { + pipline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; } } - - pipeline.reset_intermediate_state(&mut intermediate_state); } let mut results = Vec::new(); + // if current pipeline generates some transformed results, build it as + // `RowInsertRequest` and append to results. If the pipeline doesn't + // have dispatch, this will be only output of the pipeline. if !transformed.is_empty() { results.push(RowInsertRequest { rows: Some(Rows { rows: transformed, schema: pipeline.schemas().clone(), }), - table_name, + table_name: table_name.clone(), }) } + // if current pipeline contains dispatcher and has several rules, we may + // already accumulated several dispatched rules and rows. for (dispatched_to, values) in dispatched { - let request = LogIngestRequest { - values, - table: format!("{}_{}", table_name, dispatched_to.table_part), - }; + // we generate the new table name according to `table_part` and + // current custom table name. + let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); let next_pipeline_name = dispatched_to .pipeline .as_deref() .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + // run pipeline recursively. Note that the values we are going to + // process is now intermediate version. It's in form of + // `Vec>`. 
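            // A concrete illustration of the recursion below (table and rule
            // names are hypothetical): a request targeting table "app_logs"
            // whose row matches the rule { value: "http", table_part: "http",
            // pipeline: None } is re-run through the built-in
            // "greptime_identity" pipeline (the default when the rule names no
            // pipeline) and written to the derived table "app_logs_http"; had
            // the rule named "access_log_pipeline", that pipeline would be
            // fetched by name (with version `None`) and applied to the
            // already-prepared intermediate rows.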
let requests = Box::pin(run_pipeline( state, next_pipeline_name, None, - request, + PipelineExecInput::Intermediate { + array: values, + keys: pipeline.intermediate_keys().clone(), + }, + table_name, query_ctx, db, false, @@ -697,7 +744,8 @@ pub(crate) async fn ingest_logs_inner( &state, &pipeline_name, version, - request, + PipelineExecInput::Original(request.values), + request.table, &query_ctx, db.as_str(), true, From 9fd85359e65a91fcc76a9ed548a511b938687cee Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 15:35:57 +0800 Subject: [PATCH 05/32] refactor: call greptime_identity with intermediate values --- src/pipeline/src/dispatcher.rs | 4 +- src/pipeline/src/etl/error.rs | 11 +- .../src/etl/transform/transformer/greptime.rs | 211 +++++++++++++++++- src/pipeline/src/etl/value.rs | 23 ++ src/servers/src/http/event.rs | 4 +- 5 files changed, 245 insertions(+), 8 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index a2a1e9fa1425..45bd6b47cbfb 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -16,8 +16,8 @@ use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; -use crate::etl_error::{ - FieldRequiredForDispatcherSnafu, TablePartRequiredForDispatcherRuleSnafu, +use crate::etl::error::{ + Error, FieldRequiredForDispatcherSnafu, Result, TablePartRequiredForDispatcherRuleSnafu, ValueRequiredForDispatcherRuleSnafu, }; use crate::Value; diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index e19aaad8396e..526ed61ce4a9 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -590,10 +590,17 @@ pub enum Error { }, #[snafu(display("Field is required for dispatcher"))] FieldRequiredForDispatcher, - #[snafu(display("table_part is required for dispatcher rule"))] + #[snafu(display("Table_part is required for dispatcher rule"))] TablePartRequiredForDispatcherRule, - #[snafu(display("value is required for dispatcher rule"))] + #[snafu(display("Value is required for dispatcher rule"))] ValueRequiredForDispatcherRule, + #[snafu(display("Keys and values length mismatch, values: {values}, keys: {keys}"))] + KeyValueLengthMismatch { + #[snafu(implicit)] + location: Location, + keys: usize, + values: usize, + }, } pub type Result = std::result::Result; diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 087f5bc97516..94e922ab7845 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -26,10 +26,12 @@ use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number}; +use snafu::ensure; use crate::etl::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, Result, TransformColumnNameMustBeUniqueSnafu, - TransformEmptySnafu, TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, + IdentifyPipelineColumnTypeMismatchSnafu, KeyValueLengthMismatchSnafu, Result, + TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, + TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; @@ -290,6 +292,201 @@ fn resolve_number_schema( ) } +fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[String]) -> Result { + ensure!( + values.len() == keys.len(), + 
KeyValueLengthMismatchSnafu { + keys: keys.len(), + values: values.len(), + } + ); + + let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); + for _ in 0..schema_info.schema.len() { + row.push(GreptimeValue { value_data: None }); + } + + for (idx, value) in values.into_iter().enumerate() { + // ensured by previous check + let column_name = keys[idx].clone(); + if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { + continue; + } + + let index = schema_info.index.get(&column_name).copied(); + + match value { + Value::Null => {} + + Value::Int8(_) | Value::Int16(_) | Value::Int32(_) | Value::Int64(_) => { + // safe unwrap after type matched + let v = value.as_i64().unwrap(); + resolve_schema( + index, + ValueData::I64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Int64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Uint8(_) | Value::Uint16(_) | Value::Uint32(_) | Value::Uint64(_) => { + // safe unwrap after type matched + let v = value.as_u64().unwrap(); + resolve_schema( + index, + ValueData::U64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Uint64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Float32(_) | Value::Float64(_) => { + // safe unwrap after type matched + let v = value.as_f64().unwrap(); + resolve_schema( + index, + ValueData::F64Value(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Float64 as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Boolean(v) => { + resolve_schema( + index, + ValueData::BoolValue(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::Boolean as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::String(v) => { + resolve_schema( + index, + ValueData::StringValue(v), + ColumnSchema { + column_name, + datatype: ColumnDataType::String as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Timestamp(Timestamp::Nanosecond(ns)) => { + resolve_schema( + index, + ValueData::TimestampNanosecondValue(ns), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampNanosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Microsecond(us)) => { + resolve_schema( + index, + ValueData::TimestampMicrosecondValue(us), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMicrosecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Millisecond(ms)) => { + resolve_schema( + index, + ValueData::TimestampMillisecondValue(ms), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampMillisecond as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + Value::Timestamp(Timestamp::Second(s)) => { + resolve_schema( + index, + ValueData::TimestampSecondValue(s), + ColumnSchema { + column_name, + datatype: ColumnDataType::TimestampSecond as i32, + 
semantic_type: SemanticType::Field as i32, + datatype_extension: None, + options: None, + }, + &mut row, + schema_info, + )?; + } + + Value::Array(_) | Value::Map(_) => { + let data: jsonb::Value = value.into(); + resolve_schema( + index, + ValueData::BinaryValue(data.to_vec()), + ColumnSchema { + column_name, + datatype: ColumnDataType::Binary as i32, + semantic_type: SemanticType::Field as i32, + datatype_extension: Some(ColumnDataTypeExtension { + type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), + }), + options: None, + }, + &mut row, + schema_info, + )?; + } + } + } + Ok(Row { values: row }) +} + fn json_value_to_row( schema_info: &mut SchemaInfo, map: Map, @@ -380,7 +577,8 @@ fn identity_pipeline_inner<'a>( } PipelineExecInput::Intermediate { keys, array } => { for values in array { - todo!() + let row = values_to_row(&mut schema_info, values, &keys)?; + rows.push(row); } } } @@ -441,6 +639,13 @@ impl PipelineExecInput { PipelineExecInput::Intermediate { array, .. } => array.len(), } } + + pub fn is_empty(&self) -> bool { + match self { + PipelineExecInput::Original(array) => array.is_empty(), + PipelineExecInput::Intermediate { array, .. } => array.is_empty(), + } + } } /// Identity pipeline for Greptime diff --git a/src/pipeline/src/etl/value.rs b/src/pipeline/src/etl/value.rs index fee9a2c52742..5d97c0cbd913 100644 --- a/src/pipeline/src/etl/value.rs +++ b/src/pipeline/src/etl/value.rs @@ -249,6 +249,29 @@ impl Value { } } + pub fn as_i64(&self) -> Option { + match self { + Value::Uint32(v) => Some(*v as i64), + Value::Uint16(v) => Some(*v as i64), + Value::Uint8(v) => Some(*v as i64), + Value::Int64(v) => Some(*v), + Value::Int32(v) => Some(*v as i64), + Value::Int16(v) => Some(*v as i64), + Value::Int8(v) => Some(*v as i64), + _ => None, + } + } + + pub fn as_u64(&self) -> Option { + match self { + Value::Uint64(v) => Some(*v), + Value::Uint32(v) => Some(*v as u64), + Value::Uint16(v) => Some(*v as u64), + Value::Uint8(v) => Some(*v as u64), + _ => None, + } + } + pub fn as_f64(&self) -> Option { match self { Value::Float32(v) => Some(*v as f64), diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 446306ef26a0..83038dcea78a 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -612,7 +612,7 @@ async fn run_pipeline( .map(|rows| { vec![RowInsertRequest { rows: Some(rows), - table_name: table_name, + table_name, }] }) .context(PipelineTransformSnafu) @@ -705,6 +705,8 @@ async fn run_pipeline( None, PipelineExecInput::Intermediate { array: values, + // FIXME(sunng87): this intermediate_keys is incorrect. 
what + // we will need is the keys that generated after processors keys: pipeline.intermediate_keys().clone(), }, table_name, From a9bc720b724e60100b0a8bfb0ce33558d152a6b5 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 16:00:27 +0800 Subject: [PATCH 06/32] fix: typo --- src/servers/src/http/event.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 83038dcea78a..6cea3123626d 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -556,7 +556,7 @@ fn extract_pipeline_value_by_content_type( } #[inline] -fn pipline_exec_with_intermediate_state( +fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, intermediate_state: &mut Vec, transformed: &mut Vec, @@ -643,7 +643,7 @@ async fn run_pipeline( .context(PipelineTransformSnafu) .context(PipelineSnafu)?; - pipline_exec_with_intermediate_state( + pipeline_exec_with_intermediate_state( &pipeline, &mut intermediate_state, &mut transformed, @@ -658,7 +658,7 @@ async fn run_pipeline( } PipelineExecInput::Intermediate { array, .. } => { for mut intermediate_state in array { - pipline_exec_with_intermediate_state( + pipeline_exec_with_intermediate_state( &pipeline, &mut intermediate_state, &mut transformed, From 81e57a32660fba09c683aced87457e51bc67213f Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 16:37:13 +0800 Subject: [PATCH 07/32] test: port tests to refactored apis --- src/pipeline/benches/processor.rs | 5 +- src/pipeline/src/etl.rs | 10 +- .../src/etl/transform/transformer/greptime.rs | 13 +- src/pipeline/tests/common.rs | 8 +- src/pipeline/tests/pipeline.rs | 117 ++++++++++++++++-- 5 files changed, 136 insertions(+), 17 deletions(-) diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 8cf221af5b10..01d1a293d66e 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -25,7 +25,10 @@ fn processor_mut( for v in input_values { pipeline.prepare(v, &mut payload)?; - let r = pipeline.exec_mut(&mut payload)?; + let r = pipeline + .exec_mut(&mut payload)? 
+ .into_transformed() + .expect("expect transformed result "); result.push(r); pipeline.reset_intermediate_state(&mut payload); } diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index a12b9d7b0478..50889cb37fad 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -216,13 +216,21 @@ pub enum PipelineExecOutput { } impl PipelineExecOutput { - pub(crate) fn into_transformed(self) -> Option { + pub fn into_transformed(self) -> Option { if let Self::Transformed(o) = self { Some(o) } else { None } } + + pub fn into_dispatched(self) -> Option { + if let Self::DispatchedTo(d) = self { + Some(d) + } else { + None + } + } } impl Pipeline diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 94e922ab7845..7d3752ef2880 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -675,7 +675,7 @@ mod tests { use api::v1::SemanticType; use crate::etl::transform::transformer::greptime::identity_pipeline_inner; - use crate::identity_pipeline; + use crate::{identity_pipeline, PipelineExecInput}; #[test] fn test_identify_pipeline() { @@ -700,7 +700,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -728,7 +728,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -756,7 +756,7 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline(array, None); + let rows = identity_pipeline(PipelineExecInput::Original(array), None); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); @@ -786,7 +786,10 @@ mod tests { }), ]; let tag_column_names = ["name".to_string(), "address".to_string()]; - let rows = identity_pipeline_inner(array, Some(tag_column_names.iter())); + let rows = identity_pipeline_inner( + PipelineExecInput::Original(array), + Some(tag_column_names.iter()), + ); assert!(rows.is_ok()); let rows = rows.unwrap(); assert_eq!(rows.schema.len(), 8); diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index d825c91e4cb3..781c3a30fe0f 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -34,7 +34,9 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { pipeline.prepare(value, &mut result).unwrap(); let row = pipeline .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); pipeline.reset_intermediate_state(&mut result); } @@ -43,7 +45,9 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { pipeline.prepare(input_value, &mut result).unwrap(); let row = pipeline .exec_mut(&mut result) - .expect("failed to exec pipeline"); + .expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); rows.push(row); } _ => { diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index cb84e9ad0c8e..f0fa3992e4bf 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -427,7 +427,9 @@ transform: let row = pipeline .exec_mut(&mut stats) - .expect("failed to exec pipeline"); + 
.expect("failed to exec pipeline") + .into_transformed() + .expect("expect transformed result "); let output = Rows { schema: pipeline.schemas().clone(), @@ -492,7 +494,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -598,7 +604,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -638,10 +648,10 @@ processors: - dissect: fields: - line - patterns: + patterns: - "%{+ts} %{+ts} %{content}" - date: - fields: + fields: - ts formats: - "%Y-%m-%d %H:%M:%S%.3f" @@ -660,7 +670,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values .into_iter() @@ -696,7 +710,12 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let r = row .values @@ -751,7 +770,11 @@ transform: let mut status = pipeline.init_intermediate_state(); pipeline.prepare(input_value, &mut status).unwrap(); - let row = pipeline.exec_mut(&mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect transformed result "); let mut r = row .values @@ -770,3 +793,81 @@ transform: assert_eq!(expected, r); } + +#[test] +fn test_dispatch() { + let input_value_str1 = r#" +{ + "line": "2024-05-25 20:16:37.217 [http] hello world" +} +"#; + let input_value1 = serde_json::from_str::(input_value_str1).unwrap(); + let input_value_str2 = r#" +{ + "line": "2024-05-25 20:16:37.217 [database] hello world" +} +"#; + let input_value2 = serde_json::from_str::(input_value_str2).unwrap(); + + let pipeline_yaml = r#" +processors: + - dissect: + fields: + - line + patterns: + - "%{+ts} %{+ts} [%{logger}] %{content}" + - date: + fields: + - ts + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + +dispatcher: + field: logger + rules: + - value: http + table_part: http + pipeline: access_log_pipeline + +transform: + - fields: + - content + type: string + - field: ts + type: time + index: timestamp +"#; + + let yaml_content = Content::Yaml(pipeline_yaml); + let pipeline: Pipeline = parse(&yaml_content).unwrap(); + + let mut status = pipeline.init_intermediate_state(); + pipeline.prepare(input_value1, &mut status).unwrap(); + let dispatched_to = pipeline + .exec_mut(&mut status) + .unwrap() + .into_dispatched() + .expect("expect dispatched result "); + assert_eq!(dispatched_to.table_part, "http"); + assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); + + let mut status = pipeline.init_intermediate_state(); + pipeline.prepare(input_value2, &mut status).unwrap(); + let row = pipeline + .exec_mut(&mut status) + .unwrap() + .into_transformed() + .expect("expect 
transformed result "); + let r = row + .values + .into_iter() + .map(|v| v.value_data.unwrap()) + .collect::>(); + + let expected = vec![ + StringValue("hello world".into()), + TimestampNanosecondValue(1716668197217000000), + ]; + + assert_eq!(expected, r); +} From d37b59dcec6ad567cabf8398bbd33169771367e3 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 20 Jan 2025 19:27:06 +0800 Subject: [PATCH 08/32] refactor: adapt dryrun api call --- src/servers/src/http/event.rs | 249 +++++++++++++++++++++------------- 1 file changed, 155 insertions(+), 94 deletions(-) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 6cea3123626d..1c341e1610d6 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -276,86 +276,105 @@ fn transform_ndjson_array_factory( } /// Dryrun pipeline with given data -fn dryrun_pipeline_inner( +async fn dryrun_pipeline_inner( value: Vec, - pipeline: &pipeline::Pipeline, + pipeline: Arc>, + pipeline_handler: PipelineHandlerRef, + query_ctx: &QueryContextRef, ) -> Result { - let mut intermediate_state = pipeline.init_intermediate_state(); + let db = query_ctx.get_db_string(); - let mut results = Vec::with_capacity(value.len()); - for v in value { - pipeline - .prepare(v, &mut intermediate_state) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - let r = pipeline - .exec_mut(&mut intermediate_state) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - results.push(r); - pipeline.reset_intermediate_state(&mut intermediate_state); - } + let results = run_pipeline( + &pipeline_handler, + PipelineDefinition::Resolved(pipeline), + PipelineExecInput::Original(value), + "dry_run".to_owned(), + query_ctx, + db.as_ref(), + true, + ) + .await?; let colume_type_key = "colume_type"; let data_type_key = "data_type"; let name_key = "name"; - let schema = pipeline - .schemas() - .iter() - .map(|cs| { - let mut map = Map::new(); - map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); - map.insert( - data_type_key.to_string(), - Value::String(cs.datatype().as_str_name().to_string()), - ); - map.insert( - colume_type_key.to_string(), - Value::String(cs.semantic_type().as_str_name().to_string()), - ); - map.insert( - "fulltext".to_string(), - Value::Bool( - cs.options - .clone() - .is_some_and(|x| x.options.contains_key("fulltext")), - ), - ); - Value::Object(map) - }) - .collect::>(); - let rows = results + let results = results .into_iter() - .map(|row| { - let row = row - .values - .into_iter() - .enumerate() - .map(|(idx, v)| { - v.value_data - .map(|d| { - let mut map = Map::new(); - map.insert("value".to_string(), column_data_to_json(d)); - map.insert("key".to_string(), schema[idx][name_key].clone()); - map.insert( - "semantic_type".to_string(), - schema[idx][colume_type_key].clone(), - ); - map.insert("data_type".to_string(), schema[idx][data_type_key].clone()); - Value::Object(map) - }) - .unwrap_or(Value::Null) - }) - .collect(); - Value::Array(row) + .filter_map(|row| { + if let Some(rows) = row.rows { + let table_name = row.table_name; + let schema = rows.schema; + + let schema = schema + .iter() + .map(|cs| { + let mut map = Map::new(); + map.insert(name_key.to_string(), Value::String(cs.column_name.clone())); + map.insert( + data_type_key.to_string(), + Value::String(cs.datatype().as_str_name().to_string()), + ); + map.insert( + colume_type_key.to_string(), + Value::String(cs.semantic_type().as_str_name().to_string()), + ); + map.insert( + "fulltext".to_string(), + Value::Bool( + 
cs.options + .clone() + .is_some_and(|x| x.options.contains_key("fulltext")), + ), + ); + Value::Object(map) + }) + .collect::>(); + + let rows = rows + .rows + .into_iter() + .map(|row| { + row.values + .into_iter() + .enumerate() + .map(|(idx, v)| { + v.value_data + .map(|d| { + let mut map = Map::new(); + map.insert("value".to_string(), column_data_to_json(d)); + map.insert( + "key".to_string(), + schema[idx][name_key].clone(), + ); + map.insert( + "semantic_type".to_string(), + schema[idx][colume_type_key].clone(), + ); + map.insert( + "data_type".to_string(), + schema[idx][data_type_key].clone(), + ); + Value::Object(map) + }) + .unwrap_or(Value::Null) + }) + .collect() + }) + .collect(); + + let mut result = Map::new(); + result.insert("schema".to_string(), Value::Array(schema)); + result.insert("rows".to_string(), Value::Array(rows)); + result.insert("table_name".to_string(), Value::String(table_name)); + let result = Value::Object(result); + Some(result) + } else { + None + } }) - .collect::>(); - let mut result = Map::new(); - result.insert("schema".to_string(), Value::Array(schema)); - result.insert("rows".to_string(), Value::Array(rows)); - let result = Value::Object(result); - Ok(Json(result).into_response()) + .collect(); + Ok(Json(Value::Array(results)).into_response()) } /// Dryrun pipeline with given data @@ -421,6 +440,9 @@ pub async fn pipeline_dryrun( ) -> Result { let handler = log_state.log_handler; + query_ctx.set_channel(Channel::Http); + let query_ctx = Arc::new(query_ctx); + match check_pipeline_dryrun_params_valid(&payload) { Some(params) => { let data = params.data; @@ -433,20 +455,29 @@ pub async fn pipeline_dryrun( to_pipeline_version(params.pipeline_version).context(PipelineSnafu)?; let pipeline_name = check_pipeline_name_exists(params.pipeline_name)?; let pipeline = handler - .get_pipeline(&pipeline_name, version, Arc::new(query_ctx)) + .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(data, &pipeline) + dryrun_pipeline_inner(data, pipeline, handler, &query_ctx).await } Some(pipeline) => { let pipeline = handler.build_pipeline(&pipeline); match pipeline { - Ok(pipeline) => match dryrun_pipeline_inner(data, &pipeline) { - Ok(response) => Ok(response), - Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( - "Failed to exec pipeline", - e, - )), - }, + Ok(pipeline) => { + match dryrun_pipeline_inner( + data, + Arc::new(pipeline), + handler, + &query_ctx, + ) + .await + { + Ok(response) => Ok(response), + Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( + "Failed to exec pipeline", + e, + )), + } + } Err(e) => Ok(add_step_info_for_pipeline_dryrun_error( "Failed to build pipeline", e, @@ -470,14 +501,11 @@ pub async fn pipeline_dryrun( check_data_valid(value.len())?; - query_ctx.set_channel(Channel::Http); - let query_ctx = Arc::new(query_ctx); - let pipeline = handler .get_pipeline(&pipeline_name, version, query_ctx.clone()) .await?; - dryrun_pipeline_inner(value, &pipeline) + dryrun_pipeline_inner(value, pipeline, handler, &query_ctx).await } } } @@ -593,17 +621,54 @@ fn pipeline_exec_with_intermediate_state( Ok(()) } -async fn run_pipeline( +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +enum PipelineDefinition<'a> { + Resolved(Arc>), + ByNameAndValue((&'a str, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl<'a> PipelineDefinition<'a> { + pub fn from_name(name: &'a str, version: 
PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name, version)) + } + } + + /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline + pub async fn get_pipeline( + self, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, + ) -> Result>> { + match self { + Self::Resolved(pipeline) => Ok(pipeline), + Self::ByNameAndValue((name, version)) => { + handler.get_pipeline(name, version, query_ctx.clone()).await + } + _ => { + unreachable!("Never call get_pipeline on identity.") + } + } + } +} + +async fn run_pipeline<'a>( state: &PipelineHandlerRef, - pipeline_name: &str, - version: PipelineVersion, + pipeline_definition: PipelineDefinition<'a>, values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, db: &str, is_top_level: bool, ) -> Result> { - if pipeline_name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + if matches!( + pipeline_definition, + PipelineDefinition::GreptimeIdentityPipeline + ) { let table = state .get_table(&table_name, &query_ctx) .await @@ -618,9 +683,7 @@ async fn run_pipeline( .context(PipelineTransformSnafu) .context(PipelineSnafu) } else { - let pipeline = state - .get_pipeline(&pipeline_name, version, query_ctx.clone()) - .await?; + let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; let transform_timer = std::time::Instant::now(); let mut intermediate_state = pipeline.init_intermediate_state(); @@ -701,8 +764,7 @@ async fn run_pipeline( // `Vec>`. let requests = Box::pin(run_pipeline( state, - next_pipeline_name, - None, + PipelineDefinition::from_name(next_pipeline_name, None), PipelineExecInput::Intermediate { array: values, // FIXME(sunng87): this intermediate_keys is incorrect. what @@ -744,8 +806,7 @@ pub(crate) async fn ingest_logs_inner( for request in log_ingest_requests { let requests = run_pipeline( &state, - &pipeline_name, - version, + PipelineDefinition::from_name(&pipeline_name, version), PipelineExecInput::Original(request.values), request.table, &query_ctx, From dd40c090f0c9dd10402a8495848da4cdad4ca0d6 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 14:56:04 +0800 Subject: [PATCH 09/32] refactor: move pipeline execution code to a separated module --- src/servers/src/elasticsearch.rs | 6 +- src/servers/src/http/event.rs | 225 +--------------------------- src/servers/src/lib.rs | 1 + src/servers/src/pipeline.rs | 243 +++++++++++++++++++++++++++++++ 4 files changed, 252 insertions(+), 223 deletions(-) create mode 100644 src/servers/src/pipeline.rs diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index 58c6aa520a61..41bb9cbc9f76 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -31,13 +31,11 @@ use crate::error::{ status_code_to_http_status, InvalidElasticsearchInputSnafu, ParseJsonSnafu, Result as ServersResult, }; -use crate::http::event::{ - ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, -}; +use crate::http::event::{ingest_logs_inner, LogIngestRequest, LogIngesterQueryParams, LogState}; use crate::metrics::{ METRIC_ELASTICSEARCH_LOGS_DOCS_COUNT, METRIC_ELASTICSEARCH_LOGS_INGESTION_ELAPSED, }; +use crate::pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; // The headers for every response of Elasticsearch API. 
static ELASTICSEARCH_HEADERS: Lazy = Lazy::new(|| { diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 1c341e1610d6..c2998a396671 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::collections::BTreeMap; use std::result::Result as StdResult; use std::str::FromStr; use std::sync::Arc; use std::time::Instant; -use api::v1::{Row, RowInsertRequest, RowInsertRequests, Rows}; +use api::v1::RowInsertRequests; use axum::body::HttpBody; use axum::extract::{FromRequest, Multipart, Path, Query, State}; use axum::headers::ContentType; @@ -31,20 +30,16 @@ use common_query::{Output, OutputData}; use common_telemetry::{error, warn}; use datatypes::value::column_data_to_json; use lazy_static::lazy_static; -use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{ - DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, - PipelineVersion, -}; +use pipeline::{GreptimeTransformer, PipelineExecInput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; use snafu::{ensure, OptionExt, ResultExt}; use crate::error::{ - status_code_to_http_status, CatalogSnafu, Error, InvalidParameterSnafu, ParseJsonSnafu, - PipelineSnafu, Result, UnsupportedContentTypeSnafu, + status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, + Result, UnsupportedContentTypeSnafu, }; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; use crate::http::result::greptime_manage_resp::GreptimedbManageResponse; @@ -53,11 +48,11 @@ use crate::http::HttpResponse; use crate::interceptor::{LogIngestInterceptor, LogIngestInterceptorRef}; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_INGESTION_COUNTER, METRIC_HTTP_LOGS_INGESTION_ELAPSED, - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, + METRIC_SUCCESS_VALUE, }; +use crate::pipeline::{run_pipeline, PipelineDefinition}; use crate::query_handler::PipelineHandlerRef; -pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; const GREPTIME_INTERNAL_PIPELINE_NAME_PREFIX: &str = "greptime_"; lazy_static! 
{ @@ -583,214 +578,6 @@ fn extract_pipeline_value_by_content_type( }) } -#[inline] -fn pipeline_exec_with_intermediate_state( - pipeline: &Arc>, - intermediate_state: &mut Vec, - transformed: &mut Vec, - dispatched: &mut BTreeMap>>, - db: &str, - transform_timer: &Instant, - is_top_level: bool, -) -> Result<()> { - let r = pipeline - .exec_mut(intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); - } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - values.push(intermediate_state.clone()); - } else { - dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); - } - } - } - - Ok(()) -} - -/// Enum for holding information of a pipeline, which is either pipeline itself, -/// or information that be used to retrieve a pipeline from `PipelineHandler` -enum PipelineDefinition<'a> { - Resolved(Arc>), - ByNameAndValue((&'a str, PipelineVersion)), - GreptimeIdentityPipeline, -} - -impl<'a> PipelineDefinition<'a> { - pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { - if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - Self::GreptimeIdentityPipeline - } else { - Self::ByNameAndValue((name, version)) - } - } - - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline - pub async fn get_pipeline( - self, - handler: &PipelineHandlerRef, - query_ctx: &QueryContextRef, - ) -> Result>> { - match self { - Self::Resolved(pipeline) => Ok(pipeline), - Self::ByNameAndValue((name, version)) => { - handler.get_pipeline(name, version, query_ctx.clone()).await - } - _ => { - unreachable!("Never call get_pipeline on identity.") - } - } - } -} - -async fn run_pipeline<'a>( - state: &PipelineHandlerRef, - pipeline_definition: PipelineDefinition<'a>, - values: PipelineExecInput, - table_name: String, - query_ctx: &QueryContextRef, - db: &str, - is_top_level: bool, -) -> Result> { - if matches!( - pipeline_definition, - PipelineDefinition::GreptimeIdentityPipeline - ) { - let table = state - .get_table(&table_name, &query_ctx) - .await - .context(CatalogSnafu)?; - pipeline::identity_pipeline(values, table) - .map(|rows| { - vec![RowInsertRequest { - rows: Some(rows), - table_name, - }] - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu) - } else { - let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; - - let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); - - let mut transformed = Vec::with_capacity(values.len()); - let mut dispatched: BTreeMap>> = BTreeMap::new(); - - match values { - PipelineExecInput::Original(array) => { - for v in array { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - is_top_level, - )?; - - pipeline.reset_intermediate_state(&mut intermediate_state); - } - } - 
PipelineExecInput::Intermediate { array, .. } => { - for mut intermediate_state in array { - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - is_top_level, - )?; - } - } - } - - let mut results = Vec::new(); - // if current pipeline generates some transformed results, build it as - // `RowInsertRequest` and append to results. If the pipeline doesn't - // have dispatch, this will be only output of the pipeline. - if !transformed.is_empty() { - results.push(RowInsertRequest { - rows: Some(Rows { - rows: transformed, - schema: pipeline.schemas().clone(), - }), - table_name: table_name.clone(), - }) - } - - // if current pipeline contains dispatcher and has several rules, we may - // already accumulated several dispatched rules and rows. - for (dispatched_to, values) in dispatched { - // we generate the new table name according to `table_part` and - // current custom table name. - let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); - let next_pipeline_name = dispatched_to - .pipeline - .as_deref() - .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); - - // run pipeline recursively. Note that the values we are going to - // process is now intermediate version. It's in form of - // `Vec>`. - let requests = Box::pin(run_pipeline( - state, - PipelineDefinition::from_name(next_pipeline_name, None), - PipelineExecInput::Intermediate { - array: values, - // FIXME(sunng87): this intermediate_keys is incorrect. what - // we will need is the keys that generated after processors - keys: pipeline.intermediate_keys().clone(), - }, - table_name, - query_ctx, - db, - false, - )) - .await?; - - results.extend(requests); - } - - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_SUCCESS_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } - - Ok(results) - } -} - pub(crate) async fn ingest_logs_inner( state: PipelineHandlerRef, pipeline_name: String, diff --git a/src/servers/src/lib.rs b/src/servers/src/lib.rs index 417d2646513b..a2f76a115583 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -37,6 +37,7 @@ pub mod metrics_handler; pub mod mysql; pub mod opentsdb; pub mod otlp; +mod pipeline; pub mod postgres; mod prom_row_builder; pub mod prom_store; diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs new file mode 100644 index 000000000000..ddebd4d37a6a --- /dev/null +++ b/src/servers/src/pipeline.rs @@ -0,0 +1,243 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
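(Editorial sketch, not part of the patch.) The new module below centralizes run_pipeline, which recursively re-runs dispatched rows through a follow-up pipeline. For orientation, here is a hypothetical pipeline definition with a dispatcher section; the key names mirror the constants read in src/pipeline/src/dispatcher.rs (field, rules, value, table_part, pipeline), while the top-level `dispatcher` key, the concrete field values and the constant name are assumptions made for illustration only.

// Illustrative only: rows whose `protocol` field equals "http" are dispatched to a
// table suffixed with "_http" and re-processed by the hypothetical `http_pipeline`;
// all other rows fall through to the transform section of this pipeline.
const _EXAMPLE_DISPATCHER_PIPELINE: &str = r#"
processors:
  - dissect:
      fields:
        - message
      patterns:
        - "%{protocol} %{content}"

dispatcher:
  field: protocol
  rules:
    - value: http
      table_part: http
      pipeline: http_pipeline

transform:
  - field: content
    type: string
"#;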
+ +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Instant; + +use session::context::QueryContextRef; +use snafu::ResultExt; + +use api::v1::{Row, RowInsertRequest, Rows}; +use pipeline::error::PipelineTransformSnafu; +use pipeline::{ + DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, + PipelineVersion, +}; + +use crate::error::{CatalogSnafu, PipelineSnafu, Result}; +use crate::metrics::{ + METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, +}; +use crate::query_handler::PipelineHandlerRef; + +pub(crate) const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; + +#[inline] +pub(crate) fn pipeline_exec_with_intermediate_state( + pipeline: &Arc>, + intermediate_state: &mut Vec, + transformed: &mut Vec, + dispatched: &mut BTreeMap>>, + db: &str, + transform_timer: &Instant, + is_top_level: bool, +) -> Result<()> { + let r = pipeline + .exec_mut(intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { + transformed.push(row); + } + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(values) = dispatched.get_mut(&dispatched_to) { + values.push(intermediate_state.clone()); + } else { + dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); + } + } + } + + Ok(()) +} + +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +pub(crate) enum PipelineDefinition<'a> { + Resolved(Arc>), + ByNameAndValue((&'a str, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl<'a> PipelineDefinition<'a> { + pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name, version)) + } + } + + /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline + pub async fn get_pipeline( + self, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, + ) -> Result>> { + match self { + Self::Resolved(pipeline) => Ok(pipeline), + Self::ByNameAndValue((name, version)) => { + handler.get_pipeline(name, version, query_ctx.clone()).await + } + _ => { + unreachable!("Never call get_pipeline on identity.") + } + } + } +} + +pub(crate) async fn run_pipeline<'a>( + state: &PipelineHandlerRef, + pipeline_definition: PipelineDefinition<'a>, + values: PipelineExecInput, + table_name: String, + query_ctx: &QueryContextRef, + db: &str, + is_top_level: bool, +) -> Result> { + if matches!( + pipeline_definition, + PipelineDefinition::GreptimeIdentityPipeline + ) { + let table = state + .get_table(&table_name, &query_ctx) + .await + .context(CatalogSnafu)?; + pipeline::identity_pipeline(values, table) + .map(|rows| { + vec![RowInsertRequest { + rows: Some(rows), + table_name, + }] + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu) + } else { + let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; + + let transform_timer = std::time::Instant::now(); + let mut intermediate_state = pipeline.init_intermediate_state(); + + let mut transformed = Vec::with_capacity(values.len()); + let mut dispatched: BTreeMap>> = BTreeMap::new(); + + match values { 
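// (Editorial note, not part of the diff.) The two match arms below cover the two
// input shapes: `Original` carries raw serde_json values straight from the ingest
// request and must first be `prepare`d into the pipeline's intermediate state,
// while `Intermediate` carries rows that an earlier pipeline run already prepared
// and then dispatched here, so they are executed directly.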
+ PipelineExecInput::Original(array) => { + for v in array { + pipeline + .prepare(v, &mut intermediate_state) + .inspect_err(|_| { + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + pipeline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + + pipeline.reset_intermediate_state(&mut intermediate_state); + } + } + PipelineExecInput::Intermediate { array, .. } => { + for mut intermediate_state in array { + pipeline_exec_with_intermediate_state( + &pipeline, + &mut intermediate_state, + &mut transformed, + &mut dispatched, + db, + &transform_timer, + is_top_level, + )?; + } + } + } + + let mut results = Vec::new(); + // if current pipeline generates some transformed results, build it as + // `RowInsertRequest` and append to results. If the pipeline doesn't + // have dispatch, this will be only output of the pipeline. + if !transformed.is_empty() { + results.push(RowInsertRequest { + rows: Some(Rows { + rows: transformed, + schema: pipeline.schemas().clone(), + }), + table_name: table_name.clone(), + }) + } + + // if current pipeline contains dispatcher and has several rules, we may + // already accumulated several dispatched rules and rows. + for (dispatched_to, values) in dispatched { + // we generate the new table name according to `table_part` and + // current custom table name. + let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); + let next_pipeline_name = dispatched_to + .pipeline + .as_deref() + .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); + + // run pipeline recursively. Note that the values we are going to + // process is now intermediate version. It's in form of + // `Vec>`. + let requests = Box::pin(run_pipeline( + state, + PipelineDefinition::from_name(next_pipeline_name, None), + PipelineExecInput::Intermediate { + array: values, + // FIXME(sunng87): this intermediate_keys is incorrect. 
what + // we will need is the keys that generated after processors + keys: pipeline.intermediate_keys().clone(), + }, + table_name, + query_ctx, + db, + false, + )) + .await?; + + results.extend(requests); + } + + if is_top_level { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_SUCCESS_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + } + + Ok(results) + } +} From 233d57f3691ced5018beeab8e3c272619b0ec966 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:09:11 +0800 Subject: [PATCH 10/32] refactor: update otlp pipeline execution path --- src/frontend/src/instance/otlp.rs | 12 ++- src/pipeline/src/etl.rs | 87 +++++++++------ src/pipeline/src/lib.rs | 4 +- src/servers/src/elasticsearch.rs | 2 +- src/servers/src/http/event.rs | 4 +- src/servers/src/http/otlp.rs | 29 +++-- src/servers/src/lib.rs | 1 + src/servers/src/otlp/logs.rs | 171 ++++++++++++++---------------- src/servers/src/pipeline.rs | 58 ++++------ src/servers/src/query_handler.rs | 1 + 10 files changed, 183 insertions(+), 186 deletions(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 989c6c4348fc..5b5a7fbfe10e 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -24,7 +24,7 @@ use pipeline::PipelineWay; use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult}; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; -use servers::query_handler::OpenTelemetryProtocolHandler; +use servers::query_handler::{OpenTelemetryProtocolHandler, PipelineHandlerRef}; use session::context::QueryContextRef; use snafu::ResultExt; @@ -112,6 +112,7 @@ impl OpenTelemetryProtocolHandler for Instance { #[tracing::instrument(skip_all)] async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, @@ -128,7 +129,14 @@ impl OpenTelemetryProtocolHandler for Instance { .get::>(); interceptor_ref.pre_execute(ctx.clone())?; - let (requests, rows) = otlp::logs::to_grpc_insert_requests(request, pipeline, table_name)?; + let (requests, rows) = otlp::logs::to_grpc_insert_requests( + request, + pipeline, + table_name, + &ctx, + &pipeline_handler, + ) + .await?; let _guard = if let Some(limiter) = &self.limiter { let result = limiter.limit_row_inserts(&requests); diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 50889cb37fad..275c1000f46a 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -20,6 +20,8 @@ pub mod processor; pub mod transform; pub mod value; +use std::sync::Arc; + use ahash::HashSet; use common_telemetry::debug; use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu}; @@ -32,6 +34,7 @@ use yaml_rust::YamlLoader; use crate::dispatcher::{Dispatcher, Rule}; use crate::etl::error::Result; +use crate::{GreptimeTransformer, PipelineVersion}; const DESCRIPTION: &str = "description"; const PROCESSORS: &str = "processors"; @@ -256,36 +259,36 @@ where } } - pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { - match val { - Value::Map(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.values.into_iter() { - if search_from >= self.required_keys.len() { - break; - } - - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = 
self.required_keys[search_from..] - .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value; - // next search from is always after the current key - search_from += pos; - } - } - } - Value::String(_) => { - result[0] = val; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); - } - } - Ok(()) - } + // pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { + // match val { + // Value::Map(map) => { + // let mut search_from = 0; + // // because of the key in the json map is ordered + // for (payload_key, payload_value) in map.values.into_iter() { + // if search_from >= self.required_keys.len() { + // break; + // } + + // // because of map key is ordered, required_keys is ordered too + // if let Some(pos) = self.required_keys[search_from..] + // .iter() + // .position(|k| k == &payload_key) + // { + // result[search_from + pos] = payload_value; + // // next search from is always after the current key + // search_from += pos; + // } + // } + // } + // Value::String(_) => { + // result[0] = val; + // } + // _ => { + // return PrepareValueMustBeObjectSnafu.fail(); + // } + // } + // Ok(()) + // } pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { match val { @@ -388,9 +391,29 @@ impl SelectInfo { } } +pub const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; + +/// Enum for holding information of a pipeline, which is either pipeline itself, +/// or information that be used to retrieve a pipeline from `PipelineHandler` +pub enum PipelineDefinition { + Resolved(Arc>), + ByNameAndValue((String, PipelineVersion)), + GreptimeIdentityPipeline, +} + +impl PipelineDefinition { + pub fn from_name(name: &str, version: PipelineVersion) -> Self { + if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { + Self::GreptimeIdentityPipeline + } else { + Self::ByNameAndValue((name.to_owned(), version)) + } + } +} + pub enum PipelineWay { - OtlpLog(Box), - Custom(std::sync::Arc>), + OtlpLogDirect(Box), + Pipeline(PipelineDefinition), } #[cfg(test)] diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index 8ebf9ab0b9f1..9482f63a723a 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -24,8 +24,8 @@ pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; pub use etl::{ - error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineExecOutput, PipelineWay, - SelectInfo, + error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineDefinition, + PipelineExecOutput, PipelineWay, SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, diff --git a/src/servers/src/elasticsearch.rs b/src/servers/src/elasticsearch.rs index 41bb9cbc9f76..c3b4eb54abc9 100644 --- a/src/servers/src/elasticsearch.rs +++ b/src/servers/src/elasticsearch.rs @@ -23,6 +23,7 @@ use axum::{Extension, TypedHeader}; use common_error::ext::ErrorExt; use common_telemetry::{debug, error}; use once_cell::sync::Lazy; +use pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; use serde_json::{json, Deserializer, Value}; use session::context::{Channel, QueryContext}; use snafu::{ensure, ResultExt}; @@ -35,7 +36,6 @@ use crate::http::event::{ingest_logs_inner, LogIngestRequest, LogIngesterQueryPa use crate::metrics::{ METRIC_ELASTICSEARCH_LOGS_DOCS_COUNT, 
METRIC_ELASTICSEARCH_LOGS_INGESTION_ELAPSED, }; -use crate::pipeline::GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME; // The headers for every response of Elasticsearch API. static ELASTICSEARCH_HEADERS: Lazy = Lazy::new(|| { diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index c2998a396671..2b2b1535cbc6 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -31,7 +31,7 @@ use common_telemetry::{error, warn}; use datatypes::value::column_data_to_json; use lazy_static::lazy_static; use pipeline::util::to_pipeline_version; -use pipeline::{GreptimeTransformer, PipelineExecInput, PipelineVersion}; +use pipeline::{GreptimeTransformer, PipelineDefinition, PipelineExecInput, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -50,7 +50,7 @@ use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_INGESTION_COUNTER, METRIC_HTTP_LOGS_INGESTION_ELAPSED, METRIC_SUCCESS_VALUE, }; -use crate::pipeline::{run_pipeline, PipelineDefinition}; +use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; const GREPTIME_INTERNAL_PIPELINE_NAME_PREFIX: &str = "greptime_"; diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index b5c4607c29e3..6657bfc845a3 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -30,7 +30,7 @@ use opentelemetry_proto::tonic::collector::trace::v1::{ ExportTraceServiceRequest, ExportTraceServiceResponse, }; use pipeline::util::to_pipeline_version; -use pipeline::PipelineWay; +use pipeline::{PipelineDefinition, PipelineWay}; use prost::Message; use session::context::{Channel, QueryContext}; use snafu::prelude::*; @@ -39,7 +39,7 @@ use super::header::{write_cost_header_map, CONTENT_TYPE_PROTOBUF}; use crate::error::{self, PipelineSnafu, Result}; use crate::http::extractor::{LogTableName, PipelineInfo, SelectInfoWrapper, TraceTableName}; use crate::otlp::trace::TRACE_TABLE_NAME; -use crate::query_handler::OpenTelemetryProtocolHandlerRef; +use crate::query_handler::{OpenTelemetryProtocolHandlerRef, PipelineHandler}; #[axum_macros::debug_handler] #[tracing::instrument(skip_all, fields(protocol = "otlp", request_type = "metrics"))] @@ -117,25 +117,20 @@ pub async fn logs( .start_timer(); let request = ExportLogsServiceRequest::decode(bytes).context(error::DecodeOtlpRequestSnafu)?; - let pipeline_way = if let Some(pipeline_name) = &pipeline_info.pipeline_name { - let pipeline_version = - to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?; - let pipeline = match handler - .get_pipeline(pipeline_name, pipeline_version, query_ctx.clone()) - .await - { - Ok(p) => p, - Err(e) => { - return Err(e); - } - }; - PipelineWay::Custom(pipeline) + let pipeline = if let Some(pipeline_name) = pipeline_info.pipeline_name { + PipelineWay::Pipeline(PipelineDefinition::from_name( + &pipeline_name, + to_pipeline_version(pipeline_info.pipeline_version).context(PipelineSnafu)?, + )) } else { - PipelineWay::OtlpLog(Box::new(select_info)) + PipelineWay::OtlpLogDirect(Box::new(select_info)) }; + // here we use nightly feature `trait_upcasting` to convert handler to + // pipeline_handler + let pipeline_handler: Arc = handler.clone(); handler - .logs(request, pipeline_way, tablename, query_ctx) + .logs(pipeline_handler, request, pipeline, tablename, query_ctx) .await .map(|o| OtlpResponse { resp_body: ExportLogsServiceResponse { diff --git 
a/src/servers/src/lib.rs b/src/servers/src/lib.rs index a2f76a115583..423d640759f8 100644 --- a/src/servers/src/lib.rs +++ b/src/servers/src/lib.rs @@ -17,6 +17,7 @@ #![feature(exclusive_wrapper)] #![feature(let_chains)] #![feature(if_let_guard)] +#![feature(trait_upcasting)] use datafusion_expr::LogicalPlan; use datatypes::schema::Schema; diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index f11cd4ff3c68..348dcdd2d96a 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,14 +25,16 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::{Array, Map, PipelineWay, SchemaInfo, SelectInfo, Value as PipelineValue}; -use snafu::{ensure, ResultExt}; +use pipeline::{PipelineExecInput, PipelineWay, SchemaInfo, SelectInfo}; +use serde_json::{Map, Value}; +use session::context::QueryContextRef; +use snafu::ensure; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; -use crate::error::{ - IncompatibleSchemaSnafu, OpenTelemetryLogSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, -}; +use crate::error::{IncompatibleSchemaSnafu, Result, UnsupportedJsonDataTypeForTagSnafu}; +use crate::pipeline::run_pipeline; +use crate::query_handler::PipelineHandlerRef; pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; @@ -43,13 +45,15 @@ pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; /// for data structure of OTLP metrics. /// /// Returns `InsertRequests` and total number of rows to ingest -pub fn to_grpc_insert_requests( +pub async fn to_grpc_insert_requests( request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, + query_ctx: &QueryContextRef, + pipeline_handler: PipelineHandlerRef, ) -> Result<(RowInsertRequests, usize)> { match pipeline { - PipelineWay::OtlpLog(select_info) => { + PipelineWay::OtlpLogDirect(select_info) => { let rows = parse_export_logs_service_request_to_rows(request, select_info)?; let len = rows.rows.len(); let insert_request = RowInsertRequest { @@ -63,53 +67,48 @@ pub fn to_grpc_insert_requests( len, )) } - PipelineWay::Custom(p) => { - let request = parse_export_logs_service_request(request); - let mut result = Vec::new(); - let mut intermediate_state = p.init_intermediate_state(); - for v in request { - p.prepare_pipeline_value(v, &mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - let r = p - .exec_mut(&mut intermediate_state) - .context(OpenTelemetryLogSnafu)?; - result.push(r); - } - let len = result.len(); - let rows = Rows { - schema: p.schemas().clone(), - rows: result, - }; - let insert_request = RowInsertRequest { - rows: Some(rows), + PipelineWay::Pipeline(pipeline_def) => { + let data = parse_export_logs_service_request(request); + + let db_string = query_ctx.get_db_string(); + + let inserts = run_pipeline( + &pipeline_handler, + pipeline_def, + PipelineExecInput::Original(data), table_name, - }; - let insert_requests = RowInsertRequests { - inserts: vec![insert_request], - }; + query_ctx, + db_string.as_ref(), + true, + ) + .await?; + let len = inserts + .iter() + .map(|insert| { + insert + .rows + .as_ref() + .map(|rows| rows.rows.len()) + .unwrap_or(0) + }) + .sum(); + + let insert_requests = RowInsertRequests { inserts }; Ok((insert_requests, len)) } } } 
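(Editorial sketch, not part of the patch.) Because a dispatcher can fan rows out into several RowInsertRequests for derived tables, the row count returned above is summed across all of them. A standalone version of that aggregation, assuming only the api::v1 types already used in this diff, could look like:

use api::v1::RowInsertRequest;

// Counts rows across every insert request produced by one pipeline run,
// including requests generated for dispatched sub-tables.
fn total_rows(inserts: &[RowInsertRequest]) -> usize {
    inserts
        .iter()
        .map(|req| req.rows.as_ref().map(|rows| rows.rows.len()).unwrap_or(0))
        .sum()
}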
-fn scope_to_pipeline_value( - scope: Option, -) -> (PipelineValue, PipelineValue, PipelineValue) { +fn scope_to_pipeline_value(scope: Option) -> (Value, Value, Value) { scope .map(|x| { ( - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }), - PipelineValue::String(x.version), - PipelineValue::String(x.name), + Value::Object(key_value_to_map(x.attributes)), + Value::String(x.version), + Value::String(x.name), ) }) - .unwrap_or(( - PipelineValue::Null, - PipelineValue::Null, - PipelineValue::Null, - )) + .unwrap_or((Value::Null, Value::Null, Value::Null)) } fn scope_to_jsonb( @@ -128,51 +127,43 @@ fn scope_to_jsonb( fn log_to_pipeline_value( log: LogRecord, - resource_schema_url: PipelineValue, - resource_attr: PipelineValue, - scope_schema_url: PipelineValue, - scope_name: PipelineValue, - scope_version: PipelineValue, - scope_attrs: PipelineValue, -) -> PipelineValue { - let log_attrs = PipelineValue::Map(Map { - values: key_value_to_map(log.attributes), - }); - let mut map = BTreeMap::new(); - map.insert( - "Timestamp".to_string(), - PipelineValue::Uint64(log.time_unix_nano), - ); + resource_schema_url: Value, + resource_attr: Value, + scope_schema_url: Value, + scope_name: Value, + scope_version: Value, + scope_attrs: Value, +) -> Value { + let log_attrs = Value::Object(key_value_to_map(log.attributes)); + let mut map = Map::new(); + map.insert("Timestamp".to_string(), Value::from(log.time_unix_nano)); map.insert( "ObservedTimestamp".to_string(), - PipelineValue::Uint64(log.observed_time_unix_nano), + Value::from(log.observed_time_unix_nano), ); // need to be convert to string map.insert( "TraceId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.trace_id)), + Value::String(bytes_to_hex_string(&log.trace_id)), ); map.insert( "SpanId".to_string(), - PipelineValue::String(bytes_to_hex_string(&log.span_id)), - ); - map.insert("TraceFlags".to_string(), PipelineValue::Uint32(log.flags)); - map.insert( - "SeverityText".to_string(), - PipelineValue::String(log.severity_text), + Value::String(bytes_to_hex_string(&log.span_id)), ); + map.insert("TraceFlags".to_string(), Value::from(log.flags)); + map.insert("SeverityText".to_string(), Value::String(log.severity_text)); map.insert( "SeverityNumber".to_string(), - PipelineValue::Int32(log.severity_number), + Value::from(log.severity_number), ); // need to be convert to string map.insert( "Body".to_string(), log.body .as_ref() - .map(|x| PipelineValue::String(log_body_to_string(x))) - .unwrap_or(PipelineValue::Null), + .map(|x| Value::String(log_body_to_string(x))) + .unwrap_or(Value::Null), ); map.insert("ResourceSchemaUrl".to_string(), resource_schema_url); @@ -182,7 +173,7 @@ fn log_to_pipeline_value( map.insert("ScopeVersion".to_string(), scope_version); map.insert("ScopeAttributes".to_string(), scope_attrs); map.insert("LogAttributes".to_string(), log_attrs); - PipelineValue::Map(Map { values: map }) + Value::Object(map) } fn build_otlp_logs_identity_schema() -> Vec { @@ -699,22 +690,18 @@ struct ParseInfo { /// transform otlp logs request to pipeline value /// https://opentelemetry.io/docs/concepts/signals/logs/ -fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { +fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec { let mut result = Vec::new(); for r in request.resource_logs { let resource_attr = r .resource - .map(|x| { - PipelineValue::Map(Map { - values: key_value_to_map(x.attributes), - }) - }) - .unwrap_or(PipelineValue::Null); - let 
resource_schema_url = PipelineValue::String(r.schema_url); + .map(|x| Value::Object(key_value_to_map(x.attributes))) + .unwrap_or(Value::Null); + let resource_schema_url = Value::String(r.schema_url); for scope_logs in r.scope_logs { let (scope_attrs, scope_version, scope_name) = scope_to_pipeline_value(scope_logs.scope); - let scope_schema_url = PipelineValue::String(scope_logs.schema_url); + let scope_schema_url = Value::String(scope_logs.schema_url); for log in scope_logs.log_records { let value = log_to_pipeline_value( log, @@ -733,41 +720,41 @@ fn parse_export_logs_service_request(request: ExportLogsServiceRequest) -> Vec
<PipelineValue> {
-fn any_value_to_pipeline_value(value: any_value::Value) ->
PipelineValue { +fn any_value_to_pipeline_value(value: any_value::Value) -> Value { match value { - any_value::Value::StringValue(s) => PipelineValue::String(s), - any_value::Value::IntValue(i) => PipelineValue::Int64(i), - any_value::Value::DoubleValue(d) => PipelineValue::Float64(d), - any_value::Value::BoolValue(b) => PipelineValue::Boolean(b), + any_value::Value::StringValue(s) => Value::String(s), + any_value::Value::IntValue(i) => Value::from(i), + any_value::Value::DoubleValue(d) => Value::from(d), + any_value::Value::BoolValue(b) => Value::Bool(b), any_value::Value::ArrayValue(a) => { let values = a .values .into_iter() .map(|v| match v.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }) .collect(); - PipelineValue::Array(Array { values }) + Value::Array(values) } any_value::Value::KvlistValue(kv) => { let value = key_value_to_map(kv.values); - PipelineValue::Map(Map { values: value }) + Value::Object(value) } - any_value::Value::BytesValue(b) => PipelineValue::String(bytes_to_hex_string(&b)), + any_value::Value::BytesValue(b) => Value::String(bytes_to_hex_string(&b)), } } // convert otlp keyValue vec to map -fn key_value_to_map(key_values: Vec) -> BTreeMap { - let mut map = BTreeMap::new(); +fn key_value_to_map(key_values: Vec) -> Map { + let mut map = Map::new(); for kv in key_values { let value = match kv.value { Some(value) => match value.value { Some(value) => any_value_to_pipeline_value(value), - None => PipelineValue::Null, + None => Value::Null, }, - None => PipelineValue::Null, + None => Value::Null, }; map.insert(kv.key.clone(), value); } diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index ddebd4d37a6a..25914b42df0d 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -22,8 +22,8 @@ use snafu::ResultExt; use api::v1::{Row, RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ - DispatchedTo, GreptimeTransformer, Pipeline, PipelineExecInput, PipelineExecOutput, - PipelineVersion, + DispatchedTo, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecInput, + PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; use crate::error::{CatalogSnafu, PipelineSnafu, Result}; @@ -32,8 +32,6 @@ use crate::metrics::{ }; use crate::query_handler::PipelineHandlerRef; -pub(crate) const GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME: &str = "greptime_identity"; - #[inline] pub(crate) fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, @@ -72,44 +70,28 @@ pub(crate) fn pipeline_exec_with_intermediate_state( Ok(()) } -/// Enum for holding information of a pipeline, which is either pipeline itself, -/// or information that be used to retrieve a pipeline from `PipelineHandler` -pub(crate) enum PipelineDefinition<'a> { - Resolved(Arc>), - ByNameAndValue((&'a str, PipelineVersion)), - GreptimeIdentityPipeline, -} - -impl<'a> PipelineDefinition<'a> { - pub fn from_name(name: &'a str, version: PipelineVersion) -> Self { - if name == GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME { - Self::GreptimeIdentityPipeline - } else { - Self::ByNameAndValue((name, version)) +/// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline +pub async fn get_pipeline( + pipeline_def: PipelineDefinition, + handler: &PipelineHandlerRef, + query_ctx: &QueryContextRef, +) -> Result>> { + match pipeline_def { + PipelineDefinition::Resolved(pipeline) => Ok(pipeline), + PipelineDefinition::ByNameAndValue((name, version)) => { + handler + 
.get_pipeline(&name, version, query_ctx.clone()) + .await } - } - - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline - pub async fn get_pipeline( - self, - handler: &PipelineHandlerRef, - query_ctx: &QueryContextRef, - ) -> Result>> { - match self { - Self::Resolved(pipeline) => Ok(pipeline), - Self::ByNameAndValue((name, version)) => { - handler.get_pipeline(name, version, query_ctx.clone()).await - } - _ => { - unreachable!("Never call get_pipeline on identity.") - } + _ => { + unreachable!("Never call get_pipeline on identity.") } } } -pub(crate) async fn run_pipeline<'a>( +pub(crate) async fn run_pipeline( state: &PipelineHandlerRef, - pipeline_definition: PipelineDefinition<'a>, + pipeline_definition: PipelineDefinition, values: PipelineExecInput, table_name: String, query_ctx: &QueryContextRef, @@ -121,7 +103,7 @@ pub(crate) async fn run_pipeline<'a>( PipelineDefinition::GreptimeIdentityPipeline ) { let table = state - .get_table(&table_name, &query_ctx) + .get_table(&table_name, query_ctx) .await .context(CatalogSnafu)?; pipeline::identity_pipeline(values, table) @@ -134,7 +116,7 @@ pub(crate) async fn run_pipeline<'a>( .context(PipelineTransformSnafu) .context(PipelineSnafu) } else { - let pipeline = pipeline_definition.get_pipeline(state, query_ctx).await?; + let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; let transform_timer = std::time::Instant::now(); let mut intermediate_state = pipeline.init_intermediate_state(); diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index d450815a4a0c..9029a8fc2a99 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -110,6 +110,7 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { async fn logs( &self, + pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, table_name: String, From af64c069da5a0eff8d253deea78239ca6aa2434d Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:12:39 +0800 Subject: [PATCH 11/32] fmt: format imports --- src/servers/src/pipeline.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 25914b42df0d..c4b01b489ef6 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -16,15 +16,14 @@ use std::collections::BTreeMap; use std::sync::Arc; use std::time::Instant; -use session::context::QueryContextRef; -use snafu::ResultExt; - use api::v1::{Row, RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecInput, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; +use session::context::QueryContextRef; +use snafu::ResultExt; use crate::error::{CatalogSnafu, PipelineSnafu, Result}; use crate::metrics::{ From a980314e3a8a488f202c1e9f2918cdc9f43446d1 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 17:16:26 +0800 Subject: [PATCH 12/32] fix: compilation --- src/frontend/src/instance/otlp.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 5b5a7fbfe10e..8c33f4dfdf4b 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -134,7 +134,7 @@ impl OpenTelemetryProtocolHandler for Instance { pipeline, table_name, &ctx, - &pipeline_handler, + pipeline_handler, ) .await?; From 
5bd87988964badb64112379722124e9849e59578 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 21 Jan 2025 18:20:57 +0800 Subject: [PATCH 13/32] fix: resolve residual issues --- src/pipeline/src/etl.rs | 31 ------------------------------- src/servers/src/error.rs | 7 ------- tests-integration/tests/http.rs | 12 ++++++------ 3 files changed, 6 insertions(+), 44 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 275c1000f46a..f7d3a1c3bbe7 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -259,37 +259,6 @@ where } } - // pub fn prepare_pipeline_value(&self, val: Value, result: &mut [Value]) -> Result<()> { - // match val { - // Value::Map(map) => { - // let mut search_from = 0; - // // because of the key in the json map is ordered - // for (payload_key, payload_value) in map.values.into_iter() { - // if search_from >= self.required_keys.len() { - // break; - // } - - // // because of map key is ordered, required_keys is ordered too - // if let Some(pos) = self.required_keys[search_from..] - // .iter() - // .position(|k| k == &payload_key) - // { - // result[search_from + pos] = payload_value; - // // next search from is always after the current key - // search_from += pos; - // } - // } - // } - // Value::String(_) => { - // result[0] = val; - // } - // _ => { - // return PrepareValueMustBeObjectSnafu.fail(); - // } - // } - // Ok(()) - // } - pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { match val { serde_json::Value::Object(map) => { diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index b8882b0b7299..f285019a9d69 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -554,12 +554,6 @@ pub enum Error { location: Location, }, - #[snafu(display("OpenTelemetry log error"))] - OpenTelemetryLog { - source: pipeline::etl_error::Error, - #[snafu(implicit)] - location: Location, - }, #[snafu(display("Unsupported json data type for tag: {} {}", key, ty))] UnsupportedJsonDataTypeForTag { key: String, @@ -658,7 +652,6 @@ impl ErrorExt for Error { | InvalidLokiPayload { .. } | UnsupportedContentType { .. } | TimestampOverflow { .. } - | OpenTelemetryLog { .. } | UnsupportedJsonDataTypeForTag { .. } | InvalidTableName { .. } | PrepareStatementNotFound { .. 
} diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 2c912aa0af60..66fc21a6fd3d 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1440,8 +1440,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } @@ -1470,8 +1470,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } @@ -1498,8 +1498,8 @@ transform: .await; assert_eq!(res.status(), StatusCode::OK); let body: Value = res.json().await; - let schema = &body["schema"]; - let rows = &body["rows"]; + let schema = &body[0]["schema"]; + let rows = &body[0]["rows"]; assert_eq!(schema, &dryrun_schema); assert_eq!(rows, &dryrun_rows); } From b43a6c83139a03e7589bf22c6ddbdb8b16f39c10 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Wed, 22 Jan 2025 14:48:55 +0800 Subject: [PATCH 14/32] refactor: address review comments --- src/servers/src/pipeline.rs | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index c4b01b489ef6..5d11df2eff2b 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -32,23 +32,20 @@ use crate::metrics::{ use crate::query_handler::PipelineHandlerRef; #[inline] -pub(crate) fn pipeline_exec_with_intermediate_state( +fn pipeline_exec_with_intermediate_state( pipeline: &Arc>, intermediate_state: &mut Vec, transformed: &mut Vec, dispatched: &mut BTreeMap>>, db: &str, transform_timer: &Instant, - is_top_level: bool, ) -> Result<()> { let r = pipeline .exec_mut(intermediate_state) .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); }) .context(PipelineTransformSnafu) .context(PipelineSnafu)?; @@ -118,22 +115,20 @@ pub(crate) async fn run_pipeline( let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; let transform_timer = std::time::Instant::now(); - let mut intermediate_state = pipeline.init_intermediate_state(); let mut transformed = Vec::with_capacity(values.len()); let mut dispatched: BTreeMap>> = BTreeMap::new(); match values { PipelineExecInput::Original(array) => { + let mut intermediate_state = pipeline.init_intermediate_state(); for v in array { pipeline .prepare(v, &mut intermediate_state) .inspect_err(|_| { - if is_top_level { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - } + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db, METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); }) .context(PipelineTransformSnafu) .context(PipelineSnafu)?; @@ -145,7 +140,6 @@ pub(crate) async fn run_pipeline( &mut dispatched, db, &transform_timer, - is_top_level, )?; pipeline.reset_intermediate_state(&mut intermediate_state); @@ -160,7 +154,6 @@ pub(crate) async fn 
run_pipeline( &mut dispatched, db, &transform_timer, - is_top_level, )?; } } From ad05a39ed70795296f5026b984e9b557067efe6a Mon Sep 17 00:00:00 2001 From: paomian Date: Thu, 23 Jan 2025 17:16:44 +0800 Subject: [PATCH 15/32] chore: use btreemap as pipeline intermediate status trait modify --- src/pipeline/src/etl.rs | 676 +++++++----------- src/pipeline/src/etl/processor.rs | 137 +--- src/pipeline/src/etl/transform.rs | 172 +---- .../src/etl/transform/transformer/greptime.rs | 112 +-- 4 files changed, 338 insertions(+), 759 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index f7d3a1c3bbe7..61b72efb470e 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -20,6 +20,7 @@ pub mod processor; pub mod transform; pub mod value; +use std::collections::BTreeMap; use std::sync::Arc; use ahash::HashSet; @@ -28,7 +29,7 @@ use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSn use itertools::Itertools; use processor::{Processor, ProcessorBuilder, Processors}; use snafu::{OptionExt, ResultExt}; -use transform::{TransformBuilders, Transformer, Transforms}; +use transform::{Transformer, Transforms}; use value::Value; use yaml_rust::YamlLoader; @@ -59,99 +60,17 @@ where let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); - let processor_builder_list = if let Some(v) = doc[PROCESSORS].as_vec() { + let processors = if let Some(v) = doc[PROCESSORS].as_vec() { v.try_into()? } else { - processor::ProcessorBuilderList::default() + Processors::default() }; - let transform_builders = - if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) { - v.try_into()? - } else { - TransformBuilders::default() - }; - - let processors_required_keys = &processor_builder_list.input_keys; - let processors_output_keys = &processor_builder_list.output_keys; - let processors_required_original_keys = &processor_builder_list.original_input_keys; - - debug!( - "processors_required_original_keys: {:?}", - processors_required_original_keys - ); - debug!("processors_required_keys: {:?}", processors_required_keys); - debug!("processors_output_keys: {:?}", processors_output_keys); - - let transforms_required_keys = &transform_builders.required_keys; - let mut tr_keys = Vec::with_capacity(50); - for key in transforms_required_keys.iter() { - if !processors_output_keys.contains(key) - && !processors_required_original_keys.contains(key) - { - tr_keys.push(key.clone()); - } - } - - let mut required_keys = processors_required_original_keys.clone(); - - required_keys.append(&mut tr_keys); - required_keys.sort(); - - debug!("required_keys: {:?}", required_keys); - - // intermediate keys are the keys that all processor and transformer required - let ordered_intermediate_keys: Vec = [ - processors_required_keys, - transforms_required_keys, - processors_output_keys, - ] - .iter() - .flat_map(|l| l.iter()) - .collect::>() - .into_iter() - .sorted() - .cloned() - .collect_vec(); - - let mut final_intermediate_keys = Vec::with_capacity(ordered_intermediate_keys.len()); - let mut intermediate_keys_exclude_original = - Vec::with_capacity(ordered_intermediate_keys.len()); - - for key_name in ordered_intermediate_keys.iter() { - if required_keys.contains(key_name) { - final_intermediate_keys.push(key_name.clone()); - } else { - intermediate_keys_exclude_original.push(key_name.clone()); - } - } - - final_intermediate_keys.extend(intermediate_keys_exclude_original); - - let output_keys = transform_builders.output_keys.clone(); - - let processors_kind_list = 
processor_builder_list - .processor_builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys)) - .collect::>>()?; - let processors = Processors { - processors: processors_kind_list, - required_keys: processors_required_keys.clone(), - output_keys: processors_output_keys.clone(), - required_original_keys: processors_required_original_keys.clone(), - }; - - let transfor_list = transform_builders - .builders - .into_iter() - .map(|builder| builder.build(&final_intermediate_keys, &output_keys)) - .collect::>>()?; - - let transformers = Transforms { - transforms: transfor_list, - required_keys: transforms_required_keys.clone(), - output_keys: output_keys.clone(), + let transformers = if let Some(v) = doc[TRANSFORMS].as_vec().or(doc[TRANSFORM].as_vec()) + { + v.try_into()? + } else { + Transforms::default() }; let transformer = T::new(transformers)?; @@ -167,9 +86,6 @@ where processors, transformer, dispatcher, - required_keys, - output_keys, - intermediate_keys: final_intermediate_keys, }) } Content::Json(_) => unimplemented!(), @@ -185,14 +101,6 @@ where processors: processor::Processors, dispatcher: Option, transformer: T, - /// required keys for the preprocessing from map data from user - /// include all processor required and transformer required keys - required_keys: Vec, - /// all output keys from the transformer - output_keys: Vec, - /// intermediate keys from the processors - intermediate_keys: Vec, - // pub on_failure: processor::Processors, } /// Where the pipeline executed is dispatched to, with context information @@ -240,64 +148,31 @@ impl Pipeline where T: Transformer, { - pub fn exec_mut(&self, val: &mut Vec) -> Result> { - for processor in self.processors.iter() { - processor.exec_mut(val)?; - } - - let matched_rule = self - .dispatcher - .as_ref() - .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); - - match matched_rule { - None => self - .transformer - .transform_mut(val) - .map(PipelineExecOutput::Transformed), - Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), - } - } - - pub fn prepare(&self, val: serde_json::Value, result: &mut [Value]) -> Result<()> { - match val { - serde_json::Value::Object(map) => { - let mut search_from = 0; - // because of the key in the json map is ordered - for (payload_key, payload_value) in map.into_iter() { - if search_from >= self.required_keys.len() { - break; - } - - // because of map key is ordered, required_keys is ordered too - if let Some(pos) = self.required_keys[search_from..] 
- .iter() - .position(|k| k == &payload_key) - { - result[search_from + pos] = payload_value.try_into()?; - // next search from is always after the current key - search_from += pos; - } - } - } - serde_json::Value::String(_) => { - result[0] = val.try_into()?; - } - _ => { - return PrepareValueMustBeObjectSnafu.fail(); - } - } - Ok(()) + pub fn exec_mut( + &self, + val: &mut BTreeMap, + ) -> Result> { + // for processor in self.processors.iter() { + // processor.exec_mut(val)?; + // } + + // let matched_rule = self + // .dispatcher + // .as_ref() + // .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); + + // match matched_rule { + // None => self + // .transformer + // .transform_mut(val) + // .map(PipelineExecOutput::Transformed), + // Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + // } + todo!() } - pub fn init_intermediate_state(&self) -> Vec { - vec![Value::Null; self.intermediate_keys.len()] - } - - pub fn reset_intermediate_state(&self, result: &mut [Value]) { - for i in result { - *i = Value::Null; - } + pub fn prepare(&self, val: serde_json::Value) -> Result> { + todo!() } pub fn processors(&self) -> &processor::Processors { @@ -308,21 +183,6 @@ where &self.transformer } - /// Required fields in user-supplied data - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// All output keys from the pipeline - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// intermediate keys from the processors - pub fn intermediate_keys(&self) -> &Vec { - &self.intermediate_keys - } - pub fn schemas(&self) -> &Vec { self.transformer.schemas() } @@ -394,242 +254,242 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; - #[test] - fn test_pipeline_prepare() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' -processors: - - csv: - field: my_field - target_fields: field1, field2 -transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - - assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.values[2].value_data { - Some(ValueData::TimestampNanosecondValue(v)) => { - assert_ne!(*v, 0); - } - _ => panic!("expect null value"), - } - } - - #[test] - fn test_dissect_pipeline() { - let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); - let pipeline_str = r#"processors: - - dissect: - fields: - - message - patterns: - - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - - timestamp: - fields: - - ts - formats: - - "%d/%b/%Y:%H:%M:%S %z" - -transform: - - fields: - - ip - - username - - method - - path - - proto - type: string - - fields: - - status - type: uint16 - - fields: - - bytes - type: uint32 - - field: ts - type: timestamp, ns 
- index: time"#; - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline - .prepare(serde_json::Value::String(message), &mut payload) - .unwrap(); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - let sechema = pipeline.schemas(); - - assert_eq!(sechema.len(), result.values.len()); - let test = vec![ - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("129.37.245.88".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("meln1ks".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("PATCH".into())), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue( - "/observability/metrics/production".into(), - )), - ), - ( - ColumnDataType::String as i32, - Some(ValueData::StringValue("HTTP/1.0".into())), - ), - ( - ColumnDataType::Uint16 as i32, - Some(ValueData::U16Value(501)), - ), - ( - ColumnDataType::Uint32 as i32, - Some(ValueData::U32Value(33085)), - ), - ( - ColumnDataType::TimestampNanosecond as i32, - Some(ValueData::TimestampNanosecondValue(1722493367000000000)), - ), - ]; - for i in 0..sechema.len() { - let schema = &sechema[i]; - let value = &result.values[i]; - assert_eq!(schema.datatype, test[i].0); - assert_eq!(value.value_data, test[i].1); - } - } - - #[test] - fn test_csv_pipeline() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#" -description: Pipeline for Apache Tomcat -processors: - - csv: - field: my_field - target_fields: field1, field2 -transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 -"#; - - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let mut payload = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut payload).unwrap(); - assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - assert_eq!( - payload, - vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - ); - let result = pipeline - .exec_mut(&mut payload) - .unwrap() - .into_transformed() - .unwrap(); - assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - match &result.values[2].value_data { - Some(ValueData::TimestampNanosecondValue(v)) => { - assert_ne!(*v, 0); - } - _ => panic!("expect null value"), - } - } - - #[test] - fn test_date_pipeline() { - let input_value_str = r#" - { - "my_field": "1,2", - "foo": "bar", - "test_time": "2014-5-17T04:34:56+00:00" - } - "#; - let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - let pipeline_yaml = r#" ---- -description: Pipeline for Apache Tomcat - -processors: - - timestamp: - field: test_time - -transform: - - field: test_time - type: timestamp, ns - index: time -"#; - - let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - let schema = pipeline.schemas().clone(); - let mut result = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut result).unwrap(); - let row = pipeline - .exec_mut(&mut result) - .unwrap() - .into_transformed() - .unwrap(); - let output = Rows { - schema, - rows: vec![row], - }; - let schemas = output.schema; - - assert_eq!(schemas.len(), 1); - let schema = schemas[0].clone(); - assert_eq!("test_time", schema.column_name); - 
assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); - assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - - let row = output.rows[0].clone(); - assert_eq!(1, row.values.len()); - let value_data = row.values[0].clone().value_data; - assert_eq!( - Some(v1::value::ValueData::TimestampNanosecondValue( - 1400301296000000000 - )), - value_data - ); - } +// #[test] +// fn test_pipeline_prepare() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' +// processors: +// - csv: +// field: my_field +// target_fields: field1, field2 +// transform: +// - field: field1 +// type: uint32 +// - field: field2 +// type: uint32 +// "#; +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut payload).unwrap(); +// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); +// assert_eq!( +// payload, +// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] +// ); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); + +// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); +// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); +// match &result.values[2].value_data { +// Some(ValueData::TimestampNanosecondValue(v)) => { +// assert_ne!(*v, 0); +// } +// _ => panic!("expect null value"), +// } +// } + +// #[test] +// fn test_dissect_pipeline() { +// let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); +// let pipeline_str = r#"processors: +// - dissect: +// fields: +// - message +// patterns: +// - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" +// - timestamp: +// fields: +// - ts +// formats: +// - "%d/%b/%Y:%H:%M:%S %z" + +// transform: +// - fields: +// - ip +// - username +// - method +// - path +// - proto +// type: string +// - fields: +// - status +// type: uint16 +// - fields: +// - bytes +// type: uint32 +// - field: ts +// type: timestamp, ns +// index: time"#; +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline +// .prepare(serde_json::Value::String(message), &mut payload) +// .unwrap(); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); +// let sechema = pipeline.schemas(); + +// assert_eq!(sechema.len(), result.values.len()); +// let test = vec![ +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("129.37.245.88".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("meln1ks".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("PATCH".into())), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue( +// "/observability/metrics/production".into(), +// )), +// ), +// ( +// ColumnDataType::String as i32, +// Some(ValueData::StringValue("HTTP/1.0".into())), +// ), +// ( +// ColumnDataType::Uint16 as i32, +// Some(ValueData::U16Value(501)), +// ), +// ( +// ColumnDataType::Uint32 as i32, +// Some(ValueData::U32Value(33085)), +// ), +// ( +// 
ColumnDataType::TimestampNanosecond as i32, +// Some(ValueData::TimestampNanosecondValue(1722493367000000000)), +// ), +// ]; +// for i in 0..sechema.len() { +// let schema = &sechema[i]; +// let value = &result.values[i]; +// assert_eq!(schema.datatype, test[i].0); +// assert_eq!(value.value_data, test[i].1); +// } +// } + +// #[test] +// fn test_csv_pipeline() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#" +// description: Pipeline for Apache Tomcat +// processors: +// - csv: +// field: my_field +// target_fields: field1, field2 +// transform: +// - field: field1 +// type: uint32 +// - field: field2 +// type: uint32 +// "#; + +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let mut payload = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut payload).unwrap(); +// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); +// assert_eq!( +// payload, +// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] +// ); +// let result = pipeline +// .exec_mut(&mut payload) +// .unwrap() +// .into_transformed() +// .unwrap(); +// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); +// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); +// match &result.values[2].value_data { +// Some(ValueData::TimestampNanosecondValue(v)) => { +// assert_ne!(*v, 0); +// } +// _ => panic!("expect null value"), +// } +// } + +// #[test] +// fn test_date_pipeline() { +// let input_value_str = r#" +// { +// "my_field": "1,2", +// "foo": "bar", +// "test_time": "2014-5-17T04:34:56+00:00" +// } +// "#; +// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + +// let pipeline_yaml = r#" +// --- +// description: Pipeline for Apache Tomcat + +// processors: +// - timestamp: +// field: test_time + +// transform: +// - field: test_time +// type: timestamp, ns +// index: time +// "#; + +// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); +// let schema = pipeline.schemas().clone(); +// let mut result = pipeline.init_intermediate_state(); +// pipeline.prepare(input_value, &mut result).unwrap(); +// let row = pipeline +// .exec_mut(&mut result) +// .unwrap() +// .into_transformed() +// .unwrap(); +// let output = Rows { +// schema, +// rows: vec![row], +// }; +// let schemas = output.schema; + +// assert_eq!(schemas.len(), 1); +// let schema = schemas[0].clone(); +// assert_eq!("test_time", schema.column_name); +// assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); +// assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + +// let row = output.rows[0].clone(); +// assert_eq!(1, row.values.len()); +// let value_data = row.values[0].clone().value_data; +// assert_eq!( +// Some(v1::value::ValueData::TimestampNanosecondValue( +// 1400301296000000000 +// )), +// value_data +// ); +// } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index bf37f1f8ce7f..b6df91204c39 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -27,6 +27,8 @@ pub mod regex; pub mod timestamp; pub mod urlencoding; +use std::collections::BTreeMap; + use ahash::{HashSet, HashSetExt}; use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; use csv::{CsvProcessor, CsvProcessorBuilder}; @@ -80,7 +82,7 @@ pub 
trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: &mut Vec) -> Result<()>; + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()>; } #[derive(Debug)] @@ -114,45 +116,12 @@ pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { fn build(self, intermediate_keys: &[String]) -> Result; } -#[derive(Debug)] -#[enum_dispatch] -pub enum ProcessorBuilders { - Cmcd(CmcdProcessorBuilder), - Csv(CsvProcessorBuilder), - Dissect(DissectProcessorBuilder), - Gsub(GsubProcessorBuilder), - Join(JoinProcessorBuilder), - Letter(LetterProcessorBuilder), - Regex(RegexProcessorBuilder), - Timestamp(TimestampProcessorBuilder), - UrlEncoding(UrlEncodingProcessorBuilder), - Epoch(EpochProcessorBuilder), - Date(DateProcessorBuilder), - JsonPath(JsonPathProcessorBuilder), - Decolorize(DecolorizeProcessorBuilder), - Digest(DigestProcessorBuilder), -} - -#[derive(Debug, Default)] -pub struct ProcessorBuilderList { - pub(crate) processor_builders: Vec, - pub(crate) input_keys: Vec, - pub(crate) output_keys: Vec, - pub(crate) original_input_keys: Vec, -} - #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors /// The order of processors is important /// The output of the first processor will be the input of the second processor pub processors: Vec, - /// all required keys in all processors - pub required_keys: Vec, - /// all required keys in user-supplied data, not pipeline output fields - pub required_original_keys: Vec, - /// all output keys in all processors - pub output_keys: Vec, } impl std::ops::Deref for Processors { @@ -169,80 +138,22 @@ impl std::ops::DerefMut for Processors { } } -impl Processors { - /// A collection of all the processor's required input fields - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - - /// A collection of all the processor's output fields - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - /// Required fields in user-supplied data, not pipeline output fields. 
- pub fn required_original_keys(&self) -> &Vec { - &self.required_original_keys - } -} - -impl TryFrom<&Vec> for ProcessorBuilderList { +impl TryFrom<&Vec> for Processors { type Error = Error; fn try_from(vec: &Vec) -> Result { let mut processors_builders = vec![]; - let mut all_output_keys = HashSet::with_capacity(50); - let mut all_required_keys = HashSet::with_capacity(50); - let mut all_required_original_keys = HashSet::with_capacity(50); for doc in vec { let processor = parse_processor(doc)?; processors_builders.push(processor); } - - for processor in processors_builders.iter() { - { - // get all required keys - let processor_required_keys = processor.input_keys(); - - for key in &processor_required_keys { - if !all_output_keys.contains(key) { - all_required_original_keys.insert(*key); - } - } - - all_required_keys.extend(processor_required_keys); - - let processor_output_keys = processor.output_keys().into_iter(); - all_output_keys.extend(processor_output_keys); - } - } - - let all_required_keys = all_required_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_output_keys = all_output_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - let all_required_original_keys = all_required_original_keys - .into_iter() - .map(|x| x.to_string()) - .sorted() - .collect(); - - Ok(ProcessorBuilderList { - processor_builders: processors_builders, - input_keys: all_required_keys, - output_keys: all_output_keys, - original_input_keys: all_required_original_keys, + Ok(Processors { + processors: processors_builders, }) } } -fn parse_processor(doc: &yaml_rust::Yaml) -> Result { +fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let map = doc.as_hash().context(ProcessorMustBeMapSnafu)?; let key = map.keys().next().context(ProcessorMustHaveStringKeySnafu)?; @@ -255,39 +166,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; - let processor = match str_key { - cmcd::PROCESSOR_CMCD => ProcessorBuilders::Cmcd(CmcdProcessorBuilder::try_from(value)?), - csv::PROCESSOR_CSV => ProcessorBuilders::Csv(CsvProcessorBuilder::try_from(value)?), - dissect::PROCESSOR_DISSECT => { - ProcessorBuilders::Dissect(DissectProcessorBuilder::try_from(value)?) - } - epoch::PROCESSOR_EPOCH => ProcessorBuilders::Epoch(EpochProcessorBuilder::try_from(value)?), - date::PROCESSOR_DATE => ProcessorBuilders::Date(DateProcessorBuilder::try_from(value)?), - gsub::PROCESSOR_GSUB => ProcessorBuilders::Gsub(GsubProcessorBuilder::try_from(value)?), - join::PROCESSOR_JOIN => ProcessorBuilders::Join(JoinProcessorBuilder::try_from(value)?), - letter::PROCESSOR_LETTER => { - ProcessorBuilders::Letter(LetterProcessorBuilder::try_from(value)?) - } - regex::PROCESSOR_REGEX => ProcessorBuilders::Regex(RegexProcessorBuilder::try_from(value)?), - timestamp::PROCESSOR_TIMESTAMP => { - ProcessorBuilders::Timestamp(TimestampProcessorBuilder::try_from(value)?) - } - urlencoding::PROCESSOR_URL_ENCODING => { - ProcessorBuilders::UrlEncoding(UrlEncodingProcessorBuilder::try_from(value)?) - } - json_path::PROCESSOR_JSON_PATH => { - ProcessorBuilders::JsonPath(json_path::JsonPathProcessorBuilder::try_from(value)?) - } - decolorize::PROCESSOR_DECOLORIZE => { - ProcessorBuilders::Decolorize(DecolorizeProcessorBuilder::try_from(value)?) - } - digest::PROCESSOR_DIGEST => { - ProcessorBuilders::Digest(DigestProcessorBuilder::try_from(value)?) 
- } - _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), - }; - - Ok(processor) + todo!() } pub(crate) fn yaml_string(v: &yaml_rust::Yaml, field: &str) -> Result { diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index be7fe35e5076..4daa3a4d8cf4 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -15,6 +15,8 @@ pub mod index; pub mod transformer; +use std::collections::BTreeMap; + use snafu::OptionExt; use crate::etl::error::{Error, Result}; @@ -47,7 +49,7 @@ pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { fn schemas(&self) -> &Vec; fn transforms(&self) -> &Transforms; fn transforms_mut(&mut self) -> &mut Transforms; - fn transform_mut(&self, val: &mut Vec) -> Result; + fn transform_mut(&self, val: &mut BTreeMap) -> Result; } /// On Failure behavior when transform fails @@ -73,37 +75,12 @@ impl std::str::FromStr for OnFailure { } } -#[derive(Debug, Default, Clone)] -pub struct TransformBuilders { - pub(crate) builders: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, -} - #[derive(Debug, Default, Clone)] pub struct Transforms { pub(crate) transforms: Vec, - pub(crate) output_keys: Vec, - pub(crate) required_keys: Vec, } impl Transforms { - pub fn output_keys(&self) -> &Vec { - &self.output_keys - } - - pub fn output_keys_mut(&mut self) -> &mut Vec { - &mut self.output_keys - } - - pub fn required_keys_mut(&mut self) -> &mut Vec { - &mut self.required_keys - } - - pub fn required_keys(&self) -> &Vec { - &self.required_keys - } - pub fn transforms(&self) -> &Vec { &self.transforms } @@ -123,75 +100,11 @@ impl std::ops::DerefMut for Transforms { } } -impl TryFrom<&Vec> for TransformBuilders { +impl TryFrom<&Vec> for Transforms { type Error = Error; fn try_from(docs: &Vec) -> Result { - let mut transforms = Vec::with_capacity(100); - let mut all_output_keys: Vec = Vec::with_capacity(100); - let mut all_required_keys = Vec::with_capacity(100); - for doc in docs { - let transform_builder: TransformBuilder = doc - .as_hash() - .context(TransformElementMustBeMapSnafu)? 
- .try_into()?; - let mut transform_output_keys = transform_builder - .fields - .iter() - .map(|f| f.target_or_input_field().to_string()) - .collect(); - all_output_keys.append(&mut transform_output_keys); - - let mut transform_required_keys = transform_builder - .fields - .iter() - .map(|f| f.input_field().to_string()) - .collect(); - all_required_keys.append(&mut transform_required_keys); - - transforms.push(transform_builder); - } - - all_required_keys.sort(); - - Ok(TransformBuilders { - builders: transforms, - output_keys: all_output_keys, - required_keys: all_required_keys, - }) - } -} - -#[derive(Debug, Clone)] -pub struct TransformBuilder { - fields: Fields, - type_: Value, - default: Option, - index: Option, - on_failure: Option, -} - -impl TransformBuilder { - pub fn build(self, intermediate_keys: &[String], output_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "transform")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); - let output_index = - find_key_index(output_keys, field.target_or_input_field(), "transform")?; - let input = OneInputOneOutputField::new( - input_field_info, - (field.target_or_input_field().to_string(), output_index), - ); - real_fields.push(input); - } - Ok(Transform { - real_fields, - type_: self.type_, - default: self.default, - index: self.index, - on_failure: self.on_failure, - }) + todo!() } } @@ -230,78 +143,3 @@ impl Transform { &self.type_ } } - -impl TryFrom<&yaml_rust::yaml::Hash> for TransformBuilder { - type Error = Error; - - fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { - let mut fields = Fields::default(); - let mut type_ = Value::Null; - let mut default = None; - let mut index = None; - let mut on_failure = None; - - for (k, v) in hash { - let key = k - .as_str() - .with_context(|| KeyMustBeStringSnafu { k: k.clone() })?; - match key { - TRANSFORM_FIELD => { - fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?); - } - - TRANSFORM_FIELDS => { - fields = yaml_new_fields(v, TRANSFORM_FIELDS)?; - } - - TRANSFORM_TYPE => { - let t = yaml_string(v, TRANSFORM_TYPE)?; - type_ = Value::parse_str_type(&t)?; - } - - TRANSFORM_INDEX => { - let index_str = yaml_string(v, TRANSFORM_INDEX)?; - index = Some(index_str.try_into()?); - } - - TRANSFORM_DEFAULT => { - default = Some(Value::try_from(v)?); - } - - TRANSFORM_ON_FAILURE => { - let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?; - on_failure = Some(on_failure_str.parse()?); - } - - _ => {} - } - } - let mut final_default = None; - - if let Some(default_value) = default { - match (&type_, &default_value) { - (Value::Null, _) => { - return TransformTypeMustBeSetSnafu { - fields: format!("{:?}", fields), - default: default_value.to_string(), - } - .fail(); - } - (_, Value::Null) => {} // if default is not set, then it will be regarded as default null - (_, _) => { - let target = type_.parse_str_value(default_value.to_str_value().as_str())?; - final_default = Some(target); - } - } - } - let builder = TransformBuilder { - fields, - type_, - default: final_default, - index, - on_failure, - }; - - Ok(builder) - } -} diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 7d3752ef2880..5ace3afccda7 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -14,7 +14,7 @@ pub mod coerce; -use 
std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use std::sync::Arc; use ahash::HashMap; @@ -52,36 +52,37 @@ pub struct GreptimeTransformer { impl GreptimeTransformer { /// Add a default timestamp column to the transforms fn add_greptime_timestamp_column(transforms: &mut Transforms) { - let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); - let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); - let default = Some(type_.clone()); - - let transform = Transform { - real_fields: vec![OneInputOneOutputField::new( - InputFieldInfo { - name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - index: usize::MAX, - }, - ( - DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - transforms - .transforms - .iter() - .map(|x| x.real_fields.len()) - .sum(), - ), - )], - type_, - default, - index: Some(Index::Time), - on_failure: Some(crate::etl::transform::OnFailure::Default), - }; - let required_keys = transforms.required_keys_mut(); - required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - let output_keys = transforms.output_keys_mut(); - output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - transforms.push(transform); + // let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); + // let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); + // let default = Some(type_.clone()); + + // let transform = Transform { + // real_fields: vec![OneInputOneOutputField::new( + // InputFieldInfo { + // name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + // index: usize::MAX, + // }, + // ( + // DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + // transforms + // .transforms + // .iter() + // .map(|x| x.real_fields.len()) + // .sum(), + // ), + // )], + // type_, + // default, + // index: Some(Index::Time), + // on_failure: Some(crate::etl::transform::OnFailure::Default), + // }; + // let required_keys = transforms.required_keys_mut(); + // required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + + // let output_keys = transforms.output_keys_mut(); + // output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); + // transforms.push(transform); + todo!() } /// Generate the schema for the GreptimeTransformer @@ -161,30 +162,31 @@ impl Transformer for GreptimeTransformer { } } - fn transform_mut(&self, val: &mut Vec) -> Result { - let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; - for transform in self.transforms.iter() { - for field in transform.real_fields.iter() { - let index = field.input_index(); - let output_index = field.output_index(); - match val.get(index) { - Some(v) => { - let value_data = coerce_value(v, transform)?; - // every transform fields has only one output field - values[output_index] = GreptimeValue { value_data }; - } - None => { - let default = transform.get_default(); - let value_data = match default { - Some(default) => coerce_value(default, transform)?, - None => None, - }; - values[output_index] = GreptimeValue { value_data }; - } - } - } - } - Ok(Row { values }) + fn transform_mut(&self, val: &mut BTreeMap) -> Result { + // let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; + // for transform in self.transforms.iter() { + // for field in transform.real_fields.iter() { + // let index = field.input_index(); + // let output_index = field.output_index(); + // match val.get(index) { + // Some(v) => { + // let value_data = coerce_value(v, transform)?; + // // every transform fields has only one output field + // values[output_index] = GreptimeValue { 
value_data }; + // } + // None => { + // let default = transform.get_default(); + // let value_data = match default { + // Some(default) => coerce_value(default, transform)?, + // None => None, + // }; + // values[output_index] = GreptimeValue { value_data }; + // } + // } + // } + // } + // Ok(Row { values }) + todo!() } fn transforms(&self) -> &Transforms { From 13268f975e84794685111d6bd0fe8e555961200f Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 18:13:47 +0800 Subject: [PATCH 16/32] refactor: update dispatcher to accept BTreeMap --- src/pipeline/src/dispatcher.rs | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/pipeline/src/dispatcher.rs b/src/pipeline/src/dispatcher.rs index 45bd6b47cbfb..fa9e54cf0f4a 100644 --- a/src/pipeline/src/dispatcher.rs +++ b/src/pipeline/src/dispatcher.rs @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +use std::collections::BTreeMap; + use common_telemetry::debug; use snafu::OptionExt; use yaml_rust::Yaml; @@ -109,22 +111,17 @@ impl TryFrom<&Yaml> for Dispatcher { impl Dispatcher { /// execute dispatcher and returns matched rule if any - pub(crate) fn exec(&self, keys: &Vec, val: &Vec) -> Option<&Rule> { - if let Some(index) = keys.iter().position(|key| key == &self.field) { - if let Some(value) = val.get(index) { - for rule in &self.rules { - if rule.value == *value { - return Some(rule); - } + pub(crate) fn exec(&self, data: &BTreeMap) -> Option<&Rule> { + if let Some(value) = data.get(&self.field) { + for rule in &self.rules { + if rule.value == *value { + return Some(rule); } - - None - } else { - debug!("value at index {} is not found in {:?}", &index, val); - None } + + None } else { - debug!("field {} not found in keys {:?}", &self.field, keys); + debug!("field {} not found in keys {:?}", &self.field, data.keys()); None } } From a2148121e5502e3569058253dc433f9ccaf7720e Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 18:52:26 +0800 Subject: [PATCH 17/32] refactor: update identity pipeline --- src/pipeline/src/etl/error.rs | 7 - .../src/etl/transform/transformer/greptime.rs | 173 ++---------------- 2 files changed, 19 insertions(+), 161 deletions(-) diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index 2fd267ce9548..d1e0b56e6e9d 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -594,13 +594,6 @@ pub enum Error { TablePartRequiredForDispatcherRule, #[snafu(display("Value is required for dispatcher rule"))] ValueRequiredForDispatcherRule, - #[snafu(display("Keys and values length mismatch, values: {values}, keys: {keys}"))] - KeyValueLengthMismatch { - #[snafu(implicit)] - location: Location, - keys: usize, - values: usize, - }, #[snafu(display( "Reached max nested levels when flattening JSON object: {max_nested_levels}" ))] diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 27338ccfb1b5..f7e59904a313 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -22,21 +22,19 @@ use api::helper::proto_value_type; use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; -use coerce::{coerce_columns, coerce_value}; +use coerce::coerce_columns; use greptime_proto::v1::{ColumnSchema, Row, 
Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number, Value as JsonValue}; -use snafu::ensure; use crate::etl::error::{ - IdentifyPipelineColumnTypeMismatchSnafu, KeyValueLengthMismatchSnafu, - ReachedMaxNestedLevelsSnafu, Result, TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, + IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, + TransformColumnNameMustBeUniqueSnafu, TransformEmptySnafu, TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; -use crate::etl::field::{InputFieldInfo, OneInputOneOutputField}; use crate::etl::transform::index::Index; -use crate::etl::transform::{Transform, Transformer, Transforms}; +use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; /// The header key that contains the pipeline params. @@ -329,23 +327,13 @@ fn resolve_number_schema( ) } -fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[String]) -> Result { - ensure!( - values.len() == keys.len(), - KeyValueLengthMismatchSnafu { - keys: keys.len(), - values: values.len(), - } - ); - +fn values_to_row(schema_info: &mut SchemaInfo, values: BTreeMap) -> Result { let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); for _ in 0..schema_info.schema.len() { row.push(GreptimeValue { value_data: None }); } - for (idx, value) in values.into_iter().enumerate() { - // ensured by previous check - let column_name = keys[idx].clone(); + for (column_name, value) in values.into_iter() { if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { continue; } @@ -524,107 +512,17 @@ fn values_to_row(schema_info: &mut SchemaInfo, values: Vec, keys: &[Strin Ok(Row { values: row }) } -fn json_value_to_row( - schema_info: &mut SchemaInfo, - map: Map, -) -> Result { - let mut row: Vec = Vec::with_capacity(schema_info.schema.len()); - for _ in 0..schema_info.schema.len() { - row.push(GreptimeValue { value_data: None }); - } - for (column_name, value) in map { - if column_name == DEFAULT_GREPTIME_TIMESTAMP_COLUMN { - continue; - } - let index = schema_info.index.get(&column_name).copied(); - match value { - serde_json::Value::Null => { - // do nothing - } - serde_json::Value::String(s) => { - resolve_schema( - index, - ValueData::StringValue(s), - ColumnSchema { - column_name, - datatype: ColumnDataType::String as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: None, - options: None, - }, - &mut row, - schema_info, - )?; - } - serde_json::Value::Bool(b) => { - resolve_schema( - index, - ValueData::BoolValue(b), - ColumnSchema { - column_name, - datatype: ColumnDataType::Boolean as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: None, - options: None, - }, - &mut row, - schema_info, - )?; - } - serde_json::Value::Number(n) => { - resolve_number_schema(n, column_name, index, &mut row, schema_info)?; - } - serde_json::Value::Array(_) | serde_json::Value::Object(_) => { - resolve_schema( - index, - ValueData::BinaryValue(jsonb::Value::from(value).to_vec()), - ColumnSchema { - column_name, - datatype: ColumnDataType::Binary as i32, - semantic_type: SemanticType::Field as i32, - datatype_extension: Some(ColumnDataTypeExtension { - type_ext: Some(TypeExt::JsonType(JsonTypeExtension::JsonBinary.into())), - }), - options: None, - }, - &mut row, - schema_info, - )?; - } - } - } - Ok(Row { values: row }) -} - fn identity_pipeline_inner<'a>( - array: PipelineExecInput, + array: Vec>, tag_column_names: Option>, - 
params: &GreptimePipelineParams, + _params: &GreptimePipelineParams, ) -> Result { let mut rows = Vec::with_capacity(array.len()); let mut schema_info = SchemaInfo::default(); - match array { - PipelineExecInput::Original(array) => { - for value in array { - if let serde_json::Value::Object(map) = value { - let object = if params.flatten_json_object() { - flatten_json_object(map, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)? - } else { - map - }; - - let row = json_value_to_row(&mut schema_info, object)?; - rows.push(row); - } - } - } - PipelineExecInput::Intermediate { keys, array } => { - for values in array { - let row = values_to_row(&mut schema_info, values, &keys)?; - rows.push(row); - } - } + for values in array { + let row = values_to_row(&mut schema_info, values)?; + rows.push(row); } let greptime_timestamp_schema = ColumnSchema { @@ -662,36 +560,6 @@ fn identity_pipeline_inner<'a>( }) } -/// The input data format for pipeline -/// -/// It can either be raw input as in `serde_json::Value` or intermediate `Vec` -pub enum PipelineExecInput { - // multiple row values as a value object - Original(Vec), - // 2-dimension row values by column - Intermediate { - array: Vec>, - keys: Vec, - }, -} - -impl PipelineExecInput { - /// return the length of internal array - pub fn len(&self) -> usize { - match self { - PipelineExecInput::Original(array) => array.len(), - PipelineExecInput::Intermediate { array, .. } => array.len(), - } - } - - pub fn is_empty(&self) -> bool { - match self { - PipelineExecInput::Original(array) => array.is_empty(), - PipelineExecInput::Intermediate { array, .. } => array.is_empty(), - } - } -} - /// Identity pipeline for Greptime /// This pipeline will convert the input JSON array to Greptime Rows /// params table is used to set the semantic type of the row key column to Tag @@ -701,7 +569,7 @@ impl PipelineExecInput { /// 4. The pipeline will return an error if the same column datatype is mismatched /// 5. The pipeline will analyze the schema of each json record and merge them to get the final schema. 
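// A minimal calling sketch, mirroring the tests further below (the variable name
// `json_records` is illustrative only): raw JSON objects are first turned into the
// keyed intermediate rows this function now takes.
//
//     let rows = identity_pipeline(
//         Pipeline::prepare(json_records)?,      // Vec of BTreeMap<String, Value>
//         None,                                  // no table hint
//         &GreptimePipelineParams::default(),
//     )?;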
pub fn identity_pipeline( - array: PipelineExecInput, + array: Vec>, table: Option>, params: &GreptimePipelineParams, ) -> Result { @@ -773,9 +641,9 @@ mod tests { use api::v1::SemanticType; use crate::etl::transform::transformer::greptime::{ - flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, PipelineExecInput, + flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, }; - use crate::identity_pipeline; + use crate::{identity_pipeline, Pipeline}; #[test] fn test_identify_pipeline() { @@ -800,11 +668,8 @@ mod tests { "gaga": "gaga" }), ]; - let rows = identity_pipeline( - PipelineExecInput::Original(array), - None, - &GreptimePipelineParams::default(), - ); + let array = Pipeline::prepare(array).unwrap(); + let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); assert!(rows.is_err()); assert_eq!( rows.err().unwrap().to_string(), @@ -833,7 +698,7 @@ mod tests { }), ]; let rows = identity_pipeline( - PipelineExecInput::Original(array), + Pipeline::prepare(array).unwrap(), None, &GreptimePipelineParams::default(), ); @@ -865,7 +730,7 @@ mod tests { }), ]; let rows = identity_pipeline( - PipelineExecInput::Original(array), + Pipeline::prepare(array).unwrap(), None, &GreptimePipelineParams::default(), ); @@ -899,7 +764,7 @@ mod tests { ]; let tag_column_names = ["name".to_string(), "address".to_string()]; let rows = identity_pipeline_inner( - PipelineExecInput::Original(array), + Pipeline::prepare(array).uwnrap(), Some(tag_column_names.iter()), &GreptimePipelineParams::default(), ); From c7e08eb3103e1217fc2f29c77ed514a58de2f565 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Thu, 23 Jan 2025 19:38:16 +0800 Subject: [PATCH 18/32] refactor: use new input for pipeline --- src/frontend/src/instance/otlp.rs | 4 +- src/servers/src/http/event.rs | 6 +- src/servers/src/http/extractor.rs | 25 +++---- src/servers/src/http/otlp.rs | 11 ++- src/servers/src/otlp/logs.rs | 7 +- src/servers/src/pipeline.rs | 118 ++++++++---------------------- src/servers/src/query_handler.rs | 6 +- 7 files changed, 64 insertions(+), 113 deletions(-) diff --git a/src/frontend/src/instance/otlp.rs b/src/frontend/src/instance/otlp.rs index 8c33f4dfdf4b..fff075cac6a1 100644 --- a/src/frontend/src/instance/otlp.rs +++ b/src/frontend/src/instance/otlp.rs @@ -20,7 +20,7 @@ use common_telemetry::tracing; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::PipelineWay; +use pipeline::{GreptimePipelineParams, PipelineWay}; use servers::error::{self, AuthSnafu, InFlightWriteBytesExceededSnafu, Result as ServerResult}; use servers::interceptor::{OpenTelemetryProtocolInterceptor, OpenTelemetryProtocolInterceptorRef}; use servers::otlp; @@ -115,6 +115,7 @@ impl OpenTelemetryProtocolHandler for Instance { pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> ServerResult { @@ -132,6 +133,7 @@ impl OpenTelemetryProtocolHandler for Instance { let (requests, rows) = otlp::logs::to_grpc_insert_requests( request, pipeline, + pipeline_params, table_name, &ctx, pipeline_handler, diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 6d17144740db..e8f0d749f873 100644 --- a/src/servers/src/http/event.rs +++ 
b/src/servers/src/http/event.rs @@ -282,10 +282,9 @@ async fn dryrun_pipeline_inner( &pipeline_handler, PipelineDefinition::Resolved(pipeline), ¶ms, - PipelineExecInput::Original(value), + Pipeline::prepare(value)?, "dry_run".to_owned(), query_ctx, - db.as_ref(), true, ) .await?; @@ -604,10 +603,9 @@ pub(crate) async fn ingest_logs_inner( &state, PipelineDefinition::from_name(&pipeline_name, version), &pipeline_params, - PipelineExecInput::Original(request.values), + Pipeline::prepare(request.values), request.table, &query_ctx, - db.as_str(), true, ) .await?; diff --git a/src/servers/src/http/extractor.rs b/src/servers/src/http/extractor.rs index f3ae606636c5..ee662f36f615 100644 --- a/src/servers/src/http/extractor.rs +++ b/src/servers/src/http/extractor.rs @@ -18,7 +18,7 @@ use axum::extract::FromRequestParts; use axum::http::request::Parts; use axum::http::StatusCode; use http::HeaderMap; -use pipeline::SelectInfo; +use pipeline::{GreptimePipelineParams, SelectInfo}; use crate::http::header::constants::{ GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, @@ -91,6 +91,7 @@ where pub struct PipelineInfo { pub pipeline_name: Option, pub pipeline_version: Option, + pub pipeline_params: Option, } impl FromRequestParts for PipelineInfo @@ -105,20 +106,14 @@ where string_value_from_header(headers, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME)?; let pipeline_version = string_value_from_header(headers, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME)?; - match (pipeline_name, pipeline_version) { - (Some(name), Some(version)) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: Some(version), - }), - (None, _) => Ok(PipelineInfo { - pipeline_name: None, - pipeline_version: None, - }), - (Some(name), None) => Ok(PipelineInfo { - pipeline_name: Some(name), - pipeline_version: None, - }), - } + let pipeline_parameters = + string_value_from_header(headers, GREPTIME_PIPELINE_PARAMS_HEADER)?; + + Ok(PipelineInfo { + pipeline_name, + pipeline_version, + pipeline_params: pipeline_parameters.map(|v| GreptimePipelineParams::from_params(v)), + }) } } diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index 6657bfc845a3..a7efa4b7d32b 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -126,11 +126,20 @@ pub async fn logs( PipelineWay::OtlpLogDirect(Box::new(select_info)) }; + let pipeline_params = pipeline_info.pipeline_params.unwrap_or_default(); + // here we use nightly feature `trait_upcasting` to convert handler to // pipeline_handler let pipeline_handler: Arc = handler.clone(); handler - .logs(pipeline_handler, request, pipeline, tablename, query_ctx) + .logs( + pipeline_handler, + request, + pipeline, + pipeline_params, + tablename, + query_ctx, + ) .await .map(|o| OtlpResponse { resp_body: ExportLogsServiceResponse { diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index ecf53988adf4..24232fcef01f 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -48,6 +48,7 @@ pub const LOG_TABLE_NAME: &str = "opentelemetry_logs"; pub async fn to_grpc_insert_requests( request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, query_ctx: &QueryContextRef, pipeline_handler: PipelineHandlerRef, @@ -69,19 +70,17 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); + let array = Pipeline::prepare(data)?; let db_string = 
query_ctx.get_db_string(); - let pipeline_params = GreptimePipelineParams::default(); - let inserts = run_pipeline( &pipeline_handler, pipeline_def, &pipeline_params, - PipelineExecInput::Original(data), + array, table_name, query_ctx, - db_string.as_ref(), true, ) .await?; diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index 7ab1275fe0a3..bee45d476404 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -31,41 +31,6 @@ use crate::metrics::{ }; use crate::query_handler::PipelineHandlerRef; -#[inline] -fn pipeline_exec_with_intermediate_state( - pipeline: &Arc>, - intermediate_state: &mut Vec, - transformed: &mut Vec, - dispatched: &mut BTreeMap>>, - db: &str, - transform_timer: &Instant, -) -> Result<()> { - let r = pipeline - .exec_mut(intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - match r { - PipelineExecOutput::Transformed(row) => { - transformed.push(row); - } - PipelineExecOutput::DispatchedTo(dispatched_to) => { - if let Some(values) = dispatched.get_mut(&dispatched_to) { - values.push(intermediate_state.clone()); - } else { - dispatched.insert(dispatched_to, vec![intermediate_state.clone()]); - } - } - } - - Ok(()) -} - /// Never call this on `GreptimeIdentityPipeline` because it's a real pipeline pub async fn get_pipeline( pipeline_def: PipelineDefinition, @@ -89,12 +54,13 @@ pub(crate) async fn run_pipeline( state: &PipelineHandlerRef, pipeline_definition: PipelineDefinition, pipeline_parameters: &GreptimePipelineParams, - values: PipelineExecInput, + array: Vec>, table_name: String, query_ctx: &QueryContextRef, - db: &str, is_top_level: bool, ) -> Result> { + let db = query_ctx.get_db_string(); + if matches!( pipeline_definition, PipelineDefinition::GreptimeIdentityPipeline @@ -103,7 +69,7 @@ pub(crate) async fn run_pipeline( .get_table(&table_name, query_ctx) .await .context(CatalogSnafu)?; - pipeline::identity_pipeline(values, table, pipeline_parameters) + pipeline::identity_pipeline(array, table, pipeline_parameters) .map(|rows| { vec![RowInsertRequest { rows: Some(rows), @@ -118,44 +84,30 @@ pub(crate) async fn run_pipeline( let transform_timer = std::time::Instant::now(); let mut transformed = Vec::with_capacity(values.len()); - let mut dispatched: BTreeMap>> = BTreeMap::new(); - - match values { - PipelineExecInput::Original(array) => { - let mut intermediate_state = pipeline.init_intermediate_state(); - for v in array { - pipeline - .prepare(v, &mut intermediate_state) - .inspect_err(|_| { - METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_FAILURE_VALUE]) - .observe(transform_timer.elapsed().as_secs_f64()); - }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; - - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - )?; - - pipeline.reset_intermediate_state(&mut intermediate_state); + let mut dispatched: BTreeMap>> = + BTreeMap::new(); + + for mut values in array { + let r = pipeline + .exec_mut(&mut values) + .inspect_err(|_| { + METRIC_HTTP_LOGS_TRANSFORM_ELAPSED + .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) + .observe(transform_timer.elapsed().as_secs_f64()); + }) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; + + match r { + PipelineExecOutput::Transformed(row) => { 
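// A row the transformer produced is collected here for the current table; a row
// that matched a dispatcher rule is instead grouped under that rule below and
// re-run through the rule's target pipeline against a derived table name.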
+ transformed.push(row); } - } - PipelineExecInput::Intermediate { array, .. } => { - for mut intermediate_state in array { - pipeline_exec_with_intermediate_state( - &pipeline, - &mut intermediate_state, - &mut transformed, - &mut dispatched, - db, - &transform_timer, - )?; + PipelineExecOutput::DispatchedTo(dispatched_to) => { + if let Some(coll) = dispatched.get_mut(&dispatched_to) { + coll.push(values); + } else { + dispatched.insert(dispatched_to, vec![values]); + } } } } @@ -176,7 +128,7 @@ pub(crate) async fn run_pipeline( // if current pipeline contains dispatcher and has several rules, we may // already accumulated several dispatched rules and rows. - for (dispatched_to, values) in dispatched { + for (dispatched_to, coll) in dispatched { // we generate the new table name according to `table_part` and // current custom table name. let table_name = format!("{}_{}", &table_name, dispatched_to.table_part); @@ -185,22 +137,14 @@ pub(crate) async fn run_pipeline( .as_deref() .unwrap_or(GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME); - // run pipeline recursively. Note that the values we are going to - // process is now intermediate version. It's in form of - // `Vec>`. + // run pipeline recursively. let requests = Box::pin(run_pipeline( state, PipelineDefinition::from_name(next_pipeline_name, None), pipeline_parameters, - PipelineExecInput::Intermediate { - array: values, - // FIXME(sunng87): this intermediate_keys is incorrect. what - // we will need is the keys that generated after processors - keys: pipeline.intermediate_keys().clone(), - }, + coll, table_name, query_ctx, - db, false, )) .await?; @@ -210,7 +154,7 @@ pub(crate) async fn run_pipeline( if is_top_level { METRIC_HTTP_LOGS_TRANSFORM_ELAPSED - .with_label_values(&[db, METRIC_SUCCESS_VALUE]) + .with_label_values(&[db.as_str(), METRIC_SUCCESS_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); } diff --git a/src/servers/src/query_handler.rs b/src/servers/src/query_handler.rs index 9029a8fc2a99..dd41305626b9 100644 --- a/src/servers/src/query_handler.rs +++ b/src/servers/src/query_handler.rs @@ -38,7 +38,10 @@ use log_query::LogQuery; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::collector::metrics::v1::ExportMetricsServiceRequest; use opentelemetry_proto::tonic::collector::trace::v1::ExportTraceServiceRequest; -use pipeline::{GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, PipelineWay}; +use pipeline::{ + GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineInfo, PipelineVersion, + PipelineWay, +}; use serde_json::Value; use session::context::{QueryContext, QueryContextRef}; @@ -113,6 +116,7 @@ pub trait OpenTelemetryProtocolHandler: PipelineHandler { pipeline_handler: PipelineHandlerRef, request: ExportLogsServiceRequest, pipeline: PipelineWay, + pipeline_params: GreptimePipelineParams, table_name: String, ctx: QueryContextRef, ) -> Result; From eb6e8d2cda9c03a79ed4f69f1220980b05d311dd Mon Sep 17 00:00:00 2001 From: paomian Date: Fri, 24 Jan 2025 17:48:17 +0800 Subject: [PATCH 19/32] chore: wip --- src/pipeline/src/etl.rs | 538 +++++++++--------- src/pipeline/src/etl/error.rs | 5 + src/pipeline/src/etl/field.rs | 129 +---- src/pipeline/src/etl/processor.rs | 93 +-- src/pipeline/src/etl/processor/cmcd.rs | 8 +- src/pipeline/src/etl/processor/csv.rs | 4 +- src/pipeline/src/etl/processor/date.rs | 72 +-- src/pipeline/src/etl/processor/decolorize.rs | 67 +-- src/pipeline/src/etl/processor/digest.rs | 79 +-- 
src/pipeline/src/etl/processor/dissect.rs | 4 +- src/pipeline/src/etl/processor/epoch.rs | 68 +-- src/pipeline/src/etl/processor/gsub.rs | 82 +-- src/pipeline/src/etl/processor/join.rs | 77 +-- src/pipeline/src/etl/processor/json_path.rs | 72 +-- src/pipeline/src/etl/processor/letter.rs | 69 +-- src/pipeline/src/etl/processor/regex.rs | 4 +- src/pipeline/src/etl/processor/timestamp.rs | 89 +-- src/pipeline/src/etl/processor/urlencoding.rs | 73 +-- src/pipeline/src/etl/transform.rs | 119 +++- .../src/etl/transform/transformer/greptime.rs | 77 +-- .../transform/transformer/greptime/coerce.rs | 13 +- src/pipeline/src/lib.rs | 2 +- 22 files changed, 642 insertions(+), 1102 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index 61b72efb470e..bca33a607f6e 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -23,11 +23,11 @@ pub mod value; use std::collections::BTreeMap; use std::sync::Arc; -use ahash::HashSet; -use common_telemetry::debug; -use error::{IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu}; +use error::{ + IntermediateKeyIndexSnafu, PrepareValueMustBeObjectSnafu, YamlLoadSnafu, YamlParseSnafu, +}; use itertools::Itertools; -use processor::{Processor, ProcessorBuilder, Processors}; +use processor::{IntermediateStatus, Processor, Processors}; use snafu::{OptionExt, ResultExt}; use transform::{Transformer, Transforms}; use value::Value; @@ -56,6 +56,10 @@ where Content::Yaml(str) => { let docs = YamlLoader::load_from_str(str).context(YamlLoadSnafu)?; + if docs.len() != 1 { + return YamlParseSnafu.fail(); + } + let doc = &docs[0]; let description = doc[DESCRIPTION].as_str().map(|s| s.to_string()); @@ -144,6 +148,25 @@ impl PipelineExecOutput { } } +pub fn json_to_intermediate_state(val: serde_json::Value) -> Result { + match val { + serde_json::Value::Object(map) => { + let mut intermediate_state = BTreeMap::new(); + for (k, v) in map { + intermediate_state.insert(k, Value::try_from(v)?); + } + Ok(intermediate_state) + } + _ => PrepareValueMustBeObjectSnafu.fail(), + } +} + +pub fn json_array_to_intermediate_state( + val: Vec, +) -> Result> { + val.into_iter().map(json_to_intermediate_state).collect() +} + impl Pipeline where T: Transformer, @@ -152,27 +175,22 @@ where &self, val: &mut BTreeMap, ) -> Result> { - // for processor in self.processors.iter() { - // processor.exec_mut(val)?; - // } - - // let matched_rule = self - // .dispatcher - // .as_ref() - // .and_then(|dispatcher| dispatcher.exec(&self.intermediate_keys, val)); - - // match matched_rule { - // None => self - // .transformer - // .transform_mut(val) - // .map(PipelineExecOutput::Transformed), - // Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), - // } - todo!() - } + for processor in self.processors.iter() { + processor.exec_mut(val)?; + } - pub fn prepare(&self, val: serde_json::Value) -> Result> { - todo!() + let matched_rule = self + .dispatcher + .as_ref() + .and_then(|dispatcher| dispatcher.exec(val)); + + match matched_rule { + None => self + .transformer + .transform_mut(val) + .map(PipelineExecOutput::Transformed), + Some(rule) => Ok(PipelineExecOutput::DispatchedTo(rule.into())), + } } pub fn processors(&self) -> &processor::Processors { @@ -254,242 +272,242 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; -// #[test] -// fn test_pipeline_prepare() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar" -// } -// "#; -// let input_value: serde_json::Value = 
serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' -// processors: -// - csv: -// field: my_field -// target_fields: field1, field2 -// transform: -// - field: field1 -// type: uint32 -// - field: field2 -// type: uint32 -// "#; -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut payload).unwrap(); -// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); -// assert_eq!( -// payload, -// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] -// ); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); - -// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); -// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); -// match &result.values[2].value_data { -// Some(ValueData::TimestampNanosecondValue(v)) => { -// assert_ne!(*v, 0); -// } -// _ => panic!("expect null value"), -// } -// } - -// #[test] -// fn test_dissect_pipeline() { -// let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); -// let pipeline_str = r#"processors: -// - dissect: -// fields: -// - message -// patterns: -// - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" -// - timestamp: -// fields: -// - ts -// formats: -// - "%d/%b/%Y:%H:%M:%S %z" - -// transform: -// - fields: -// - ip -// - username -// - method -// - path -// - proto -// type: string -// - fields: -// - status -// type: uint16 -// - fields: -// - bytes -// type: uint32 -// - field: ts -// type: timestamp, ns -// index: time"#; -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline -// .prepare(serde_json::Value::String(message), &mut payload) -// .unwrap(); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); -// let sechema = pipeline.schemas(); - -// assert_eq!(sechema.len(), result.values.len()); -// let test = vec![ -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("129.37.245.88".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("meln1ks".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("PATCH".into())), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue( -// "/observability/metrics/production".into(), -// )), -// ), -// ( -// ColumnDataType::String as i32, -// Some(ValueData::StringValue("HTTP/1.0".into())), -// ), -// ( -// ColumnDataType::Uint16 as i32, -// Some(ValueData::U16Value(501)), -// ), -// ( -// ColumnDataType::Uint32 as i32, -// Some(ValueData::U32Value(33085)), -// ), -// ( -// ColumnDataType::TimestampNanosecond as i32, -// Some(ValueData::TimestampNanosecondValue(1722493367000000000)), -// ), -// ]; -// for i in 0..sechema.len() { -// let schema = &sechema[i]; -// let value = &result.values[i]; -// assert_eq!(schema.datatype, test[i].0); -// assert_eq!(value.value_data, test[i].1); -// } -// } - -// #[test] -// fn test_csv_pipeline() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar" -// } -// "#; -// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#" 
-// description: Pipeline for Apache Tomcat -// processors: -// - csv: -// field: my_field -// target_fields: field1, field2 -// transform: -// - field: field1 -// type: uint32 -// - field: field2 -// type: uint32 -// "#; - -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let mut payload = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut payload).unwrap(); -// assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); -// assert_eq!( -// payload, -// vec![Value::String("1,2".to_string()), Value::Null, Value::Null] -// ); -// let result = pipeline -// .exec_mut(&mut payload) -// .unwrap() -// .into_transformed() -// .unwrap(); -// assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); -// assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); -// match &result.values[2].value_data { -// Some(ValueData::TimestampNanosecondValue(v)) => { -// assert_ne!(*v, 0); -// } -// _ => panic!("expect null value"), -// } -// } - -// #[test] -// fn test_date_pipeline() { -// let input_value_str = r#" -// { -// "my_field": "1,2", -// "foo": "bar", -// "test_time": "2014-5-17T04:34:56+00:00" -// } -// "#; -// let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - -// let pipeline_yaml = r#" -// --- -// description: Pipeline for Apache Tomcat - -// processors: -// - timestamp: -// field: test_time - -// transform: -// - field: test_time -// type: timestamp, ns -// index: time -// "#; - -// let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); -// let schema = pipeline.schemas().clone(); -// let mut result = pipeline.init_intermediate_state(); -// pipeline.prepare(input_value, &mut result).unwrap(); -// let row = pipeline -// .exec_mut(&mut result) -// .unwrap() -// .into_transformed() -// .unwrap(); -// let output = Rows { -// schema, -// rows: vec![row], -// }; -// let schemas = output.schema; - -// assert_eq!(schemas.len(), 1); -// let schema = schemas[0].clone(); -// assert_eq!("test_time", schema.column_name); -// assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); -// assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - -// let row = output.rows[0].clone(); -// assert_eq!(1, row.values.len()); -// let value_data = row.values[0].clone().value_data; -// assert_eq!( -// Some(v1::value::ValueData::TimestampNanosecondValue( -// 1400301296000000000 -// )), -// value_data -// ); -// } + // #[test] + // fn test_pipeline_prepare() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' + // processors: + // - csv: + // field: my_field + // target_fields: field1, field2 + // transform: + // - field: field1 + // type: uint32 + // - field: field2 + // type: uint32 + // "#; + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut payload).unwrap(); + // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + // assert_eq!( + // payload, + // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + // ); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + + // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + // 
assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + // match &result.values[2].value_data { + // Some(ValueData::TimestampNanosecondValue(v)) => { + // assert_ne!(*v, 0); + // } + // _ => panic!("expect null value"), + // } + // } + + // #[test] + // fn test_dissect_pipeline() { + // let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); + // let pipeline_str = r#"processors: + // - dissect: + // fields: + // - message + // patterns: + // - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + // - timestamp: + // fields: + // - ts + // formats: + // - "%d/%b/%Y:%H:%M:%S %z" + + // transform: + // - fields: + // - ip + // - username + // - method + // - path + // - proto + // type: string + // - fields: + // - status + // type: uint16 + // - fields: + // - bytes + // type: uint32 + // - field: ts + // type: timestamp, ns + // index: time"#; + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline + // .prepare(serde_json::Value::String(message), &mut payload) + // .unwrap(); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + // let sechema = pipeline.schemas(); + + // assert_eq!(sechema.len(), result.values.len()); + // let test = vec![ + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("129.37.245.88".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("meln1ks".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("PATCH".into())), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue( + // "/observability/metrics/production".into(), + // )), + // ), + // ( + // ColumnDataType::String as i32, + // Some(ValueData::StringValue("HTTP/1.0".into())), + // ), + // ( + // ColumnDataType::Uint16 as i32, + // Some(ValueData::U16Value(501)), + // ), + // ( + // ColumnDataType::Uint32 as i32, + // Some(ValueData::U32Value(33085)), + // ), + // ( + // ColumnDataType::TimestampNanosecond as i32, + // Some(ValueData::TimestampNanosecondValue(1722493367000000000)), + // ), + // ]; + // for i in 0..sechema.len() { + // let schema = &sechema[i]; + // let value = &result.values[i]; + // assert_eq!(schema.datatype, test[i].0); + // assert_eq!(value.value_data, test[i].1); + // } + // } + + // #[test] + // fn test_csv_pipeline() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#" + // description: Pipeline for Apache Tomcat + // processors: + // - csv: + // field: my_field + // target_fields: field1, field2 + // transform: + // - field: field1 + // type: uint32 + // - field: field2 + // type: uint32 + // "#; + + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let mut payload = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut payload).unwrap(); + // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); + // assert_eq!( + // payload, + // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] + // ); + // let result = pipeline + // .exec_mut(&mut payload) + // .unwrap() + // .into_transformed() + // .unwrap(); + // 
assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + // match &result.values[2].value_data { + // Some(ValueData::TimestampNanosecondValue(v)) => { + // assert_ne!(*v, 0); + // } + // _ => panic!("expect null value"), + // } + // } + + // #[test] + // fn test_date_pipeline() { + // let input_value_str = r#" + // { + // "my_field": "1,2", + // "foo": "bar", + // "test_time": "2014-5-17T04:34:56+00:00" + // } + // "#; + // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + // let pipeline_yaml = r#" + // --- + // description: Pipeline for Apache Tomcat + + // processors: + // - timestamp: + // field: test_time + + // transform: + // - field: test_time + // type: timestamp, ns + // index: time + // "#; + + // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + // let schema = pipeline.schemas().clone(); + // let mut result = pipeline.init_intermediate_state(); + // pipeline.prepare(input_value, &mut result).unwrap(); + // let row = pipeline + // .exec_mut(&mut result) + // .unwrap() + // .into_transformed() + // .unwrap(); + // let output = Rows { + // schema, + // rows: vec![row], + // }; + // let schemas = output.schema; + + // assert_eq!(schemas.len(), 1); + // let schema = schemas[0].clone(); + // assert_eq!("test_time", schema.column_name); + // assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); + // assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + + // let row = output.rows[0].clone(); + // assert_eq!(1, row.values.len()); + // let value_data = row.values[0].clone().value_data; + // assert_eq!( + // Some(v1::value::ValueData::TimestampNanosecondValue( + // 1400301296000000000 + // )), + // value_data + // ); + // } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/error.rs b/src/pipeline/src/etl/error.rs index d1e0b56e6e9d..51080c86eebf 100644 --- a/src/pipeline/src/etl/error.rs +++ b/src/pipeline/src/etl/error.rs @@ -543,6 +543,11 @@ pub enum Error { #[snafu(implicit)] location: Location, }, + #[snafu(display("Yaml parse error."))] + YamlParse { + #[snafu(implicit)] + location: Location, + }, #[snafu(display("Prepare value must be an object"))] PrepareValueMustBeObject { #[snafu(implicit)] diff --git a/src/pipeline/src/etl/field.rs b/src/pipeline/src/etl/field.rs index 10fa681f236c..dd4835ec9279 100644 --- a/src/pipeline/src/etl/field.rs +++ b/src/pipeline/src/etl/field.rs @@ -19,133 +19,12 @@ use snafu::OptionExt; use super::error::{EmptyInputFieldSnafu, MissingInputFieldSnafu}; use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; - -/// Information about the input field including the name and index in intermediate keys. -#[derive(Debug, Default, Clone)] -pub struct InputFieldInfo { - pub(crate) name: String, - pub(crate) index: usize, -} - -impl InputFieldInfo { - /// Create a new input field info with the given field name and index. - pub(crate) fn new(field: impl Into, index: usize) -> Self { - InputFieldInfo { - name: field.into(), - index, - } - } -} - -/// Information about a field that has one input and one output. -#[derive(Debug, Default, Clone)] -pub struct OneInputOneOutputField { - input: InputFieldInfo, - output: Option<(String, usize)>, -} - -impl OneInputOneOutputField { - /// Create a new field with the given input and output. 
- pub(crate) fn new(input: InputFieldInfo, output: (String, usize)) -> Self { - OneInputOneOutputField { - input, - output: Some(output), - } - } - - /// Build a new field with the given processor kind, intermediate keys, input field, and target field. - pub(crate) fn build( - processor_kind: &str, - intermediate_keys: &[String], - input_field: &str, - target_field: &str, - ) -> Result { - let input_index = find_key_index(intermediate_keys, input_field, processor_kind)?; - - let input_field_info = InputFieldInfo::new(input_field, input_index); - let output_index = find_key_index(intermediate_keys, target_field, processor_kind)?; - Ok(OneInputOneOutputField::new( - input_field_info, - (target_field.to_string(), output_index), - )) - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the index of the output field. - pub(crate) fn output_index(&self) -> usize { - *self.output().1 - } - - /// Get the name of the output field. - pub(crate) fn output_name(&self) -> &str { - self.output().0 - } - - /// Get the output field information. - pub(crate) fn output(&self) -> (&String, &usize) { - if let Some((name, index)) = &self.output { - (name, index) - } else { - (&self.input.name, &self.input.index) - } - } -} - -/// Information about a field that has one input and multiple outputs. -#[derive(Debug, Default, Clone)] -pub struct OneInputMultiOutputField { - input: InputFieldInfo, - /// Typically, processors that output multiple keys need to be distinguished by splicing the keys together. - prefix: Option, -} - -impl OneInputMultiOutputField { - /// Create a new field with the given input and prefix. - pub(crate) fn new(input: InputFieldInfo, prefix: Option) -> Self { - OneInputMultiOutputField { input, prefix } - } - - /// Get the input field information. - pub(crate) fn input(&self) -> &InputFieldInfo { - &self.input - } - - /// Get the index of the input field. - pub(crate) fn input_index(&self) -> usize { - self.input.index - } - - /// Get the name of the input field. - pub(crate) fn input_name(&self) -> &str { - &self.input.name - } - - /// Get the prefix for the output fields. - pub(crate) fn target_prefix(&self) -> &str { - self.prefix.as_deref().unwrap_or(&self.input.name) - } -} /// Raw processor-defined inputs and outputs #[derive(Debug, Default, Clone)] pub struct Field { - pub(crate) input_field: String, - pub(crate) target_field: Option, + input_field: String, + target_field: Option, } impl FromStr for Field { @@ -194,6 +73,10 @@ impl Field { pub(crate) fn target_or_input_field(&self) -> &str { self.target_field.as_deref().unwrap_or(&self.input_field) } + + pub(crate) fn set_target_field(&mut self, target_field: Option) { + self.target_field = target_field; + } } /// A collection of fields. diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index b6df91204c39..63854ad552a7 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -12,49 +12,48 @@ // See the License for the specific language governing permissions and // limitations under the License. 
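// The processors below now read and write a keyed intermediate state
// (`IntermediateStatus`, a BTreeMap from column name to `Value`) instead of a
// positional `Vec`. A rough sketch of the shape an implementation takes
// (`self.field` and `self.target` are illustrative names, not part of this patch):
//
//     fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> {
//         if let Some(v) = val.get(&self.field).cloned() {
//             val.insert(self.target.clone(), v);
//         }
//         Ok(())
//     }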
-pub mod cmcd; -pub mod csv; +// pub mod cmcd; +// pub mod csv; pub mod date; pub mod decolorize; pub mod digest; -pub mod dissect; +// pub mod dissect; pub mod epoch; pub mod gsub; pub mod join; pub mod json_path; pub mod letter; -pub mod regex; +// pub mod regex; pub mod timestamp; pub mod urlencoding; use std::collections::BTreeMap; -use ahash::{HashSet, HashSetExt}; -use cmcd::{CmcdProcessor, CmcdProcessorBuilder}; -use csv::{CsvProcessor, CsvProcessorBuilder}; -use date::{DateProcessor, DateProcessorBuilder}; -use decolorize::{DecolorizeProcessor, DecolorizeProcessorBuilder}; -use digest::{DigestProcessor, DigestProcessorBuilder}; -use dissect::{DissectProcessor, DissectProcessorBuilder}; +// use cmcd::CmcdProcessor; +// use csv::CsvProcessor; +use date::DateProcessor; +use decolorize::DecolorizeProcessor; +use digest::DigestProcessor; +// use dissect::DissectProcessor; use enum_dispatch::enum_dispatch; -use epoch::{EpochProcessor, EpochProcessorBuilder}; -use gsub::{GsubProcessor, GsubProcessorBuilder}; -use itertools::Itertools; -use join::{JoinProcessor, JoinProcessorBuilder}; -use json_path::{JsonPathProcessor, JsonPathProcessorBuilder}; -use letter::{LetterProcessor, LetterProcessorBuilder}; -use regex::{RegexProcessor, RegexProcessorBuilder}; +use epoch::EpochProcessor; +use gsub::GsubProcessor; +use join::JoinProcessor; +use json_path::JsonPathProcessor; +use letter::LetterProcessor; +// use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; -use timestamp::{TimestampProcessor, TimestampProcessorBuilder}; -use urlencoding::{UrlEncodingProcessor, UrlEncodingProcessorBuilder}; +use timestamp::TimestampProcessor; +use urlencoding::UrlEncodingProcessor; use super::error::{ FailedParseFieldFromStringSnafu, FieldMustBeTypeSnafu, ProcessorKeyMustBeStringSnafu, - ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, UnsupportedProcessorSnafu, + ProcessorMustBeMapSnafu, ProcessorMustHaveStringKeySnafu, }; use super::field::{Field, Fields}; use crate::etl::error::{Error, Result}; use crate::etl::value::Value; +use crate::etl_error::UnsupportedProcessorSnafu; const FIELD_NAME: &str = "field"; const FIELDS_NAME: &str = "fields"; @@ -67,6 +66,8 @@ const TARGET_FIELDS_NAME: &str = "target_fields"; const JSON_PATH_NAME: &str = "json_path"; const JSON_PATH_RESULT_INDEX_NAME: &str = "result_index"; +pub type IntermediateStatus = BTreeMap; + /// Processor trait defines the interface for all processors. 
/// /// A processor is a transformation that can be applied to a field in a document @@ -82,19 +83,19 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { fn ignore_missing(&self) -> bool; /// Execute the processor on a vector which be preprocessed by the pipeline - fn exec_mut(&self, val: &mut BTreeMap) -> Result<()>; + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()>; } #[derive(Debug)] #[enum_dispatch] pub enum ProcessorKind { - Cmcd(CmcdProcessor), - Csv(CsvProcessor), - Dissect(DissectProcessor), + // Cmcd(CmcdProcessor), + // Csv(CsvProcessor), + // Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), - Regex(RegexProcessor), + // Regex(RegexProcessor), Timestamp(TimestampProcessor), UrlEncoding(UrlEncodingProcessor), Epoch(EpochProcessor), @@ -104,18 +105,6 @@ pub enum ProcessorKind { Digest(DigestProcessor), } -/// ProcessorBuilder trait defines the interface for all processor builders -/// A processor builder is used to create a processor -#[enum_dispatch(ProcessorBuilders)] -pub trait ProcessorBuilder: std::fmt::Debug + Send + Sync + 'static { - /// Get the processor's output keys - fn output_keys(&self) -> HashSet<&str>; - /// Get the processor's input keys - fn input_keys(&self) -> HashSet<&str>; - /// Build the processor - fn build(self, intermediate_keys: &[String]) -> Result; -} - #[derive(Debug, Default)] pub struct Processors { /// A ordered list of processors @@ -166,7 +155,33 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; - todo!() + let processor = match str_key { + // cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), + // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), + // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), + epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), + date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), + gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), + join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), + letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), + // regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), + timestamp::PROCESSOR_TIMESTAMP => { + ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) + } + urlencoding::PROCESSOR_URL_ENCODING => { + ProcessorKind::UrlEncoding(UrlEncodingProcessor::try_from(value)?) + } + json_path::PROCESSOR_JSON_PATH => { + ProcessorKind::JsonPath(json_path::JsonPathProcessor::try_from(value)?) + } + decolorize::PROCESSOR_DECOLORIZE => { + ProcessorKind::Decolorize(DecolorizeProcessor::try_from(value)?) 
+ } + digest::PROCESSOR_DIGEST => ProcessorKind::Digest(DigestProcessor::try_from(value)?), + _ => return UnsupportedProcessorSnafu { processor: str_key }.fail(), + }; + + Ok(processor) } pub(crate) fn yaml_string(v: &yaml_rust::Yaml, field: &str) -> Result { diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 086fe8f3d610..944487472691 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -27,7 +27,7 @@ use crate::etl::error::{ FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Field, Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Field, Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, @@ -35,6 +35,8 @@ use crate::etl::processor::{ }; use crate::etl::value::Value; +use super::IntermediateStatus; + pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; const CMCD_KEY_BR: &str = "br"; // Encoded bitrate, Integer kbps @@ -135,7 +137,7 @@ impl CmcdProcessorBuilder { for field in self.fields.into_iter() { let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; @@ -372,7 +374,7 @@ impl Processor for CmcdProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for (field_index, field) in self.fields.iter().enumerate() { let field_value_index = field.input_index(); match val.get(field_value_index) { diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index c9cb5f847db1..86f39fc89369 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -24,7 +24,7 @@ use crate::etl::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, @@ -64,7 +64,7 @@ impl CsvProcessorBuilder { for field in self.fields { let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let real_field = OneInputMultiOutputField::new(input_field_info, None); real_fields.push(real_field); } diff --git a/src/pipeline/src/etl/processor/date.rs b/src/pipeline/src/etl/processor/date.rs index fa202a0edff2..e080b795402c 100644 --- a/src/pipeline/src/etl/processor/date.rs +++ b/src/pipeline/src/etl/processor/date.rs @@ -14,21 +14,21 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ 
DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateParseSnafu, DateParseTimezoneSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::{Timestamp, Value}; @@ -88,55 +88,7 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug, Default)] -pub struct DateProcessorBuilder { - fields: Fields, - formats: Formats, - timezone: Option>, - locale: Option>, - ignore_missing: bool, -} - -impl ProcessorBuilder for DateProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Date) - } -} - -impl DateProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "date", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DateProcessor { - fields: real_fields, - formats: self.formats, - timezone: self.timezone, - locale: self.locale, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -181,7 +133,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { } } - let builder = DateProcessorBuilder { + let builder = DateProcessor { fields, formats, timezone, @@ -197,7 +149,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DateProcessorBuilder { /// Reserved for compatibility only #[derive(Debug, Default)] pub struct DateProcessor { - fields: Vec, + fields: Fields, formats: Formats, timezone: Option>, locale: Option>, // to support locale @@ -242,20 +194,20 @@ impl Processor for DateProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let timestamp = self.parse(s)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(timestamp)); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: field.input_field().to_string(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/decolorize.rs b/src/pipeline/src/etl/processor/decolorize.rs index e72bc28a1e66..2547b99d6824 100644 --- a/src/pipeline/src/etl/processor/decolorize.rs +++ 
b/src/pipeline/src/etl/processor/decolorize.rs @@ -18,18 +18,17 @@ //! from Grafana Loki and [`strip_ansi_escape_codes`](https://vector.dev/docs/reference/vrl/functions/#strip_ansi_escape_codes) //! from Vector VRL. -use ahash::HashSet; use once_cell::sync::Lazy; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -37,52 +36,10 @@ pub(crate) const PROCESSOR_DECOLORIZE: &str = "decolorize"; static RE: Lazy = Lazy::new(|| Regex::new(r"\x1b\[[0-9;]*m").unwrap()); -#[derive(Debug, Default)] -pub struct DecolorizeProcessorBuilder { - fields: Fields, - ignore_missing: bool, -} - -impl ProcessorBuilder for DecolorizeProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Decolorize) - } -} - -impl DecolorizeProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "decolorize", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DecolorizeProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - }) - } -} - /// Remove ANSI color control codes from the input text. 
#[derive(Debug, Default)] pub struct DecolorizeProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, } @@ -103,7 +60,7 @@ impl DecolorizeProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -129,7 +86,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DecolorizeProcessorBuilder { } } - Ok(DecolorizeProcessorBuilder { + Ok(DecolorizeProcessor { fields, ignore_missing, }) @@ -145,23 +102,23 @@ impl crate::etl::processor::Processor for DecolorizeProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -176,7 +133,7 @@ mod tests { #[test] fn test_decolorize_processor() { let processor = DecolorizeProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, }; diff --git a/src/pipeline/src/etl/processor/digest.rs b/src/pipeline/src/etl/processor/digest.rs index 29054365ad03..64bb2a2f6d8a 100644 --- a/src/pipeline/src/etl/processor/digest.rs +++ b/src/pipeline/src/etl/processor/digest.rs @@ -21,17 +21,16 @@ use std::borrow::Cow; -use ahash::HashSet; use regex::Regex; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, ProcessorBuilder, ProcessorKind, FIELDS_NAME, - FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::Value; use crate::etl_error::DigestPatternInvalidSnafu; @@ -88,54 +87,10 @@ impl PresetPattern { } } -#[derive(Debug, Default)] -pub struct DigestProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, -} - -impl ProcessorBuilder for DigestProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Digest) - } -} - -impl DigestProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = Vec::with_capacity(self.fields.len()); - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "digest", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(DigestProcessor { - fields: real_fields, - ignore_missing: self.ignore_missing, - patterns: self.patterns, - }) - } -} - /// Computes a digest (hash) of the input string. 
#[derive(Debug, Default)] pub struct DigestProcessor { - fields: Vec, + fields: Fields, ignore_missing: bool, patterns: Vec, } @@ -169,7 +124,7 @@ impl DigestProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,10 +181,10 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DigestProcessorBuilder { } for field in fields.iter_mut() { - field.target_field = Some(format!("{}_digest", field.input_field())); + field.set_target_field(Some(format!("{}_digest", field.input_field()))); } - Ok(DigestProcessorBuilder { + Ok(DigestProcessor { fields, patterns, ignore_missing, @@ -246,23 +201,23 @@ impl crate::etl::processor::Processor for DigestProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } @@ -278,7 +233,7 @@ mod tests { #[test] fn test_digest_processor_ip() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Ip.regex()], }; @@ -306,7 +261,7 @@ mod tests { #[test] fn test_digest_processor_uuid() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Uuid.regex()], }; @@ -339,7 +294,7 @@ mod tests { #[test] fn test_digest_processor_brackets() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Bracketed.regex()], }; @@ -389,7 +344,7 @@ mod tests { #[test] fn test_digest_processor_quotes() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![PresetPattern::Quoted.regex()], }; @@ -409,7 +364,7 @@ mod tests { #[test] fn test_digest_processor_custom_regex() { let processor = DigestProcessor { - fields: vec![], + fields: Fields::default(), ignore_missing: false, patterns: vec![Regex::new(r"\d+").unwrap()], }; diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index a9ccf5e8735e..13ad9175e7df 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -25,7 +25,7 @@ use crate::etl::error::{ DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, @@ -612,7 +612,7 @@ impl ProcessorBuilder for DissectProcessorBuilder { for field in self.fields.into_iter() { let input_index = 
find_key_index(intermediate_keys, field.input_field(), "dissect")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); real_fields.push(real_field); diff --git a/src/pipeline/src/etl/processor/epoch.rs b/src/pipeline/src/etl/processor/epoch.rs index f2c03fd120de..29ad6bd3d97d 100644 --- a/src/pipeline/src/etl/processor/epoch.rs +++ b/src/pipeline/src/etl/processor/epoch.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ EpochInvalidResolutionSnafu, Error, FailedToParseIntSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, MILLI_RESOLUTION, @@ -57,56 +57,12 @@ impl TryFrom<&str> for Resolution { } } -#[derive(Debug, Default)] -pub struct EpochProcessorBuilder { - fields: Fields, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for EpochProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Epoch) - } -} - -impl EpochProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "epoch", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(EpochProcessor { - fields: real_fields, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch /// deprecated it should be removed in the future /// Reserved for compatibility only #[derive(Debug, Default)] pub struct EpochProcessor { - fields: Vec, + fields: Fields, resolution: Resolution, ignore_missing: bool, // description @@ -157,7 +113,7 @@ impl EpochProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -188,7 +144,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for EpochProcessorBuilder { _ => {} } } - let builder = EpochProcessorBuilder { + let builder = EpochProcessor { fields, resolution, ignore_missing, @@ -207,23 +163,23 @@ impl Processor for EpochProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = 
field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let timestamp = self.parse(v)?; - let output_index = field.output_index(); - val[output_index] = Value::Timestamp(timestamp); + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), Value::Timestamp(timestamp)); } } } diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index 54c8306ec4de..dbdb9c5c3047 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -12,18 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ Error, GsubPatternRequiredSnafu, GsubReplacementRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -31,68 +31,10 @@ pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; -#[derive(Debug, Default)] -pub struct GsubProcessorBuilder { - fields: Fields, - pattern: Option, - replacement: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for GsubProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Gsub) - } -} - -impl GsubProcessorBuilder { - fn check(self) -> Result { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "gsub", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(GsubProcessor { - fields: real_fields, - pattern: self.pattern, - replacement: self.replacement, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value #[derive(Debug, Default)] pub struct GsubProcessor { - fields: Vec, + fields: Fields, pattern: Option, replacement: Option, ignore_missing: bool, @@ -136,7 +78,7 @@ impl GsubProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -176,7 +118,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessorBuilder { } } - let builder = 
GsubProcessorBuilder { + let builder = GsubProcessor { fields, pattern, replacement, @@ -196,23 +138,23 @@ impl crate::etl::processor::Processor for GsubProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.process(v)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } } } diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index ddbc086ab8da..6913a5428873 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -12,79 +12,26 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, JoinSeparatorRequiredSnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, SEPARATOR_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, SEPARATOR_NAME, }; use crate::etl::value::{Array, Value}; pub(crate) const PROCESSOR_JOIN: &str = "join"; -#[derive(Debug, Default)] -pub struct JoinProcessorBuilder { - fields: Fields, - separator: Option, - ignore_missing: bool, -} - -impl ProcessorBuilder for JoinProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Join) - } -} - -impl JoinProcessorBuilder { - fn check(self) -> Result { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } - - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "join", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JoinProcessor { - fields: real_fields, - separator: self.separator, - ignore_missing: self.ignore_missing, - }) - } -} - /// A processor to join each element of an array into a single string using a separator string between each element #[derive(Debug, Default)] pub struct JoinProcessor { - fields: Vec, + fields: Fields, separator: Option, ignore_missing: bool, } @@ -110,7 +57,7 @@ impl JoinProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -140,7 +87,7 
@@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessorBuilder { } } - let builder = JoinProcessorBuilder { + let builder = JoinProcessor { fields, separator, ignore_missing, @@ -158,20 +105,20 @@ impl Processor for JoinProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::Array(arr)) => { let result = self.process(arr)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index c09d338c637f..c7b4210e83f1 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -12,17 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use jsonpath_rust::JsonPath; use snafu::{OptionExt, ResultExt}; use super::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, IntermediateStatus, Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, JSON_PATH_NAME, JSON_PATH_RESULT_INDEX_NAME, }; use crate::etl::error::{Error, Result}; -use crate::etl::field::{Fields, OneInputOneOutputField}; -use crate::etl::processor::ProcessorKind; +use crate::etl::field::Fields; use crate::etl_error::{ JsonPathParseResultIndexSnafu, JsonPathParseSnafu, KeyMustBeStringSnafu, ProcessorMissingFieldSnafu, @@ -31,54 +29,7 @@ use crate::Value; pub(crate) const PROCESSOR_JSON_PATH: &str = "json_path"; -#[derive(Debug)] -pub struct JsonPathProcessorBuilder { - fields: Fields, - json_path: JsonPath, - ignore_missing: bool, - result_idex: Option, -} - -impl JsonPathProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - JSON_PATH_NAME, - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(JsonPathProcessor { - fields: real_fields, - json_path: self.json_path, - ignore_missing: self.ignore_missing, - result_idex: self.result_idex, - }) - } -} - -impl ProcessorBuilder for JsonPathProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::JsonPath) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> std::result::Result { @@ -117,7 +68,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { } } if let Some(json_path) = json_path { - let processor = JsonPathProcessorBuilder { + let processor = JsonPathProcessor 
{ fields, json_path, ignore_missing, @@ -137,7 +88,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessorBuilder { #[derive(Debug)] pub struct JsonPathProcessor { - fields: Vec, + fields: Fields, json_path: JsonPath, ignore_missing: bool, result_idex: Option, @@ -146,7 +97,7 @@ pub struct JsonPathProcessor { impl Default for JsonPathProcessor { fn default() -> Self { JsonPathProcessor { - fields: vec![], + fields: Fields::default(), json_path: JsonPath::try_from("$").unwrap(), ignore_missing: false, result_idex: None, @@ -179,21 +130,20 @@ impl Processor for JsonPathProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(v) => { let processed = self.process_field(v)?; - - let output_index = field.output_index(); - val[output_index] = processed; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), processed); } None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/letter.rs b/src/pipeline/src/etl/processor/letter.rs index 8eb939918104..960521853e48 100644 --- a/src/pipeline/src/etl/processor/letter.rs +++ b/src/pipeline/src/etl/processor/letter.rs @@ -12,17 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, LetterInvalidMethodSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -59,55 +59,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct LetterProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for LetterProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Letter) - } -} - -impl LetterProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "letter", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - - Ok(LetterProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct LetterProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -125,7 +80,7 @@ impl LetterProcessor { } } 
-impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -154,7 +109,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for LetterProcessorBuilder { } } - Ok(LetterProcessorBuilder { + Ok(LetterProcessor { fields, method, ignore_missing, @@ -171,20 +126,20 @@ impl Processor for LetterProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let (_, output_index) = field.output(); - val[*output_index] = result; + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index de25195f99ab..a6ffa86d1689 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -28,7 +28,7 @@ use crate::etl::error::{ RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, InputFieldInfo, OneInputMultiOutputField}; +use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; use crate::etl::find_key_index; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, @@ -173,7 +173,7 @@ impl RegexProcessorBuilder { let mut real_fields = vec![]; for field in self.fields.into_iter() { let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; - let input_field_info = InputFieldInfo::new(field.input_field(), input_index); + let input_field_info = InputField::new(field.input_field(), input_index); let input = OneInputMultiOutputField::new(input_field_info, field.target_field); real_fields.push(input); diff --git a/src/pipeline/src/etl/processor/timestamp.rs b/src/pipeline/src/etl/processor/timestamp.rs index 18b6711c1d80..bf90e78f2165 100644 --- a/src/pipeline/src/etl/processor/timestamp.rs +++ b/src/pipeline/src/etl/processor/timestamp.rs @@ -14,22 +14,22 @@ use std::sync::Arc; -use ahash::HashSet; use chrono::{DateTime, NaiveDateTime}; use chrono_tz::Tz; use lazy_static::lazy_static; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ DateFailedToGetLocalTimezoneSnafu, DateFailedToGetTimestampSnafu, DateInvalidFormatSnafu, DateParseSnafu, DateParseTimezoneSnafu, EpochInvalidResolutionSnafu, Error, KeyMustBeStringSnafu, ProcessorFailedToParseStringSnafu, ProcessorMissingFieldSnafu, ProcessorUnsupportedValueSnafu, Result, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, }; use crate::etl::value::time::{ MICROSECOND_RESOLUTION, MICRO_RESOLUTION, MILLISECOND_RESOLUTION, 
MILLI_RESOLUTION, @@ -114,56 +114,10 @@ impl std::ops::Deref for Formats { } } -#[derive(Debug)] -pub struct TimestampProcessorBuilder { - fields: Fields, - formats: Formats, - resolution: Resolution, - ignore_missing: bool, -} - -impl ProcessorBuilder for TimestampProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Timestamp) - } -} - -impl TimestampProcessorBuilder { - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "timestamp", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(TimestampProcessor { - fields: real_fields, - formats: self.formats, - resolution: self.resolution, - ignore_missing: self.ignore_missing, - }) - } -} - /// support string, integer, float, time, epoch #[derive(Debug, Default)] pub struct TimestampProcessor { - fields: Vec, + fields: Fields, formats: Formats, resolution: Resolution, ignore_missing: bool, @@ -289,7 +243,7 @@ fn parse_formats(yaml: &yaml_rust::yaml::Yaml) -> Result, Tz)>> } } -impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -324,7 +278,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for TimestampProcessorBuilder { } } - let processor_builder = TimestampProcessorBuilder { + let processor_builder = TimestampProcessor { fields, formats, resolution, @@ -344,23 +298,23 @@ impl Processor for TimestampProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input().index; + let index = field.input_field(); match val.get(index) { Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } } Some(v) => { let result = self.parse(v)?; - let (_, index) = field.output(); - val[*index] = Value::Timestamp(result); + let output_key = field.target_or_input_field(); + val.insert(output_key.to_string(), Value::Timestamp(result)); } } } @@ -372,18 +326,9 @@ impl Processor for TimestampProcessor { mod tests { use yaml_rust::YamlLoader; - use super::{TimestampProcessor, TimestampProcessorBuilder}; + use super::TimestampProcessor; use crate::etl::value::{Timestamp, Value}; - fn builder_to_native_processor(builder: TimestampProcessorBuilder) -> TimestampProcessor { - TimestampProcessor { - fields: vec![], - formats: builder.formats, - resolution: builder.resolution, - ignore_missing: builder.ignore_missing, - } - } - #[test] fn test_parse_epoch() { let processor_yaml_str = r#"fields: @@ -397,9 +342,7 @@ formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values = [ ( @@ -451,9 +394,7 @@ 
formats: "#; let yaml = &YamlLoader::load_from_str(processor_yaml_str).unwrap()[0]; let timestamp_yaml = yaml.as_hash().unwrap(); - let processor = builder_to_native_processor( - TimestampProcessorBuilder::try_from(timestamp_yaml).unwrap(), - ); + let processor = TimestampProcessor::try_from(timestamp_yaml).unwrap(); let values: Vec<&str> = vec![ "2014-5-17T12:34:56", diff --git a/src/pipeline/src/etl/processor/urlencoding.rs b/src/pipeline/src/etl/processor/urlencoding.rs index ca42aae23677..c14c7d87b11f 100644 --- a/src/pipeline/src/etl/processor/urlencoding.rs +++ b/src/pipeline/src/etl/processor/urlencoding.rs @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use ahash::HashSet; +use std::collections::BTreeMap; + use snafu::{OptionExt, ResultExt}; use urlencoding::{decode, encode}; @@ -20,10 +21,10 @@ use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, UrlEncodingDecodeSnafu, UrlEncodingInvalidMethodSnafu, }; -use crate::etl::field::{Fields, OneInputOneOutputField}; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, METHOD_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, METHOD_NAME, }; use crate::etl::value::Value; @@ -57,55 +58,10 @@ impl std::str::FromStr for Method { } } -#[derive(Debug, Default)] -pub struct UrlEncodingProcessorBuilder { - fields: Fields, - method: Method, - ignore_missing: bool, -} - -impl ProcessorBuilder for UrlEncodingProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.fields - .iter() - .map(|f| f.target_or_input_field()) - .collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys) - .map(ProcessorKind::UrlEncoding) - } -} - -impl UrlEncodingProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input = OneInputOneOutputField::build( - "urlencoding", - intermediate_keys, - field.input_field(), - field.target_or_input_field(), - )?; - real_fields.push(input); - } - Ok(UrlEncodingProcessor { - fields: real_fields, - method: self.method, - ignore_missing: self.ignore_missing, - }) - } -} - /// only support string value #[derive(Debug, Default)] pub struct UrlEncodingProcessor { - fields: Vec, + fields: Fields, method: Method, ignore_missing: bool, } @@ -120,7 +76,7 @@ impl UrlEncodingProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -152,7 +108,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for UrlEncodingProcessorBuilder { _ => {} } } - let processor = UrlEncodingProcessorBuilder { + let processor = UrlEncodingProcessor { fields, method, ignore_missing, @@ -171,20 +127,20 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match 
val.get(index) { Some(Value::String(s)) => { let result = self.process_field(s)?; - let output_index = field.output_index(); - val[output_index] = result; + let output_index = field.target_or_input_field(); + val.insert(output_index.to_string(), result); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -205,6 +161,7 @@ impl crate::etl::processor::Processor for UrlEncodingProcessor { #[cfg(test)] mod tests { + use crate::etl::field::Fields; use crate::etl::processor::urlencoding::UrlEncodingProcessor; use crate::etl::value::Value; @@ -220,7 +177,7 @@ mod tests { } { let processor = UrlEncodingProcessor { - fields: vec![], + fields: Fields::default(), method: super::Method::Encode, ignore_missing: false, }; diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index 4daa3a4d8cf4..7191d272069c 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -17,11 +17,7 @@ pub mod transformer; use std::collections::BTreeMap; -use snafu::OptionExt; - use crate::etl::error::{Error, Result}; -use crate::etl::find_key_index; -use crate::etl::processor::yaml_string; use crate::etl::transform::index::Index; use crate::etl::value::Value; @@ -32,14 +28,15 @@ const TRANSFORM_INDEX: &str = "index"; const TRANSFORM_DEFAULT: &str = "default"; const TRANSFORM_ON_FAILURE: &str = "on_failure"; +use snafu::OptionExt; pub use transformer::greptime::GreptimeTransformer; use super::error::{ KeyMustBeStringSnafu, TransformElementMustBeMapSnafu, TransformOnFailureInvalidValueSnafu, TransformTypeMustBeSetSnafu, }; -use super::field::{Fields, InputFieldInfo, OneInputOneOutputField}; -use super::processor::{yaml_new_field, yaml_new_fields}; +use super::field::Fields; +use super::processor::{yaml_new_field, yaml_new_fields, yaml_string}; pub trait Transformer: std::fmt::Debug + Sized + Send + Sync + 'static { type Output; @@ -104,14 +101,43 @@ impl TryFrom<&Vec> for Transforms { type Error = Error; fn try_from(docs: &Vec) -> Result { - todo!() + let mut transforms = Vec::with_capacity(100); + let mut all_output_keys: Vec = Vec::with_capacity(100); + let mut all_required_keys = Vec::with_capacity(100); + for doc in docs { + let transform_builder: Transform = doc + .as_hash() + .context(TransformElementMustBeMapSnafu)? 
+ .try_into()?; + let mut transform_output_keys = transform_builder + .fields + .iter() + .map(|f| f.target_or_input_field().to_string()) + .collect(); + all_output_keys.append(&mut transform_output_keys); + + let mut transform_required_keys = transform_builder + .fields + .iter() + .map(|f| f.input_field().to_string()) + .collect(); + all_required_keys.append(&mut transform_required_keys); + + transforms.push(transform_builder); + } + + all_required_keys.sort(); + + Ok(Transforms { + transforms: transforms, + }) } } /// only field is required #[derive(Debug, Clone)] pub struct Transform { - pub real_fields: Vec, + pub fields: Fields, pub type_: Value, @@ -125,7 +151,7 @@ pub struct Transform { impl Default for Transform { fn default() -> Self { Transform { - real_fields: Vec::new(), + fields: Fields::default(), type_: Value::Null, default: None, index: None, @@ -143,3 +169,78 @@ impl Transform { &self.type_ } } + +impl TryFrom<&yaml_rust::yaml::Hash> for Transform { + type Error = Error; + + fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { + let mut fields = Fields::default(); + let mut type_ = Value::Null; + let mut default = None; + let mut index = None; + let mut on_failure = None; + + for (k, v) in hash { + let key = k + .as_str() + .with_context(|| KeyMustBeStringSnafu { k: k.clone() })?; + match key { + TRANSFORM_FIELD => { + fields = Fields::one(yaml_new_field(v, TRANSFORM_FIELD)?); + } + + TRANSFORM_FIELDS => { + fields = yaml_new_fields(v, TRANSFORM_FIELDS)?; + } + + TRANSFORM_TYPE => { + let t = yaml_string(v, TRANSFORM_TYPE)?; + type_ = Value::parse_str_type(&t)?; + } + + TRANSFORM_INDEX => { + let index_str = yaml_string(v, TRANSFORM_INDEX)?; + index = Some(index_str.try_into()?); + } + + TRANSFORM_DEFAULT => { + default = Some(Value::try_from(v)?); + } + + TRANSFORM_ON_FAILURE => { + let on_failure_str = yaml_string(v, TRANSFORM_ON_FAILURE)?; + on_failure = Some(on_failure_str.parse()?); + } + + _ => {} + } + } + let mut final_default = None; + + if let Some(default_value) = default { + match (&type_, &default_value) { + (Value::Null, _) => { + return TransformTypeMustBeSetSnafu { + fields: format!("{:?}", fields), + default: default_value.to_string(), + } + .fail(); + } + (_, Value::Null) => {} // if default is not set, then it will be regarded as default null + (_, _) => { + let target = type_.parse_str_value(default_value.to_str_value().as_str())?; + final_default = Some(target); + } + } + } + let builder = Transform { + fields, + type_, + default: final_default, + index, + on_failure, + }; + + Ok(builder) + } +} diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index f7e59904a313..eeff061f755c 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -22,7 +22,7 @@ use api::helper::proto_value_type; use api::v1::column_data_type_extension::TypeExt; use api::v1::value::ValueData; use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, SemanticType}; -use coerce::coerce_columns; +use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; use serde_json::{Map, Number, Value as JsonValue}; @@ -33,6 +33,7 @@ use crate::etl::error::{ TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; +use crate::etl::processor::IntermediateStatus; use crate::etl::transform::index::Index; 
use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; @@ -142,9 +143,9 @@ impl Transformer for GreptimeTransformer { for transform in transforms.iter() { let target_fields_set = transform - .real_fields + .fields .iter() - .map(|f| f.output_name()) + .map(|f| f.target_or_input_field()) .collect::>(); let intersections: Vec<_> = column_names_set.intersection(&target_fields_set).collect(); @@ -157,16 +158,17 @@ impl Transformer for GreptimeTransformer { if let Some(idx) = transform.index { if idx == Index::Time { - match transform.real_fields.len() { + match transform.fields.len() { //Safety unwrap is fine here because we have checked the length of real_fields - 1 => timestamp_columns - .push(transform.real_fields.first().unwrap().input_name()), + 1 => { + timestamp_columns.push(transform.fields.first().unwrap().input_field()) + } _ => { return TransformMultipleTimestampIndexSnafu { columns: transform - .real_fields + .fields .iter() - .map(|x| x.input_name()) + .map(|x| x.input_field()) .join(", "), } .fail(); @@ -195,31 +197,31 @@ impl Transformer for GreptimeTransformer { } } - fn transform_mut(&self, val: &mut BTreeMap) -> Result { - // let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; - // for transform in self.transforms.iter() { - // for field in transform.real_fields.iter() { - // let index = field.input_index(); - // let output_index = field.output_index(); - // match val.get(index) { - // Some(v) => { - // let value_data = coerce_value(v, transform)?; - // // every transform fields has only one output field - // values[output_index] = GreptimeValue { value_data }; - // } - // None => { - // let default = transform.get_default(); - // let value_data = match default { - // Some(default) => coerce_value(default, transform)?, - // None => None, - // }; - // values[output_index] = GreptimeValue { value_data }; - // } - // } - // } - // } - // Ok(Row { values }) - todo!() + fn transform_mut(&self, val: &mut IntermediateStatus) -> Result { + let mut values = vec![GreptimeValue { value_data: None }; self.schema.len()]; + let mut output_index = 0; + for transform in self.transforms.iter() { + for field in transform.fields.iter() { + let index = field.input_field(); + match val.get(index) { + Some(v) => { + let value_data = coerce_value(v, transform)?; + // every transform fields has only one output field + values[output_index] = GreptimeValue { value_data }; + } + None => { + let default = transform.get_default(); + let value_data = match default { + Some(default) => coerce_value(default, transform)?, + None => None, + }; + values[output_index] = GreptimeValue { value_data }; + } + } + output_index += 1; + } + } + Ok(Row { values }) } fn transforms(&self) -> &Transforms { @@ -643,6 +645,7 @@ mod tests { use crate::etl::transform::transformer::greptime::{ flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, }; + use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; use crate::{identity_pipeline, Pipeline}; #[test] @@ -668,7 +671,7 @@ mod tests { "gaga": "gaga" }), ]; - let array = Pipeline::prepare(array).unwrap(); + let array = json_array_to_intermediate_state(array).unwrap(); let rows = identity_pipeline(array, None, &GreptimePipelineParams::default()); assert!(rows.is_err()); assert_eq!( @@ -698,7 +701,7 @@ mod tests { }), ]; let rows = identity_pipeline( - Pipeline::prepare(array).unwrap(), + json_array_to_intermediate_state(array).unwrap(), None, 
            &GreptimePipelineParams::default(),
         );
@@ -730,7 +733,7 @@
             }),
         ];
         let rows = identity_pipeline(
-            Pipeline::prepare(array).unwrap(),
+            json_array_to_intermediate_state(array).unwrap(),
             None,
             &GreptimePipelineParams::default(),
         );
@@ -764,7 +767,7 @@
         ];
         let tag_column_names = ["name".to_string(), "address".to_string()];
         let rows = identity_pipeline_inner(
-            Pipeline::prepare(array).unwrap(),
+            json_array_to_intermediate_state(array).unwrap(),
             Some(tag_column_names.iter()),
             &GreptimePipelineParams::default(),
         );
diff --git a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
index 5f448b386cbd..da345b3bdeb3 100644
--- a/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
+++ b/src/pipeline/src/etl/transform/transformer/greptime/coerce.rs
@@ -71,12 +71,11 @@ impl TryFrom for ValueData {
     }
 }
 
-// TODO(yuanbohan): add fulltext support in datatype_extension
 pub(crate) fn coerce_columns(transform: &Transform) -> Result> {
     let mut columns = Vec::new();
 
-    for field in transform.real_fields.iter() {
-        let column_name = field.output_name().to_string();
+    for field in transform.fields.iter() {
+        let column_name = field.target_or_input_field().to_string();
 
         let (datatype, datatype_extension) = coerce_type(transform)?;
 
@@ -477,12 +476,14 @@ fn coerce_json_value(v: &Value, transform: &Transform) -> Result
Date: Fri, 24 Jan 2025 22:39:47 +0800
Subject: [PATCH 20/32] refactor: use updated prepare api

---
 src/pipeline/src/lib.rs       |  5 +++--
 src/servers/src/http/event.rs | 14 +++++++++-----
 src/servers/src/otlp/logs.rs  | 15 +++++++++------
 src/servers/src/pipeline.rs   |  7 +++----
 4 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs
index 529908145f45..c0003d3f4ea1 100644
--- a/src/pipeline/src/lib.rs
+++ b/src/pipeline/src/lib.rs
@@ -26,8 +26,9 @@ pub use etl::transform::transformer::identity_pipeline;
 pub use etl::transform::{GreptimeTransformer, Transformer};
 pub use etl::value::{Array, Map, Value};
 pub use etl::{
-    error as etl_error, parse, Content, DispatchedTo, Pipeline, PipelineDefinition,
-    PipelineExecOutput, PipelineWay, SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME,
+    error as etl_error, json_array_to_intermediate_state, parse, Content, DispatchedTo, Pipeline,
+    PipelineDefinition, PipelineExecOutput, PipelineWay, SelectInfo,
+    GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME,
 };
 pub use manager::{
     error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef,
diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs
index e8f0d749f873..fc4ca58b7543 100644
--- a/src/servers/src/http/event.rs
+++ b/src/servers/src/http/event.rs
@@ -30,10 +30,11 @@ use common_telemetry::{error, warn};
 use datatypes::value::column_data_to_json;
 use headers::ContentType;
 use lazy_static::lazy_static;
+use pipeline::error::PipelineTransformSnafu;
 use pipeline::util::to_pipeline_version;
 use pipeline::{
-    GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineExecInput,
-    PipelineVersion, GREPTIME_PIPELINE_PARAMS_HEADER,
+    GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion,
+    GREPTIME_PIPELINE_PARAMS_HEADER,
 };
 use serde::{Deserialize, Serialize};
 use serde_json::{json, Deserializer, Map, Value};
@@ -275,14 +276,15 @@ async fn dryrun_pipeline_inner(
     pipeline_handler: PipelineHandlerRef,
     query_ctx: &QueryContextRef,
 ) -> Result {
-    let db = query_ctx.get_db_string();
     let 
params = GreptimePipelineParams::default(); let results = run_pipeline( &pipeline_handler, PipelineDefinition::Resolved(pipeline), ¶ms, - Pipeline::prepare(value)?, + pipeline::json_array_to_intermediate_state(value) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?, "dry_run".to_owned(), query_ctx, true, @@ -603,7 +605,9 @@ pub(crate) async fn ingest_logs_inner( &state, PipelineDefinition::from_name(&pipeline_name, version), &pipeline_params, - Pipeline::prepare(request.values), + pipeline::json_array_to_intermediate_state(request.values) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?, request.table, &query_ctx, true, diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index 24232fcef01f..54f29c291621 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,14 +25,17 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::{GreptimePipelineParams, PipelineExecInput, PipelineWay, SchemaInfo, SelectInfo}; +use pipeline::error::PipelineTransformSnafu; +use pipeline::{GreptimePipelineParams, PipelineWay, SchemaInfo, SelectInfo}; use serde_json::{Map, Value}; use session::context::QueryContextRef; -use snafu::ensure; +use snafu::{ensure, ResultExt}; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; -use crate::error::{IncompatibleSchemaSnafu, Result, UnsupportedJsonDataTypeForTagSnafu}; +use crate::error::{ + IncompatibleSchemaSnafu, PipelineSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, +}; use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; @@ -70,9 +73,9 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); - let array = Pipeline::prepare(data)?; - - let db_string = query_ctx.get_db_string(); + let array = pipeline::json_array_to_intermediate_state(data) + .context(PipelineTransformSnafu) + .context(PipelineSnafu)?; let inserts = run_pipeline( &pipeline_handler, diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index bee45d476404..bf7e949db37b 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -14,13 +14,12 @@ use std::collections::BTreeMap; use std::sync::Arc; -use std::time::Instant; -use api::v1::{Row, RowInsertRequest, Rows}; +use api::v1::{RowInsertRequest, Rows}; use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineDefinition, - PipelineExecInput, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, + PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; use session::context::QueryContextRef; use snafu::ResultExt; @@ -83,7 +82,7 @@ pub(crate) async fn run_pipeline( let transform_timer = std::time::Instant::now(); - let mut transformed = Vec::with_capacity(values.len()); + let mut transformed = Vec::with_capacity(array.len()); let mut dispatched: BTreeMap>> = BTreeMap::new(); From daa9ec163a1411980e516512334252f19ca9aba3 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sat, 25 Jan 2025 22:10:53 +0800 Subject: [PATCH 21/32] refactor: improve error and header name --- src/pipeline/src/etl/transform/transformer/greptime.rs | 4 
+--- src/pipeline/src/lib.rs | 4 +--- src/servers/src/error.rs | 9 +++++++++ src/servers/src/http/event.rs | 6 ++---- src/servers/src/http/extractor.rs | 6 +++--- src/servers/src/http/header.rs | 2 ++ src/servers/src/http/otlp.rs | 2 +- src/servers/src/otlp/logs.rs | 8 +++----- src/servers/src/pipeline.rs | 7 ++----- 9 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index eeff061f755c..1d17472b9737 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -38,9 +38,6 @@ use crate::etl::transform::index::Index; use crate::etl::transform::{Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; -/// The header key that contains the pipeline params. -pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; - const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; const DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING: usize = 10; @@ -575,6 +572,7 @@ pub fn identity_pipeline( table: Option>, params: &GreptimePipelineParams, ) -> Result { + // TODO: flatten match table { Some(table) => { let table_info = table.table_info(); diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index c0003d3f4ea1..b4e3405cf154 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -19,9 +19,7 @@ mod metrics; pub use etl::error::Result; pub use etl::processor::Processor; -pub use etl::transform::transformer::greptime::{ - GreptimePipelineParams, SchemaInfo, GREPTIME_PIPELINE_PARAMS_HEADER, -}; +pub use etl::transform::transformer::greptime::{GreptimePipelineParams, SchemaInfo}; pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; diff --git a/src/servers/src/error.rs b/src/servers/src/error.rs index 375151bfec1e..a6ab75a3bcc7 100644 --- a/src/servers/src/error.rs +++ b/src/servers/src/error.rs @@ -157,6 +157,14 @@ pub enum Error { location: Location, }, + #[snafu(display("Pipeline transform error"))] + PipelineTransform { + #[snafu(source)] + source: pipeline::etl_error::Error, + #[snafu(implicit)] + location: Location, + }, + #[snafu(display("Not supported: {}", feat))] NotSupported { feat: String }, @@ -619,6 +627,7 @@ impl ErrorExt for Error { | CheckDatabaseValidity { source, .. } => source.status_code(), Pipeline { source, .. } => source.status_code(), + PipelineTransform { source, .. } => source.status_code(), NotSupported { .. } | InvalidParameter { .. 
} diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index fc4ca58b7543..978891078cce 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -32,10 +32,7 @@ use headers::ContentType; use lazy_static::lazy_static; use pipeline::error::PipelineTransformSnafu; use pipeline::util::to_pipeline_version; -use pipeline::{ - GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion, - GREPTIME_PIPELINE_PARAMS_HEADER, -}; +use pipeline::{GreptimePipelineParams, GreptimeTransformer, PipelineDefinition, PipelineVersion}; use serde::{Deserialize, Serialize}; use serde_json::{json, Deserializer, Map, Value}; use session::context::{Channel, QueryContext, QueryContextRef}; @@ -45,6 +42,7 @@ use crate::error::{ status_code_to_http_status, Error, InvalidParameterSnafu, ParseJsonSnafu, PipelineSnafu, Result, UnsupportedContentTypeSnafu, }; +use crate::http::header::constants::GREPTIME_PIPELINE_PARAMS_HEADER; use crate::http::header::CONTENT_TYPE_PROTOBUF_STR; use crate::http::result::greptime_manage_resp::GreptimedbManageResponse; use crate::http::result::greptime_result_v1::GreptimedbV1Response; diff --git a/src/servers/src/http/extractor.rs b/src/servers/src/http/extractor.rs index ee662f36f615..ae578f21d302 100644 --- a/src/servers/src/http/extractor.rs +++ b/src/servers/src/http/extractor.rs @@ -23,7 +23,7 @@ use pipeline::{GreptimePipelineParams, SelectInfo}; use crate::http::header::constants::{ GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME, GREPTIME_LOG_PIPELINE_NAME_HEADER_NAME, GREPTIME_LOG_PIPELINE_VERSION_HEADER_NAME, GREPTIME_LOG_TABLE_NAME_HEADER_NAME, - GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, + GREPTIME_PIPELINE_PARAMS_HEADER, GREPTIME_TRACE_TABLE_NAME_HEADER_NAME, }; /// Axum extractor for optional target log table name from HTTP header @@ -91,7 +91,7 @@ where pub struct PipelineInfo { pub pipeline_name: Option, pub pipeline_version: Option, - pub pipeline_params: Option, + pub pipeline_params: GreptimePipelineParams, } impl FromRequestParts for PipelineInfo @@ -112,7 +112,7 @@ where Ok(PipelineInfo { pipeline_name, pipeline_version, - pipeline_params: pipeline_parameters.map(|v| GreptimePipelineParams::from_params(v)), + pipeline_params: GreptimePipelineParams::from_params(pipeline_parameters.as_deref()), }) } } diff --git a/src/servers/src/http/header.rs b/src/servers/src/http/header.rs index 51a07ca01f0c..e14ce6172958 100644 --- a/src/servers/src/http/header.rs +++ b/src/servers/src/http/header.rs @@ -50,6 +50,8 @@ pub mod constants { pub const GREPTIME_LOG_TABLE_NAME_HEADER_NAME: &str = "x-greptime-log-table-name"; pub const GREPTIME_LOG_EXTRACT_KEYS_HEADER_NAME: &str = "x-greptime-log-extract-keys"; pub const GREPTIME_TRACE_TABLE_NAME_HEADER_NAME: &str = "x-greptime-trace-table-name"; + /// The header key that contains the pipeline params. 
+ pub const GREPTIME_PIPELINE_PARAMS_HEADER: &str = "x-greptime-pipeline-params"; } pub static GREPTIME_DB_HEADER_FORMAT: HeaderName = diff --git a/src/servers/src/http/otlp.rs b/src/servers/src/http/otlp.rs index a7efa4b7d32b..d8579fc960b3 100644 --- a/src/servers/src/http/otlp.rs +++ b/src/servers/src/http/otlp.rs @@ -126,7 +126,7 @@ pub async fn logs( PipelineWay::OtlpLogDirect(Box::new(select_info)) }; - let pipeline_params = pipeline_info.pipeline_params.unwrap_or_default(); + let pipeline_params = pipeline_info.pipeline_params; // here we use nightly feature `trait_upcasting` to convert handler to // pipeline_handler diff --git a/src/servers/src/otlp/logs.rs b/src/servers/src/otlp/logs.rs index 54f29c291621..5936bd40ad60 100644 --- a/src/servers/src/otlp/logs.rs +++ b/src/servers/src/otlp/logs.rs @@ -25,7 +25,6 @@ use jsonb::{Number as JsonbNumber, Value as JsonbValue}; use opentelemetry_proto::tonic::collector::logs::v1::ExportLogsServiceRequest; use opentelemetry_proto::tonic::common::v1::{any_value, AnyValue, InstrumentationScope, KeyValue}; use opentelemetry_proto::tonic::logs::v1::{LogRecord, ResourceLogs, ScopeLogs}; -use pipeline::error::PipelineTransformSnafu; use pipeline::{GreptimePipelineParams, PipelineWay, SchemaInfo, SelectInfo}; use serde_json::{Map, Value}; use session::context::QueryContextRef; @@ -34,7 +33,7 @@ use snafu::{ensure, ResultExt}; use super::trace::attributes::OtlpAnyValue; use super::utils::{bytes_to_hex_string, key_value_to_jsonb}; use crate::error::{ - IncompatibleSchemaSnafu, PipelineSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, + IncompatibleSchemaSnafu, PipelineTransformSnafu, Result, UnsupportedJsonDataTypeForTagSnafu, }; use crate::pipeline::run_pipeline; use crate::query_handler::PipelineHandlerRef; @@ -73,9 +72,8 @@ pub async fn to_grpc_insert_requests( } PipelineWay::Pipeline(pipeline_def) => { let data = parse_export_logs_service_request(request); - let array = pipeline::json_array_to_intermediate_state(data) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; + let array = + pipeline::json_array_to_intermediate_state(data).context(PipelineTransformSnafu)?; let inserts = run_pipeline( &pipeline_handler, diff --git a/src/servers/src/pipeline.rs b/src/servers/src/pipeline.rs index bf7e949db37b..4d16cb6c351f 100644 --- a/src/servers/src/pipeline.rs +++ b/src/servers/src/pipeline.rs @@ -16,7 +16,6 @@ use std::collections::BTreeMap; use std::sync::Arc; use api::v1::{RowInsertRequest, Rows}; -use pipeline::error::PipelineTransformSnafu; use pipeline::{ DispatchedTo, GreptimePipelineParams, GreptimeTransformer, Pipeline, PipelineDefinition, PipelineExecOutput, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, @@ -24,7 +23,7 @@ use pipeline::{ use session::context::QueryContextRef; use snafu::ResultExt; -use crate::error::{CatalogSnafu, PipelineSnafu, Result}; +use crate::error::{CatalogSnafu, PipelineTransformSnafu, Result}; use crate::metrics::{ METRIC_FAILURE_VALUE, METRIC_HTTP_LOGS_TRANSFORM_ELAPSED, METRIC_SUCCESS_VALUE, }; @@ -76,7 +75,6 @@ pub(crate) async fn run_pipeline( }] }) .context(PipelineTransformSnafu) - .context(PipelineSnafu) } else { let pipeline = get_pipeline(pipeline_definition, state, query_ctx).await?; @@ -94,8 +92,7 @@ pub(crate) async fn run_pipeline( .with_label_values(&[db.as_str(), METRIC_FAILURE_VALUE]) .observe(transform_timer.elapsed().as_secs_f64()); }) - .context(PipelineTransformSnafu) - .context(PipelineSnafu)?; + .context(PipelineTransformSnafu)?; match r { PipelineExecOutput::Transformed(row) => { From 
df5c35de3c37f67e40e642705209bee720b78aef Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 11:53:26 +0800 Subject: [PATCH 22/32] feat: port flatten to new api --- Cargo.lock | 4 +- .../src/etl/transform/transformer/greptime.rs | 53 ++++++++++--------- src/pipeline/src/etl/value/map.rs | 6 +++ 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index bb37b729e684..c7dbc90df646 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -13483,7 +13483,7 @@ checksum = "2bbd5b46c938e506ecbce286b6628a02171d56153ba733b6c741fc627ec9579b" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] @@ -13494,7 +13494,7 @@ checksum = "053c4c462dc91d3b1504c6fe5a726dd15e216ba718e84a0e46a88fbe5ded3515" dependencies = [ "proc-macro2", "quote", - "syn 2.0.90", + "syn 2.0.96", ] [[package]] diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 1d17472b9737..67015e4d5252 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -572,14 +572,22 @@ pub fn identity_pipeline( table: Option>, params: &GreptimePipelineParams, ) -> Result { - // TODO: flatten + let input = if params.flatten_json_object() { + array + .into_iter() + .map(|item| flatten_object(item, DEFAULT_MAX_NESTED_LEVELS_FOR_JSON_FLATTENING)) + .collect::>>>()? + } else { + array + }; + match table { Some(table) => { let table_info = table.table_info(); let tag_column_names = table_info.meta.row_key_column_names(); - identity_pipeline_inner(array, Some(tag_column_names), params) + identity_pipeline_inner(input, Some(tag_column_names), params) } - None => identity_pipeline_inner(array, None::>, params), + None => identity_pipeline_inner(input, None::>, params), } } @@ -587,24 +595,24 @@ pub fn identity_pipeline( /// /// The `max_nested_levels` parameter is used to limit the nested levels of the JSON object. /// The error will be returned if the nested levels is greater than the `max_nested_levels`. -pub fn flatten_json_object( - object: Map, +pub fn flatten_object( + object: BTreeMap, max_nested_levels: usize, -) -> Result> { - let mut flattened = Map::new(); +) -> Result> { + let mut flattened = BTreeMap::new(); if !object.is_empty() { // it will use recursion to flatten the object. 
- do_flatten_json_object(&mut flattened, None, object, 1, max_nested_levels)?; + do_flatten_object(&mut flattened, None, object, 1, max_nested_levels)?; } Ok(flattened) } -fn do_flatten_json_object( - dest: &mut Map, +fn do_flatten_object( + dest: &mut BTreeMap, base: Option<&str>, - object: Map, + object: BTreeMap, current_level: usize, max_nested_levels: usize, ) -> Result<()> { @@ -617,11 +625,11 @@ fn do_flatten_json_object( let new_key = base.map_or_else(|| key.clone(), |base_key| format!("{base_key}.{key}")); match value { - JsonValue::Object(object) => { - do_flatten_json_object( + Value::Map(object) => { + do_flatten_object( dest, Some(&new_key), - object, + object.values, current_level + 1, max_nested_levels, )?; @@ -640,9 +648,7 @@ fn do_flatten_json_object( mod tests { use api::v1::SemanticType; - use crate::etl::transform::transformer::greptime::{ - flatten_json_object, identity_pipeline_inner, GreptimePipelineParams, - }; + use super::*; use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; use crate::{identity_pipeline, Pipeline}; @@ -864,14 +870,11 @@ mod tests { ]; for (input, max_depth, expected) in test_cases { - let flattened_object = - flatten_json_object(input.as_object().unwrap().clone(), max_depth); - match flattened_object { - Ok(flattened_object) => { - assert_eq!(&flattened_object, expected.unwrap().as_object().unwrap()) - } - Err(_) => assert_eq!(None, expected), - } + let input = json_to_intermediate_state(input).unwrap(); + let expected = expected.map(|e| json_to_intermediate_state(e).unwrap()); + + let flattened_object = flatten_object(input, max_depth).ok(); + assert_eq!(flattened_object, expected); } } diff --git a/src/pipeline/src/etl/value/map.rs b/src/pipeline/src/etl/value/map.rs index 004a617b0f9c..9e730ef532d8 100644 --- a/src/pipeline/src/etl/value/map.rs +++ b/src/pipeline/src/etl/value/map.rs @@ -49,6 +49,12 @@ impl From> for Map { } } +impl From> for Map { + fn from(values: BTreeMap) -> Self { + Self { values } + } +} + impl std::ops::Deref for Map { type Target = BTreeMap; From 8e1b6e920c1d98f9f975575925dfe64329d0fbfb Mon Sep 17 00:00:00 2001 From: paomian Date: Sun, 26 Jan 2025 11:56:07 +0800 Subject: [PATCH 23/32] chore: update pipeline api --- src/pipeline/benches/processor.rs | 6 ++---- src/pipeline/tests/common.rs | 12 +++++------- src/pipeline/tests/dissect.rs | 4 ++-- src/pipeline/tests/pipeline.rs | 29 +++++++++-------------------- 4 files changed, 18 insertions(+), 33 deletions(-) diff --git a/src/pipeline/benches/processor.rs b/src/pipeline/benches/processor.rs index 01d1a293d66e..ba7240b9d527 100644 --- a/src/pipeline/benches/processor.rs +++ b/src/pipeline/benches/processor.rs @@ -13,24 +13,22 @@ // limitations under the License. use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline, Result}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline, Result}; use serde_json::{Deserializer, Value}; fn processor_mut( pipeline: &Pipeline, input_values: Vec, ) -> Result> { - let mut payload = pipeline.init_intermediate_state(); let mut result = Vec::with_capacity(input_values.len()); for v in input_values { - pipeline.prepare(v, &mut payload)?; + let mut payload = json_to_intermediate_state(v).unwrap(); let r = pipeline .exec_mut(&mut payload)? 
.into_transformed() .expect("expect transformed result "); result.push(r); - pipeline.reset_intermediate_state(&mut payload); } Ok(result) diff --git a/src/pipeline/tests/common.rs b/src/pipeline/tests/common.rs index 781c3a30fe0f..89bebbf85bb9 100644 --- a/src/pipeline/tests/common.rs +++ b/src/pipeline/tests/common.rs @@ -13,7 +13,7 @@ // limitations under the License. use greptime_proto::v1::{ColumnDataType, ColumnSchema, Rows, SemanticType}; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; /// test util function to parse and execute pipeline pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { @@ -22,7 +22,6 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); let schema = pipeline.schemas().clone(); @@ -31,20 +30,19 @@ pub fn parse_and_exec(input_str: &str, pipeline_yaml: &str) -> Rows { match input_value { serde_json::Value::Array(array) => { for value in array { - pipeline.prepare(value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(value).unwrap(); let row = pipeline - .exec_mut(&mut result) + .exec_mut(&mut intermediate_status) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); rows.push(row); - pipeline.reset_intermediate_state(&mut result); } } serde_json::Value::Object(_) => { - pipeline.prepare(input_value, &mut result).unwrap(); + let mut intermediate_status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline - .exec_mut(&mut result) + .exec_mut(&mut intermediate_status) .expect("failed to exec pipeline") .into_transformed() .expect("expect transformed result "); diff --git a/src/pipeline/tests/dissect.rs b/src/pipeline/tests/dissect.rs index 56386d0e860a..a93112d68945 100644 --- a/src/pipeline/tests/dissect.rs +++ b/src/pipeline/tests/dissect.rs @@ -16,6 +16,7 @@ mod common; use greptime_proto::v1::value::ValueData::StringValue; use greptime_proto::v1::{ColumnDataType, SemanticType}; +use pipeline::json_to_intermediate_state; fn make_string_column_schema(name: String) -> greptime_proto::v1::ColumnSchema { common::make_column_schema(name, ColumnDataType::String, SemanticType::Field) @@ -273,9 +274,8 @@ transform: let yaml_content = pipeline::Content::Yaml(pipeline_yaml); let pipeline: pipeline::Pipeline = pipeline::parse(&yaml_content).expect("failed to parse pipeline"); - let mut result = pipeline.init_intermediate_state(); + let mut result = json_to_intermediate_state(input_value).unwrap(); - pipeline.prepare(input_value, &mut result).unwrap(); let row = pipeline.exec_mut(&mut result); assert!(row.is_err()); diff --git a/src/pipeline/tests/pipeline.rs b/src/pipeline/tests/pipeline.rs index f0fa3992e4bf..c34187c80c91 100644 --- a/src/pipeline/tests/pipeline.rs +++ b/src/pipeline/tests/pipeline.rs @@ -20,7 +20,7 @@ use greptime_proto::v1::value::ValueData::{ U32Value, U64Value, U8Value, }; use greptime_proto::v1::Value as GreptimeValue; -use pipeline::{parse, Content, GreptimeTransformer, Pipeline}; +use pipeline::{json_to_intermediate_state, parse, Content, GreptimeTransformer, Pipeline}; #[test] fn test_complex_data() { @@ -420,10 +420,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = 
parse(&yaml_content).expect("failed to parse pipeline"); - let mut stats = pipeline.init_intermediate_state(); - pipeline - .prepare(input_value, &mut stats) - .expect("failed to prepare pipeline"); + let mut stats = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut stats) @@ -492,8 +489,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -601,9 +597,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -668,8 +662,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -708,8 +701,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) @@ -768,8 +760,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() @@ -841,8 +832,7 @@ transform: let yaml_content = Content::Yaml(pipeline_yaml); let pipeline: Pipeline = parse(&yaml_content).unwrap(); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value1, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value1).unwrap(); let dispatched_to = pipeline .exec_mut(&mut status) .unwrap() @@ -851,8 +841,7 @@ transform: assert_eq!(dispatched_to.table_part, "http"); assert_eq!(dispatched_to.pipeline.unwrap(), "access_log_pipeline"); - let mut status = pipeline.init_intermediate_state(); - pipeline.prepare(input_value2, &mut status).unwrap(); + let mut status = json_to_intermediate_state(input_value2).unwrap(); let row = pipeline .exec_mut(&mut status) .unwrap() From c803209e4e5028e11dc29a85bfb42cb43443148e Mon Sep 17 00:00:00 2001 From: paomian Date: Sun, 26 Jan 2025 15:22:41 +0800 Subject: [PATCH 24/32] chore: fix transform and some pipeline test --- src/pipeline/src/etl.rs | 458 +++++++++--------- .../src/etl/transform/transformer/greptime.rs | 53 +- src/pipeline/src/lib.rs | 6 +- 3 files changed, 244 insertions(+), 273 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index bca33a607f6e..f302e655a816 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -272,242 +272,228 @@ mod tests { use super::*; use crate::etl::transform::GreptimeTransformer; - // #[test] - // fn 
test_pipeline_prepare() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' - // processors: - // - csv: - // field: my_field - // target_fields: field1, field2 - // transform: - // - field: field1 - // type: uint32 - // - field: field2 - // type: uint32 - // "#; - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut payload).unwrap(); - // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - // assert_eq!( - // payload, - // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - // ); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - - // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - // match &result.values[2].value_data { - // Some(ValueData::TimestampNanosecondValue(v)) => { - // assert_ne!(*v, 0); - // } - // _ => panic!("expect null value"), - // } - // } - - // #[test] - // fn test_dissect_pipeline() { - // let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); - // let pipeline_str = r#"processors: - // - dissect: - // fields: - // - message - // patterns: - // - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - // - timestamp: - // fields: - // - ts - // formats: - // - "%d/%b/%Y:%H:%M:%S %z" - - // transform: - // - fields: - // - ip - // - username - // - method - // - path - // - proto - // type: string - // - fields: - // - status - // type: uint16 - // - fields: - // - bytes - // type: uint32 - // - field: ts - // type: timestamp, ns - // index: time"#; - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline - // .prepare(serde_json::Value::String(message), &mut payload) - // .unwrap(); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - // let sechema = pipeline.schemas(); - - // assert_eq!(sechema.len(), result.values.len()); - // let test = vec![ - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("129.37.245.88".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("meln1ks".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("PATCH".into())), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue( - // "/observability/metrics/production".into(), - // )), - // ), - // ( - // ColumnDataType::String as i32, - // Some(ValueData::StringValue("HTTP/1.0".into())), - // ), - // ( - // ColumnDataType::Uint16 as i32, - // Some(ValueData::U16Value(501)), - // ), - // ( - // ColumnDataType::Uint32 as i32, - // Some(ValueData::U32Value(33085)), - // ), - // ( - // ColumnDataType::TimestampNanosecond as i32, - // Some(ValueData::TimestampNanosecondValue(1722493367000000000)), - // ), - // ]; - // for i in 0..sechema.len() { - // let schema = &sechema[i]; - // let value = &result.values[i]; - // assert_eq!(schema.datatype, test[i].0); - // 
assert_eq!(value.value_data, test[i].1); - // } - // } - - // #[test] - // fn test_csv_pipeline() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#" - // description: Pipeline for Apache Tomcat - // processors: - // - csv: - // field: my_field - // target_fields: field1, field2 - // transform: - // - field: field1 - // type: uint32 - // - field: field2 - // type: uint32 - // "#; - - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let mut payload = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut payload).unwrap(); - // assert_eq!(&["my_field"].to_vec(), pipeline.required_keys()); - // assert_eq!( - // payload, - // vec![Value::String("1,2".to_string()), Value::Null, Value::Null] - // ); - // let result = pipeline - // .exec_mut(&mut payload) - // .unwrap() - // .into_transformed() - // .unwrap(); - // assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); - // assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); - // match &result.values[2].value_data { - // Some(ValueData::TimestampNanosecondValue(v)) => { - // assert_ne!(*v, 0); - // } - // _ => panic!("expect null value"), - // } - // } - - // #[test] - // fn test_date_pipeline() { - // let input_value_str = r#" - // { - // "my_field": "1,2", - // "foo": "bar", - // "test_time": "2014-5-17T04:34:56+00:00" - // } - // "#; - // let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - - // let pipeline_yaml = r#" - // --- - // description: Pipeline for Apache Tomcat - - // processors: - // - timestamp: - // field: test_time - - // transform: - // - field: test_time - // type: timestamp, ns - // index: time - // "#; - - // let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); - // let schema = pipeline.schemas().clone(); - // let mut result = pipeline.init_intermediate_state(); - // pipeline.prepare(input_value, &mut result).unwrap(); - // let row = pipeline - // .exec_mut(&mut result) - // .unwrap() - // .into_transformed() - // .unwrap(); - // let output = Rows { - // schema, - // rows: vec![row], - // }; - // let schemas = output.schema; - - // assert_eq!(schemas.len(), 1); - // let schema = schemas[0].clone(); - // assert_eq!("test_time", schema.column_name); - // assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); - // assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); - - // let row = output.rows[0].clone(); - // assert_eq!(1, row.values.len()); - // let value_data = row.values[0].clone().value_data; - // assert_eq!( - // Some(v1::value::ValueData::TimestampNanosecondValue( - // 1400301296000000000 - // )), - // value_data - // ); - // } + #[test] + fn test_pipeline_prepare() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' + processors: + - csv: + field: my_field + target_fields: field1, field2 + transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + 
.unwrap(); + + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + assert_ne!(*v, 0); + } + _ => panic!("expect null value"), + } + } + + #[test] + fn test_dissect_pipeline() { + let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); + let pipeline_str = r#"processors: + - dissect: + fields: + - message + patterns: + - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + - timestamp: + fields: + - ts + formats: + - "%d/%b/%Y:%H:%M:%S %z" + + transform: + - fields: + - ip + - username + - method + - path + - proto + type: string + - fields: + - status + type: uint16 + - fields: + - bytes + type: uint32 + - field: ts + type: timestamp, ns + index: time"#; + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); + let mut payload = BTreeMap::new(); + payload.insert("message".to_string(), Value::String(message)); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); + let sechema = pipeline.schemas(); + + assert_eq!(sechema.len(), result.values.len()); + let test = vec![ + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("129.37.245.88".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("meln1ks".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("PATCH".into())), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue( + "/observability/metrics/production".into(), + )), + ), + ( + ColumnDataType::String as i32, + Some(ValueData::StringValue("HTTP/1.0".into())), + ), + ( + ColumnDataType::Uint16 as i32, + Some(ValueData::U16Value(501)), + ), + ( + ColumnDataType::Uint32 as i32, + Some(ValueData::U32Value(33085)), + ), + ( + ColumnDataType::TimestampNanosecond as i32, + Some(ValueData::TimestampNanosecondValue(1722493367000000000)), + ), + ]; + for i in 0..sechema.len() { + let schema = &sechema[i]; + let value = &result.values[i]; + assert_eq!(schema.datatype, test[i].0); + assert_eq!(value.value_data, test[i].1); + } + } + + #[test] + fn test_csv_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let pipeline_yaml = r#" + description: Pipeline for Apache Tomcat + processors: + - csv: + field: my_field + target_fields: field1, field2 + transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 + "#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let mut payload = json_to_intermediate_state(input_value).unwrap(); + let result = pipeline + .exec_mut(&mut payload) + .unwrap() + .into_transformed() + .unwrap(); + assert_eq!(result.values[0].value_data, Some(ValueData::U32Value(1))); + assert_eq!(result.values[1].value_data, Some(ValueData::U32Value(2))); + match &result.values[2].value_data { + Some(ValueData::TimestampNanosecondValue(v)) => { + assert_ne!(*v, 0); + } + _ => panic!("expect null value"), + } + } + + #[test] + fn test_date_pipeline() { + let input_value_str = r#" + { + "my_field": "1,2", + "foo": "bar", + "test_time": "2014-5-17T04:34:56+00:00" + } + "#; + let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); + + let 
pipeline_yaml = r#" + --- + description: Pipeline for Apache Tomcat + + processors: + - timestamp: + field: test_time + + transform: + - field: test_time + type: timestamp, ns + index: time + "#; + + let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); + let schema = pipeline.schemas().clone(); + let mut result = json_to_intermediate_state(input_value).unwrap(); + + let row = pipeline + .exec_mut(&mut result) + .unwrap() + .into_transformed() + .unwrap(); + let output = Rows { + schema, + rows: vec![row], + }; + let schemas = output.schema; + + assert_eq!(schemas.len(), 1); + let schema = schemas[0].clone(); + assert_eq!("test_time", schema.column_name); + assert_eq!(ColumnDataType::TimestampNanosecond as i32, schema.datatype); + assert_eq!(SemanticType::Timestamp as i32, schema.semantic_type); + + let row = output.rows[0].clone(); + assert_eq!(1, row.values.len()); + let value_data = row.values[0].clone().value_data; + assert_eq!( + Some(v1::value::ValueData::TimestampNanosecondValue( + 1400301296000000000 + )), + value_data + ); + } #[test] fn test_dispatcher() { diff --git a/src/pipeline/src/etl/transform/transformer/greptime.rs b/src/pipeline/src/etl/transform/transformer/greptime.rs index 67015e4d5252..749806261a02 100644 --- a/src/pipeline/src/etl/transform/transformer/greptime.rs +++ b/src/pipeline/src/etl/transform/transformer/greptime.rs @@ -25,7 +25,7 @@ use api::v1::{ColumnDataType, ColumnDataTypeExtension, JsonTypeExtension, Semant use coerce::{coerce_columns, coerce_value}; use greptime_proto::v1::{ColumnSchema, Row, Rows, Value as GreptimeValue}; use itertools::Itertools; -use serde_json::{Map, Number, Value as JsonValue}; +use serde_json::Number; use crate::etl::error::{ IdentifyPipelineColumnTypeMismatchSnafu, ReachedMaxNestedLevelsSnafu, Result, @@ -33,9 +33,10 @@ use crate::etl::error::{ TransformMultipleTimestampIndexSnafu, TransformTimestampIndexCountSnafu, UnsupportedNumberTypeSnafu, }; +use crate::etl::field::{Field, Fields}; use crate::etl::processor::IntermediateStatus; use crate::etl::transform::index::Index; -use crate::etl::transform::{Transformer, Transforms}; +use crate::etl::transform::{Transform, Transformer, Transforms}; use crate::etl::value::{Timestamp, Value}; const DEFAULT_GREPTIME_TIMESTAMP_COLUMN: &str = "greptime_timestamp"; @@ -83,37 +84,21 @@ impl GreptimePipelineParams { impl GreptimeTransformer { /// Add a default timestamp column to the transforms fn add_greptime_timestamp_column(transforms: &mut Transforms) { - // let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); - // let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); - // let default = Some(type_.clone()); - - // let transform = Transform { - // real_fields: vec![OneInputOneOutputField::new( - // InputFieldInfo { - // name: DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - // index: usize::MAX, - // }, - // ( - // DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), - // transforms - // .transforms - // .iter() - // .map(|x| x.real_fields.len()) - // .sum(), - // ), - // )], - // type_, - // default, - // index: Some(Index::Time), - // on_failure: Some(crate::etl::transform::OnFailure::Default), - // }; - // let required_keys = transforms.required_keys_mut(); - // required_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - - // let output_keys = transforms.output_keys_mut(); - // output_keys.push(DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string()); - // transforms.push(transform); - todo!() + let ns = chrono::Utc::now().timestamp_nanos_opt().unwrap_or(0); + 
let type_ = Value::Timestamp(Timestamp::Nanosecond(ns)); + let default = Some(type_.clone()); + + let transform = Transform { + fields: Fields::one(Field::new( + DEFAULT_GREPTIME_TIMESTAMP_COLUMN.to_string(), + None, + )), + type_, + default, + index: Some(Index::Time), + on_failure: Some(crate::etl::transform::OnFailure::Default), + }; + transforms.push(transform); } /// Generate the schema for the GreptimeTransformer @@ -650,7 +635,7 @@ mod tests { use super::*; use crate::etl::{json_array_to_intermediate_state, json_to_intermediate_state}; - use crate::{identity_pipeline, Pipeline}; + use crate::identity_pipeline; #[test] fn test_identify_pipeline() { diff --git a/src/pipeline/src/lib.rs b/src/pipeline/src/lib.rs index b4e3405cf154..a6c82f9353cf 100644 --- a/src/pipeline/src/lib.rs +++ b/src/pipeline/src/lib.rs @@ -24,9 +24,9 @@ pub use etl::transform::transformer::identity_pipeline; pub use etl::transform::{GreptimeTransformer, Transformer}; pub use etl::value::{Array, Map, Value}; pub use etl::{ - error as etl_error, json_array_to_intermediate_state, parse, Content, DispatchedTo, Pipeline, - PipelineDefinition, PipelineExecOutput, PipelineWay, SelectInfo, - GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, + error as etl_error, json_array_to_intermediate_state, json_to_intermediate_state, parse, + Content, DispatchedTo, Pipeline, PipelineDefinition, PipelineExecOutput, PipelineWay, + SelectInfo, GREPTIME_INTERNAL_IDENTITY_PIPELINE_NAME, }; pub use manager::{ error, pipeline_operator, table, util, PipelineInfo, PipelineRef, PipelineTableRef, From 9de70d3910cf08989e61fedfa8b5af4b4ad84d5d Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 17:07:09 +0800 Subject: [PATCH 25/32] refactor: reimplement cmcd --- src/pipeline/src/etl/processor.rs | 8 +- src/pipeline/src/etl/processor/cmcd.rs | 251 ++++++------------------- 2 files changed, 60 insertions(+), 199 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 63854ad552a7..b20258f20818 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// pub mod cmcd; +pub mod cmcd; // pub mod csv; pub mod date; pub mod decolorize; @@ -29,7 +29,7 @@ pub mod urlencoding; use std::collections::BTreeMap; -// use cmcd::CmcdProcessor; +use cmcd::CmcdProcessor; // use csv::CsvProcessor; use date::DateProcessor; use decolorize::DecolorizeProcessor; @@ -89,7 +89,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { #[derive(Debug)] #[enum_dispatch] pub enum ProcessorKind { - // Cmcd(CmcdProcessor), + Cmcd(CmcdProcessor), // Csv(CsvProcessor), // Dissect(DissectProcessor), Gsub(GsubProcessor), @@ -156,7 +156,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let str_key = key.as_str().context(ProcessorKeyMustBeStringSnafu)?; let processor = match str_key { - // cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), + cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 944487472691..37df6e8fbec1 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -18,7 +18,6 @@ use std::collections::BTreeMap; -use ahash::HashSet; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; @@ -27,11 +26,10 @@ use crate::etl::error::{ FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Field, Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, Processor, ProcessorBuilder, ProcessorKind, - FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -79,139 +77,6 @@ const CMCD_KEYS: [&str; 18] = [ CMCD_KEY_V, ]; -/// CmcdProcessorBuilder is a builder for CmcdProcessor -/// parse from raw yaml -#[derive(Debug, Default)] -pub struct CmcdProcessorBuilder { - fields: Fields, - output_keys: HashSet, - ignore_missing: bool, -} - -impl CmcdProcessorBuilder { - /// build_cmcd_outputs build cmcd output info - /// generate index and function for each output - pub(super) fn build_cmcd_outputs( - field: &Field, - intermediate_keys: &[String], - ) -> Result<(BTreeMap, Vec)> { - let mut output_index = BTreeMap::new(); - let mut cmcd_field_outputs = Vec::with_capacity(CMCD_KEYS.len()); - for cmcd in CMCD_KEYS { - let final_key = generate_key(field.target_or_input_field(), cmcd); - let index = find_key_index(intermediate_keys, &final_key, "cmcd")?; - output_index.insert(final_key.clone(), index); - match cmcd { - CMCD_KEY_BS | CMCD_KEY_SU => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, bs_su); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP - | CMCD_KEY_RTP | CMCD_KEY_TB => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, br_tb); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID - | CMCD_KEY_ST | CMCD_KEY_V => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, cid_v); - 
cmcd_field_outputs.push(output_info); - } - CMCD_KEY_NOR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, nor); - cmcd_field_outputs.push(output_info); - } - CMCD_KEY_PR => { - let output_info = CmcdOutputInfo::new(final_key, cmcd, index, pr); - cmcd_field_outputs.push(output_info); - } - _ => {} - } - } - Ok((output_index, cmcd_field_outputs)) - } - - /// build CmcdProcessor from CmcdProcessorBuilder - pub fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - let mut cmcd_outputs = Vec::with_capacity(CMCD_KEYS.len()); - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "cmcd")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - - let (_, cmcd_field_outputs) = Self::build_cmcd_outputs(&field, intermediate_keys)?; - - cmcd_outputs.push(cmcd_field_outputs); - - let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(real_field); - } - Ok(CmcdProcessor { - fields: real_fields, - cmcd_outputs, - ignore_missing: self.ignore_missing, - }) - } -} - -impl ProcessorBuilder for CmcdProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Cmcd) - } -} - -fn generate_key(prefix: &str, key: &str) -> String { - format!("{}_{}", prefix, key) -} - -/// CmcdOutputInfo is a struct to store output info -#[derive(Debug)] -pub(super) struct CmcdOutputInfo { - /// {input_field}_{cmcd_key} - final_key: String, - /// cmcd key - key: &'static str, - /// index in intermediate_keys - index: usize, - /// function to resolve value - f: fn(&str, &str, Option<&str>) -> Result, -} - -impl CmcdOutputInfo { - fn new( - final_key: String, - key: &'static str, - index: usize, - f: fn(&str, &str, Option<&str>) -> Result, - ) -> Self { - Self { - final_key, - key, - index, - f, - } - } -} - -impl Default for CmcdOutputInfo { - fn default() -> Self { - Self { - final_key: String::default(), - key: "", - index: 0, - f: |_, _, _| Ok(Value::Null), - } - } -} - /// function to resolve CMCD_KEY_BS | CMCD_KEY_SU fn bs_su(_: &str, _: &str, _: Option<&str>) -> Result { Ok(Value::Boolean(true)) @@ -288,9 +153,7 @@ fn pr(s: &str, k: &str, v: Option<&str>) -> Result { /// 12. Transport Layer Security SHOULD be used to protect all transmission of CMCD data. 
#[derive(Debug, Default)] pub struct CmcdProcessor { - fields: Vec, - cmcd_outputs: Vec>, - + fields: Fields, ignore_missing: bool, } @@ -299,27 +162,52 @@ impl CmcdProcessor { format!("{}_{}", prefix, key) } - fn parse(&self, field_index: usize, s: &str) -> Result> { - let parts = s.split(','); - let mut result = Vec::new(); + fn parse(&self, name: &str, value: &str) -> Result> { + let mut working_set = BTreeMap::new(); + + let parts = value.split(','); + for part in parts { let mut kv = part.split('='); - let k = kv.next().context(CmcdMissingKeySnafu { part, s })?; + let k = kv.next().context(CmcdMissingKeySnafu { part, s: value })?; let v = kv.next(); - for cmcd_key in self.cmcd_outputs[field_index].iter() { - if cmcd_key.key == k { - let val = (cmcd_key.f)(s, k, v)?; - result.push((cmcd_key.index, val)); + for cmcd_key in CMCD_KEYS { + if cmcd_key == k { + match cmcd_key { + CMCD_KEY_BS | CMCD_KEY_SU => { + working_set + .insert(Self::generate_key(name, cmcd_key), bs_su(value, k, v)?); + } + CMCD_KEY_BR | CMCD_KEY_BL | CMCD_KEY_D | CMCD_KEY_DL | CMCD_KEY_MTP + | CMCD_KEY_RTP | CMCD_KEY_TB => { + working_set + .insert(Self::generate_key(name, cmcd_key), br_tb(value, k, v)?); + } + CMCD_KEY_CID | CMCD_KEY_NRR | CMCD_KEY_OT | CMCD_KEY_SF | CMCD_KEY_SID + | CMCD_KEY_ST | CMCD_KEY_V => { + working_set + .insert(Self::generate_key(name, cmcd_key), cid_v(value, k, v)?); + } + CMCD_KEY_NOR => { + working_set + .insert(Self::generate_key(name, cmcd_key), nor(value, k, v)?); + } + CMCD_KEY_PR => { + working_set + .insert(Self::generate_key(name, cmcd_key), pr(value, k, v)?); + } + + _ => {} + } } } } - - Ok(result) + Ok(working_set) } } -impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -346,22 +234,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CmcdProcessorBuilder { } } - let output_keys = fields - .iter() - .flat_map(|f| { - CMCD_KEYS - .iter() - .map(|cmcd_key| generate_key(f.target_or_input_field(), cmcd_key)) - }) - .collect(); - - let builder = CmcdProcessorBuilder { + let proc = CmcdProcessor { fields, - output_keys, ignore_missing, }; - Ok(builder) + Ok(proc) } } @@ -375,20 +253,19 @@ impl Processor for CmcdProcessor { } fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let field_value_index = field.input_index(); - match val.get(field_value_index) { - Some(Value::String(v)) => { - let result_list = self.parse(field_index, v)?; - for (output_index, v) in result_list { - val[output_index] = v; - } + for field in self.fields.iter() { + let name = field.input_field(); + + match val.get(name) { + Some(Value::String(s)) => { + let results = self.parse(name, s)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -402,6 +279,7 @@ impl Processor for CmcdProcessor { } } } + Ok(()) } } @@ -412,9 +290,9 @@ mod tests { use urlencoding::decode; - use super::{CmcdProcessorBuilder, CMCD_KEYS}; + use super::CmcdProcessor; use crate::etl::field::{Field, Fields}; - use crate::etl::value::{Map, Value}; + use crate::etl::value::Value; #[test] fn test_cmcd() { @@ -548,37 +426,20 @@ mod tests { let field = Field::new("prefix", None); - let output_keys = CMCD_KEYS - .iter() - .map(|k| 
format!("prefix_{}", k)) - .collect::>(); - - let mut intermediate_keys = vec!["prefix".to_string()]; - intermediate_keys.append(&mut (output_keys.clone())); - - let builder = CmcdProcessorBuilder { + let processor = CmcdProcessor { fields: Fields::new(vec![field]), - output_keys: output_keys.iter().map(|s| s.to_string()).collect(), ignore_missing: false, }; - let processor = builder.build(&intermediate_keys).unwrap(); - for (s, vec) in ss.into_iter() { let decoded = decode(s).unwrap().to_string(); - let values = vec + let expected = vec .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect::>(); - let expected = Map { values }; - let actual = processor.parse(0, &decoded).unwrap(); - let actual = actual - .into_iter() - .map(|(index, value)| (intermediate_keys[index].clone(), value)) - .collect::>(); - let actual = Map { values: actual }; + let actual = processor.parse("prefix", &decoded).unwrap(); assert_eq!(actual, expected); } } From f289318ba84f86a3818167cd3900c6b7bd7ca492 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 17:30:18 +0800 Subject: [PATCH 26/32] refactor: update csv processor --- src/pipeline/src/etl/processor.rs | 8 +- src/pipeline/src/etl/processor/csv.rs | 171 ++++++-------------------- 2 files changed, 45 insertions(+), 134 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index b20258f20818..1e19f194abc3 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -13,7 +13,7 @@ // limitations under the License. pub mod cmcd; -// pub mod csv; +pub mod csv; pub mod date; pub mod decolorize; pub mod digest; @@ -30,7 +30,7 @@ pub mod urlencoding; use std::collections::BTreeMap; use cmcd::CmcdProcessor; -// use csv::CsvProcessor; +use csv::CsvProcessor; use date::DateProcessor; use decolorize::DecolorizeProcessor; use digest::DigestProcessor; @@ -90,7 +90,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { #[enum_dispatch] pub enum ProcessorKind { Cmcd(CmcdProcessor), - // Csv(CsvProcessor), + Csv(CsvProcessor), // Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), @@ -157,7 +157,7 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let processor = match str_key { cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), - // csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), + csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), diff --git a/src/pipeline/src/etl/processor/csv.rs b/src/pipeline/src/etl/processor/csv.rs index 86f39fc89369..a0fac70de15c 100644 --- a/src/pipeline/src/etl/processor/csv.rs +++ b/src/pipeline/src/etl/processor/csv.rs @@ -14,7 +14,8 @@ // Reference: https://www.elastic.co/guide/en/elasticsearch/reference/current/csv-processor.html -use ahash::HashSet; +use std::collections::BTreeMap; + use csv::{ReaderBuilder, Trim}; use itertools::EitherOrBoth::{Both, Left, Right}; use itertools::Itertools; @@ -24,11 +25,10 @@ use crate::etl::error::{ CsvNoRecordSnafu, CsvQuoteNameSnafu, CsvReadSnafu, CsvSeparatorNameSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; 
+use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, ProcessorBuilder, - ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, Processor, FIELDS_NAME, FIELD_NAME, + IGNORE_MISSING_NAME, }; use crate::etl::value::Value; @@ -40,76 +40,17 @@ const TRIM_NAME: &str = "trim"; const EMPTY_VALUE_NAME: &str = "empty_value"; const TARGET_FIELDS: &str = "target_fields"; -#[derive(Debug, Default)] -pub struct CsvProcessorBuilder { - reader: ReaderBuilder, - - fields: Fields, - ignore_missing: bool, - - // Value used to fill empty fields, empty fields will be skipped if this is not provided. - empty_value: Option, - target_fields: Vec, - // description - // if - // ignore_failure - // on_failure - // tag -} - -impl CsvProcessorBuilder { - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - - for field in self.fields { - let input_index = find_key_index(intermediate_keys, field.input_field(), "csv")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - let real_field = OneInputMultiOutputField::new(input_field_info, None); - real_fields.push(real_field); - } - - let output_index_info = self - .target_fields - .iter() - .map(|f| find_key_index(intermediate_keys, f, "csv")) - .collect::>>()?; - Ok(CsvProcessor { - reader: self.reader, - fields: real_fields, - ignore_missing: self.ignore_missing, - empty_value: self.empty_value, - output_index_info, - }) - } -} - -impl ProcessorBuilder for CsvProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.target_fields.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Csv) - } -} - /// only support string value -#[derive(Debug)] +#[derive(Debug, Default)] pub struct CsvProcessor { reader: ReaderBuilder, - - fields: Vec, + fields: Fields, ignore_missing: bool, // Value used to fill empty fields, empty fields will be skipped if this is not provided. 
empty_value: Option, - output_index_info: Vec, + target_fields: Vec, // description // if // ignore_failure @@ -119,18 +60,20 @@ pub struct CsvProcessor { impl CsvProcessor { // process the csv format string to a map with target_fields as keys - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let mut reader = self.reader.from_reader(val.as_bytes()); if let Some(result) = reader.records().next() { let record: csv::StringRecord = result.context(CsvReadSnafu)?; - let values: Vec<(usize, Value)> = self - .output_index_info + let values = self + .target_fields .iter() .zip_longest(record.iter()) .filter_map(|zipped| match zipped { - Both(target_field, val) => Some((*target_field, Value::String(val.into()))), + Both(target_field, val) => { + Some((target_field.clone(), Value::String(val.into()))) + } // if target fields are more than extracted fields, fill the rest with empty value Left(target_field) => { let value = self @@ -138,7 +81,7 @@ impl CsvProcessor { .as_ref() .map(|s| Value::String(s.clone())) .unwrap_or(Value::Null); - Some((*target_field, value)) + Some((target_field.clone(), value)) } // if extracted fields are more than target fields, ignore the rest Right(_) => None, @@ -152,7 +95,7 @@ impl CsvProcessor { } } -impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessor { type Error = Error; fn try_from(hash: &yaml_rust::yaml::Hash) -> Result { @@ -224,8 +167,8 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { _ => {} } } - let builder = { - CsvProcessorBuilder { + let proc = { + CsvProcessor { reader, fields, ignore_missing, @@ -234,7 +177,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for CsvProcessorBuilder { } }; - Ok(builder) + Ok(proc) } } @@ -247,21 +190,20 @@ impl Processor for CsvProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut BTreeMap) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); - match val.get(index) { + let name = field.input_field(); + + match val.get(name) { Some(Value::String(v)) => { - let resule_list = self.process(v)?; - for (k, v) in resule_list { - val[k] = v; - } + let results = self.process(v)?; + val.extend(results); } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind().to_string(), - field: field.input_name().to_string(), + field: name.to_string(), } .fail(); } @@ -282,37 +224,28 @@ impl Processor for CsvProcessor { #[cfg(test)] mod tests { - use ahash::HashMap; - - use super::Value; - use crate::etl::processor::csv::CsvProcessorBuilder; + use super::*; + use crate::etl::field::Field; #[test] fn test_equal_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), ("b".into(), Value::String("2".into())), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -324,21 
+257,14 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -346,7 +272,7 @@ mod tests { ("c".into(), Value::Null), ] .into_iter() - .collect::>(); + .collect(); assert_eq!(result, values); } @@ -355,22 +281,15 @@ mod tests { { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, + fields: Fields::new(vec![Field::new("data", None)]), target_fields: vec!["a".into(), "b".into(), "c".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into(), "c".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), @@ -389,22 +308,14 @@ mod tests { fn test_target_fields_has_less_length() { let mut reader = csv::ReaderBuilder::new(); reader.has_headers(false); - let builder = CsvProcessorBuilder { + let processor = CsvProcessor { reader, target_fields: vec!["a".into(), "b".into()], empty_value: Some("default".into()), ..Default::default() }; - let intermediate_keys = vec!["data".into(), "a".into(), "b".into()]; - - let processor = builder.build(&intermediate_keys).unwrap(); - let result = processor - .process("1,2") - .unwrap() - .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) - .collect::>(); + let result = processor.process("1,2").unwrap(); let values = [ ("a".into(), Value::String("1".into())), From 60a6421d16eb1bdcbdfde67dc0b3a89661125171 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Sun, 26 Jan 2025 18:05:40 +0800 Subject: [PATCH 27/32] fmt: update format --- src/pipeline/src/etl/processor/cmcd.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 37df6e8fbec1..8d8b546f7216 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -21,6 +21,7 @@ use std::collections::BTreeMap; use snafu::{OptionExt, ResultExt}; use urlencoding::decode; +use super::IntermediateStatus; use crate::etl::error::{ CmcdMissingKeySnafu, CmcdMissingValueSnafu, Error, FailedToParseFloatKeySnafu, FailedToParseIntKeySnafu, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, @@ -33,8 +34,6 @@ use crate::etl::processor::{ }; use crate::etl::value::Value; -use super::IntermediateStatus; - pub(crate) const PROCESSOR_CMCD: &str = "cmcd"; const CMCD_KEY_BR: &str = "br"; // Encoded bitrate, Integer kbps From 1a29cfee5284c65c97fa380c4442c68a4bb0e5f7 Mon Sep 17 00:00:00 2001 From: paomian Date: Mon, 27 Jan 2025 11:13:23 +0800 Subject: [PATCH 28/32] chore: fix regex and dissect processor --- src/pipeline/src/etl.rs | 91 
++- src/pipeline/src/etl/processor.rs | 14 +- src/pipeline/src/etl/processor/dissect.rs | 642 +++++++++------------- src/pipeline/src/etl/processor/regex.rs | 301 ++-------- src/pipeline/tests/regex.rs | 2 + src/servers/src/http/event.rs | 2 +- tests-integration/tests/http.rs | 2 +- 7 files changed, 381 insertions(+), 673 deletions(-) diff --git a/src/pipeline/src/etl.rs b/src/pipeline/src/etl.rs index f302e655a816..cac5c44c17be 100644 --- a/src/pipeline/src/etl.rs +++ b/src/pipeline/src/etl.rs @@ -283,15 +283,15 @@ mod tests { let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); let pipeline_yaml = r#"description: 'Pipeline for Apache Tomcat' - processors: - - csv: - field: my_field - target_fields: field1, field2 - transform: - - field: field1 - type: uint32 - - field: field2 - type: uint32 +processors: + - csv: + field: my_field + target_fields: field1, field2 +transform: + - field: field1 + type: uint32 + - field: field2 + type: uint32 "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); let mut payload = json_to_intermediate_state(input_value).unwrap(); @@ -315,34 +315,34 @@ mod tests { fn test_dissect_pipeline() { let message = r#"129.37.245.88 - meln1ks [01/Aug/2024:14:22:47 +0800] "PATCH /observability/metrics/production HTTP/1.0" 501 33085"#.to_string(); let pipeline_str = r#"processors: - - dissect: - fields: - - message - patterns: - - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" - - timestamp: - fields: - - ts - formats: - - "%d/%b/%Y:%H:%M:%S %z" + - dissect: + fields: + - message + patterns: + - "%{ip} %{?ignored} %{username} [%{ts}] \"%{method} %{path} %{proto}\" %{status} %{bytes}" + - timestamp: + fields: + - ts + formats: + - "%d/%b/%Y:%H:%M:%S %z" - transform: - - fields: - - ip - - username - - method - - path - - proto - type: string - - fields: - - status - type: uint16 - - fields: - - bytes - type: uint32 - - field: ts - type: timestamp, ns - index: time"#; +transform: + - fields: + - ip + - username + - method + - path + - proto + type: string + - fields: + - status + type: uint16 + - fields: + - bytes + type: uint32 + - field: ts + type: timestamp, ns + index: time"#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_str)).unwrap(); let mut payload = BTreeMap::new(); payload.insert("message".to_string(), Value::String(message)); @@ -449,18 +449,17 @@ mod tests { "#; let input_value: serde_json::Value = serde_json::from_str(input_value_str).unwrap(); - let pipeline_yaml = r#" - --- - description: Pipeline for Apache Tomcat + let pipeline_yaml = r#"--- +description: Pipeline for Apache Tomcat - processors: - - timestamp: - field: test_time +processors: + - timestamp: + field: test_time - transform: - - field: test_time - type: timestamp, ns - index: time +transform: + - field: test_time + type: timestamp, ns + index: time "#; let pipeline: Pipeline = parse(&Content::Yaml(pipeline_yaml)).unwrap(); diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 1e19f194abc3..376282afecef 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -17,13 +17,13 @@ pub mod csv; pub mod date; pub mod decolorize; pub mod digest; -// pub mod dissect; +pub mod dissect; pub mod epoch; pub mod gsub; pub mod join; pub mod json_path; pub mod letter; -// pub mod regex; +pub mod regex; pub mod timestamp; pub mod urlencoding; @@ -34,14 +34,14 @@ use csv::CsvProcessor; use date::DateProcessor; use 
decolorize::DecolorizeProcessor; use digest::DigestProcessor; -// use dissect::DissectProcessor; +use dissect::DissectProcessor; use enum_dispatch::enum_dispatch; use epoch::EpochProcessor; use gsub::GsubProcessor; use join::JoinProcessor; use json_path::JsonPathProcessor; use letter::LetterProcessor; -// use regex::RegexProcessor; +use regex::RegexProcessor; use snafu::{OptionExt, ResultExt}; use timestamp::TimestampProcessor; use urlencoding::UrlEncodingProcessor; @@ -95,7 +95,7 @@ pub enum ProcessorKind { Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), - // Regex(RegexProcessor), + Regex(RegexProcessor), Timestamp(TimestampProcessor), UrlEncoding(UrlEncodingProcessor), Epoch(EpochProcessor), @@ -158,13 +158,13 @@ fn parse_processor(doc: &yaml_rust::Yaml) -> Result { let processor = match str_key { cmcd::PROCESSOR_CMCD => ProcessorKind::Cmcd(CmcdProcessor::try_from(value)?), csv::PROCESSOR_CSV => ProcessorKind::Csv(CsvProcessor::try_from(value)?), - // dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), + dissect::PROCESSOR_DISSECT => ProcessorKind::Dissect(DissectProcessor::try_from(value)?), epoch::PROCESSOR_EPOCH => ProcessorKind::Epoch(EpochProcessor::try_from(value)?), date::PROCESSOR_DATE => ProcessorKind::Date(DateProcessor::try_from(value)?), gsub::PROCESSOR_GSUB => ProcessorKind::Gsub(GsubProcessor::try_from(value)?), join::PROCESSOR_JOIN => ProcessorKind::Join(JoinProcessor::try_from(value)?), letter::PROCESSOR_LETTER => ProcessorKind::Letter(LetterProcessor::try_from(value)?), - // regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), + regex::PROCESSOR_REGEX => ProcessorKind::Regex(RegexProcessor::try_from(value)?), timestamp::PROCESSOR_TIMESTAMP => { ProcessorKind::Timestamp(TimestampProcessor::try_from(value)?) 
} diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 13ad9175e7df..5755a0aeb8a5 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -18,6 +18,7 @@ use ahash::{HashMap, HashMapExt, HashSet, HashSetExt}; use itertools::Itertools; use snafu::OptionExt; +use super::IntermediateStatus; use crate::etl::error::{ DissectAppendOrderAlreadySetSnafu, DissectConsecutiveNamesSnafu, DissectEmptyPatternSnafu, DissectEndModifierAlreadySetSnafu, DissectInvalidPatternSnafu, DissectModifierAlreadySetSnafu, @@ -25,12 +26,10 @@ use crate::etl::error::{ DissectOrderOnlyAppendSnafu, DissectSplitExceedsInputSnafu, DissectSplitNotMatchInputSnafu, Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; -use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ yaml_bool, yaml_new_field, yaml_new_fields, yaml_parse_string, yaml_parse_strings, yaml_string, - Processor, ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, - PATTERNS_NAME, PATTERN_NAME, + Processor, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERNS_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -69,14 +68,7 @@ impl std::fmt::Display for EndModifier { } } -#[derive(Debug, PartialEq, Default)] -struct NameInfo { - name: String, - start_modifier: Option, - end_modifier: Option, -} - -impl NameInfo { +impl Name { fn is_name_empty(&self) -> bool { self.name.is_empty() } @@ -140,26 +132,9 @@ impl NameInfo { } } -impl std::fmt::Display for NameInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.name) - } -} - -impl From<&str> for NameInfo { - fn from(value: &str) -> Self { - NameInfo { - name: value.to_string(), - start_modifier: None, - end_modifier: None, - } - } -} - #[derive(Debug, PartialEq, Default)] struct Name { name: String, - index: usize, start_modifier: Option, end_modifier: Option, } @@ -170,57 +145,12 @@ impl std::fmt::Display for Name { } } -impl From for Name { - fn from(value: NameInfo) -> Self { +impl From<&str> for Name { + fn from(value: &str) -> Self { Name { - name: value.name, - index: 0, - start_modifier: value.start_modifier, - end_modifier: value.end_modifier, - } - } -} - -impl Name { - fn is_name_empty(&self) -> bool { - self.name.is_empty() - } - - fn is_empty(&self) -> bool { - self.name.is_empty() && self.start_modifier.is_none() && self.end_modifier.is_none() - } - - fn is_end_modifier_set(&self) -> bool { - self.end_modifier.is_some() - } -} - -#[derive(Debug, PartialEq)] -enum PartInfo { - Split(String), - Name(NameInfo), -} - -impl PartInfo { - fn is_empty(&self) -> bool { - match self { - PartInfo::Split(v) => v.is_empty(), - PartInfo::Name(v) => v.is_empty(), - } - } - - fn empty_split() -> Self { - PartInfo::Split(String::new()) - } - - fn empty_name() -> Self { - PartInfo::Name(NameInfo::default()) - } - - fn push(&mut self, ch: char) { - match self { - PartInfo::Split(v) => v.push(ch), - PartInfo::Name(v) => v.name.push(ch), + name: value.to_string(), + start_modifier: None, + end_modifier: None, } } } @@ -246,13 +176,11 @@ impl Part { fn empty_name() -> Self { Part::Name(Name::default()) } -} -impl From for Part { - fn from(value: PartInfo) -> Self { - match value { - PartInfo::Split(v) => Part::Split(v), - PartInfo::Name(v) => Part::Name(v.into()), + fn push(&mut self, ch: char) { + 
match self { + Part::Split(v) => v.push(ch), + Part::Name(v) => v.name.push(ch), } } } @@ -271,42 +199,12 @@ impl Deref for Pattern { } } -impl From for Pattern { - fn from(value: PatternInfo) -> Self { - let parts = value.parts.into_iter().map(|x| x.into()).collect(); - Pattern { - origin: value.origin, - parts, - } - } -} - -#[derive(Debug, Default)] -struct PatternInfo { - origin: String, - parts: Vec, -} - -impl std::ops::Deref for PatternInfo { - type Target = Vec; - - fn deref(&self) -> &Self::Target { - &self.parts - } -} - -impl std::ops::DerefMut for PatternInfo { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.parts - } -} - -impl std::str::FromStr for PatternInfo { +impl std::str::FromStr for Pattern { type Err = Error; fn from_str(s: &str) -> Result { let mut parts = vec![]; - let mut cursor = PartInfo::empty_split(); + let mut cursor = Part::empty_split(); let origin = s.to_string(); let chars: Vec = origin.chars().collect(); @@ -316,27 +214,27 @@ impl std::str::FromStr for PatternInfo { let ch = chars[pos]; match (ch, &mut cursor) { // if cursor is Split part, and found %{, then ready to start a Name part - ('%', PartInfo::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { + ('%', Part::Split(_)) if matches!(chars.get(pos + 1), Some('{')) => { if !cursor.is_empty() { parts.push(cursor); } - cursor = PartInfo::empty_name(); + cursor = Part::empty_name(); pos += 1; // skip '{' } // if cursor is Split part, and not found % or {, then continue the Split part - (_, PartInfo::Split(_)) => { + (_, Part::Split(_)) => { cursor.push(ch); } // if cursor is Name part, and found }, then end the Name part, start the next Split part - ('}', PartInfo::Name(_)) => { + ('}', Part::Name(_)) => { parts.push(cursor); - cursor = PartInfo::empty_split(); + cursor = Part::empty_split(); } - ('+', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('+', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::Append(None))?; } - ('/', PartInfo::Name(name)) if name.is_append_modifier_set() => { + ('/', Part::Name(name)) if name.is_append_modifier_set() => { let mut order = 0; let mut j = pos + 1; while j < chars.len() { @@ -360,16 +258,16 @@ impl std::str::FromStr for PatternInfo { name.try_append_order(order)?; pos = j - 1; // this will change the position to the last digit of the order } - ('?', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('?', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::NamedSkip)?; } - ('*', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('*', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapKey)?; } - ('&', PartInfo::Name(name)) if !name.is_start_modifier_set() => { + ('&', Part::Name(name)) if !name.is_start_modifier_set() => { name.try_start_modifier(StartModifier::MapVal)?; } - ('-', PartInfo::Name(name)) if !name.is_end_modifier_set() => { + ('-', Part::Name(name)) if !name.is_end_modifier_set() => { if let Some('>') = chars.get(pos + 1) { } else { return DissectInvalidPatternSnafu { @@ -391,7 +289,7 @@ impl std::str::FromStr for PatternInfo { name.try_end_modifier()?; pos += 1; // only skip '>', the next loop will skip '}' } - (_, PartInfo::Name(name)) if !is_valid_char(ch) => { + (_, Part::Name(name)) if !is_valid_char(ch) => { let tail: String = if name.is_name_empty() { format!("Invalid '{ch}'") } else { @@ -399,7 +297,7 @@ impl std::str::FromStr for PatternInfo { }; return 
DissectInvalidPatternSnafu { s, detail: tail }.fail(); } - (_, PartInfo::Name(_)) => { + (_, Part::Name(_)) => { cursor.push(ch); } } @@ -408,8 +306,8 @@ impl std::str::FromStr for PatternInfo { } match cursor { - PartInfo::Split(ref split) if !split.is_empty() => parts.push(cursor), - PartInfo::Name(name) if !name.is_empty() => { + Part::Split(ref split) if !split.is_empty() => parts.push(cursor), + Part::Name(name) if !name.is_empty() => { return DissectInvalidPatternSnafu { s, detail: format!("'{name}' is not closed"), @@ -425,7 +323,7 @@ impl std::str::FromStr for PatternInfo { } } -impl PatternInfo { +impl Pattern { fn check(&self) -> Result<()> { if self.len() == 0 { return DissectEmptyPatternSnafu.fail(); @@ -438,21 +336,21 @@ impl PatternInfo { let this_part = &self[i]; let next_part = self.get(i + 1); match (this_part, next_part) { - (PartInfo::Split(split), _) if split.is_empty() => { + (Part::Split(split), _) if split.is_empty() => { return DissectInvalidPatternSnafu { s: &self.origin, detail: "Empty split is not allowed", } .fail(); } - (PartInfo::Name(name1), Some(PartInfo::Name(name2))) => { + (Part::Name(name1), Some(Part::Name(name2))) => { return DissectInvalidPatternSnafu { s: &self.origin, detail: format!("consecutive names are not allowed: '{name1}' '{name2}'",), } .fail(); } - (PartInfo::Name(name), _) if name.is_name_empty() => { + (Part::Name(name), _) if name.is_name_empty() => { if let Some(ref m) = name.start_modifier { return DissectInvalidPatternSnafu { s: &self.origin, @@ -461,7 +359,7 @@ impl PatternInfo { .fail(); } } - (PartInfo::Name(name), _) => match name.start_modifier { + (Part::Name(name), _) => match name.start_modifier { Some(StartModifier::MapKey) => { if map_keys.contains(&name.name) { return DissectInvalidPatternSnafu { @@ -509,128 +407,128 @@ impl PatternInfo { } } -impl std::fmt::Display for PatternInfo { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - write!(f, "{}", self.origin) - } -} - -#[derive(Debug, Default)] -pub struct DissectProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - append_separator: Option, - output_keys: HashSet, -} - -impl DissectProcessorBuilder { - fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { - patterns - .iter() - .flat_map(|pattern| pattern.iter()) - .filter_map(|p| match p { - PartInfo::Name(name) => { - if !name.is_empty() - && (name.start_modifier.is_none() - || name - .start_modifier - .as_ref() - .is_some_and(|x| matches!(x, StartModifier::Append(_)))) - { - Some(name.to_string()) - } else { - None - } - } - _ => None, - }) - .collect() - } - - fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { - match part_info { - PartInfo::Split(s) => Ok(Part::Split(s)), - PartInfo::Name(n) => match n.start_modifier { - None | Some(StartModifier::Append(_)) => { - let index = find_key_index(intermediate_keys, &n.name, "dissect")?; - Ok(Part::Name(Name { - name: n.name, - index, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })) - } - _ => Ok(Part::Name(Name { - name: n.name, - index: usize::MAX, - start_modifier: n.start_modifier, - end_modifier: n.end_modifier, - })), - }, - } - } - - fn pattern_info_to_pattern( - pattern_info: PatternInfo, - intermediate_keys: &[String], - ) -> Result { - let original = pattern_info.origin; - let pattern = pattern_info - .parts - .into_iter() - .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) - .collect::>>()?; - Ok(Pattern { - origin: original, - 
parts: pattern, - }) - } - - fn build_patterns_from_pattern_infos( - patterns: Vec, - intermediate_keys: &[String], - ) -> Result> { - patterns - .into_iter() - .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) - .collect() - } -} - -impl ProcessorBuilder for DissectProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|s| s.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; - - let input_field_info = InputField::new(field.input_field(), input_index); - - let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(real_field); - } - let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; - let processor = DissectProcessor { - fields: real_fields, - patterns, - ignore_missing: self.ignore_missing, - append_separator: self.append_separator, - }; - Ok(ProcessorKind::Dissect(processor)) - } -} +// impl std::fmt::Display for PatternInfo { +// fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { +// write!(f, "{}", self.origin) +// } +// } + +// #[derive(Debug, Default)] +// pub struct DissectProcessorBuilder { +// fields: Fields, +// patterns: Vec, +// ignore_missing: bool, +// append_separator: Option, +// output_keys: HashSet, +// } + +// impl DissectProcessorBuilder { +// fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { +// patterns +// .iter() +// .flat_map(|pattern| pattern.iter()) +// .filter_map(|p| match p { +// PartInfo::Name(name) => { +// if !name.is_empty() +// && (name.start_modifier.is_none() +// || name +// .start_modifier +// .as_ref() +// .is_some_and(|x| matches!(x, StartModifier::Append(_)))) +// { +// Some(name.to_string()) +// } else { +// None +// } +// } +// _ => None, +// }) +// .collect() +// } + +// fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { +// match part_info { +// PartInfo::Split(s) => Ok(Part::Split(s)), +// PartInfo::Name(n) => match n.start_modifier { +// None | Some(StartModifier::Append(_)) => { +// let index = find_key_index(intermediate_keys, &n.name, "dissect")?; +// Ok(Part::Name(Name { +// name: n.name, +// index, +// start_modifier: n.start_modifier, +// end_modifier: n.end_modifier, +// })) +// } +// _ => Ok(Part::Name(Name { +// name: n.name, +// index: usize::MAX, +// start_modifier: n.start_modifier, +// end_modifier: n.end_modifier, +// })), +// }, +// } +// } + +// fn pattern_info_to_pattern( +// pattern_info: PatternInfo, +// intermediate_keys: &[String], +// ) -> Result { +// let original = pattern_info.origin; +// let pattern = pattern_info +// .parts +// .into_iter() +// .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) +// .collect::>>()?; +// Ok(Pattern { +// origin: original, +// parts: pattern, +// }) +// } + +// fn build_patterns_from_pattern_infos( +// patterns: Vec, +// intermediate_keys: &[String], +// ) -> Result> { +// patterns +// .into_iter() +// .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) +// .collect() +// } +// } + +// impl ProcessorBuilder for DissectProcessorBuilder { +// fn output_keys(&self) -> HashSet<&str> { +// self.output_keys.iter().map(|s| 
s.as_str()).collect() +// } + +// fn input_keys(&self) -> HashSet<&str> { +// self.fields.iter().map(|f| f.input_field()).collect() +// } + +// fn build(self, intermediate_keys: &[String]) -> Result { +// let mut real_fields = vec![]; +// for field in self.fields.into_iter() { +// let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; + +// let input_field_info = InputField::new(field.input_field(), input_index); + +// let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); +// real_fields.push(real_field); +// } +// let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; +// let processor = DissectProcessor { +// fields: real_fields, +// patterns, +// ignore_missing: self.ignore_missing, +// append_separator: self.append_separator, +// }; +// Ok(ProcessorKind::Dissect(processor)) +// } +// } #[derive(Debug, Default)] pub struct DissectProcessor { - fields: Vec, + fields: Fields, patterns: Vec, ignore_missing: bool, @@ -639,33 +537,37 @@ pub struct DissectProcessor { } impl DissectProcessor { - fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { + fn process_name_value<'a, 'b>( + name: &'a Name, + value: String, + appends: &'b mut HashMap<&'a String, Vec<(String, u32)>>, + map: &mut Vec<(&'a String, Value)>, + ) { + match name.start_modifier { + Some(StartModifier::NamedSkip) => { + // do nothing, ignore this match + } + Some(StartModifier::Append(order)) => { + appends + .entry(&name.name) + .or_default() + .push((value, order.unwrap_or_default())); + } + Some(_) => { + // do nothing, ignore MapKey and MapVal + // because transform can know the key name + } + None => { + map.push((&name.name, Value::String(value))); + } + } + } + + fn process_pattern(&self, chs: &[char], pattern: &Pattern) -> Result> { let mut map = Vec::new(); let mut pos = 0; - let mut appends: HashMap> = HashMap::new(); - - let mut process_name_value = |name: &Name, value: String| { - let name_index = name.index; - match name.start_modifier { - Some(StartModifier::NamedSkip) => { - // do nothing, ignore this match - } - Some(StartModifier::Append(order)) => { - appends - .entry(name_index) - .or_default() - .push((value, order.unwrap_or_default())); - } - Some(_) => { - // do nothing, ignore MapKey and MapVal - // because transform can know the key name - } - None => { - map.push((name_index, Value::String(value))); - } - } - }; + let mut appends: HashMap<&String, Vec<(String, u32)>> = HashMap::new(); for i in 0..pattern.len() { let this_part = &pattern[i]; @@ -701,7 +603,7 @@ impl DissectProcessor { // if Name part is the last part, then the rest of the input is the value (Part::Name(name), None) => { let value = chs[pos..].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } // if Name part, and next part is Split, then find the matched value of the name @@ -717,7 +619,7 @@ impl DissectProcessor { if !name.is_name_empty() { let value = chs[pos..end].iter().collect::(); - process_name_value(name, value); + Self::process_name_value(name, value, &mut appends, &mut map); } if name.is_end_modifier_set() { @@ -745,10 +647,10 @@ impl DissectProcessor { } } - Ok(map) + Ok(map.into_iter().map(|(k, v)| (k.to_string(), v)).collect()) } - fn process(&self, val: &str) -> Result> { + fn process(&self, val: &str) -> Result> { let chs = val.chars().collect::>(); for pattern in &self.patterns { @@ -760,7 +662,7 @@ impl DissectProcessor { } } -impl 
TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -782,7 +684,7 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { fields = yaml_new_fields(v, FIELDS_NAME)?; } PATTERN_NAME => { - let pattern: PatternInfo = yaml_parse_string(v, PATTERN_NAME)?; + let pattern: Pattern = yaml_parse_string(v, PATTERN_NAME)?; patterns = vec![pattern]; } PATTERNS_NAME => { @@ -797,13 +699,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for DissectProcessorBuilder { _ => {} } } - let output_keys = Self::build_output_keys(&patterns); - let builder = DissectProcessorBuilder { + // let output_keys = Self::build_output_keys(&patterns); + let builder = DissectProcessor { fields, patterns, ignore_missing, append_separator, - output_keys, }; Ok(builder) @@ -819,21 +720,21 @@ impl Processor for DissectProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { - let index = field.input_index(); + let index = field.input_field(); match val.get(index) { Some(Value::String(val_str)) => { let r = self.process(val_str)?; for (k, v) in r { - val[k] = v; + val.insert(k, v); } } Some(Value::Null) | None => { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -859,26 +760,19 @@ fn is_valid_char(ch: char) -> bool { mod tests { use ahash::HashMap; - use super::{DissectProcessor, EndModifier, NameInfo, PartInfo, PatternInfo, StartModifier}; - use crate::etl::processor::dissect::DissectProcessorBuilder; + use super::{DissectProcessor, EndModifier, Name, Part, StartModifier}; + use crate::etl::processor::dissect::Pattern; use crate::etl::value::Value; fn assert(pattern_str: &str, input: &str, expected: HashMap) { let chs = input.chars().collect::>(); - let pattern_infos: Vec = vec![pattern_str.parse().unwrap()]; - let output_keys: Vec = DissectProcessorBuilder::build_output_keys(&pattern_infos) - .into_iter() - .collect(); - let pattern = - DissectProcessorBuilder::build_patterns_from_pattern_infos(pattern_infos, &output_keys) - .unwrap(); + let patterns: Vec = vec![pattern_str.parse().unwrap()]; let processor = DissectProcessor::default(); let result: HashMap = processor - .process_pattern(&chs, &pattern[0]) + .process_pattern(&chs, &patterns[0]) .unwrap() .into_iter() - .map(|(k, v)| (output_keys[k].to_string(), v)) .collect(); assert_eq!(result, expected, "pattern: {}", pattern_str); @@ -889,28 +783,28 @@ mod tests { let cases = [( "%{clientip} %{ident} %{auth} [%{timestamp}] \"%{verb} %{request} HTTP/%{httpversion}\" %{status} %{size}", vec![ - PartInfo::Name("clientip".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("ident".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("auth".into()), - PartInfo::Split(" [".into()), - PartInfo::Name("timestamp".into()), - PartInfo::Split("] \"".into()), - PartInfo::Name("verb".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("request".into()), - PartInfo::Split(" HTTP/".into()), - PartInfo::Name("httpversion".into()), - PartInfo::Split("\" ".into()), - PartInfo::Name("status".into()), - PartInfo::Split(" ".into()), - PartInfo::Name("size".into()), + Part::Name("clientip".into()), + Part::Split(" ".into()), + Part::Name("ident".into()), + Part::Split(" ".into()), + 
Part::Name("auth".into()), + Part::Split(" [".into()), + Part::Name("timestamp".into()), + Part::Split("] \"".into()), + Part::Name("verb".into()), + Part::Split(" ".into()), + Part::Name("request".into()), + Part::Split(" HTTP/".into()), + Part::Name("httpversion".into()), + Part::Split("\" ".into()), + Part::Name("status".into()), + Part::Split(" ".into()), + Part::Name("size".into()), ], )]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -921,13 +815,13 @@ mod tests { ( "%{} %{}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: None, @@ -937,61 +831,61 @@ mod tests { ( "%{ts->} %{level}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split(" ".into()), - PartInfo::Name("level".into()), + Part::Split(" ".into()), + Part::Name("level".into()), ], ), ( "[%{ts}]%{->}[%{level}]", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), - PartInfo::Name(NameInfo { + Part::Split("]".into()), + Part::Name(Name { name: "".into(), start_modifier: None, end_modifier: Some(EndModifier), }), - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "%{+name} %{+name} %{+name} %{+name}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(None)), end_modifier: None, @@ -1001,25 +895,25 @@ mod tests { ( "%{+name/2} %{+name/4} %{+name/3} %{+name/1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(2))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(4))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(3))), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "name".into(), start_modifier: Some(StartModifier::Append(Some(1))), end_modifier: None, @@ -1029,67 +923,67 @@ mod tests { ( "%{clientip} %{?ident} %{?auth} 
[%{timestamp}]", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "clientip".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "ident".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "auth".into(), start_modifier: Some(StartModifier::NamedSkip), end_modifier: None, }), - PartInfo::Split(" [".into()), - PartInfo::Name(NameInfo { + Part::Split(" [".into()), + Part::Name(Name { name: "timestamp".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("]".into()), + Part::Split("]".into()), ], ), ( "[%{ts}] [%{level}] %{*p1}:%{&p1} %{*p2}:%{&p2}", vec![ - PartInfo::Split("[".into()), - PartInfo::Name(NameInfo { + Part::Split("[".into()), + Part::Name(Name { name: "ts".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] [".into()), - PartInfo::Name(NameInfo { + Part::Split("] [".into()), + Part::Name(Name { name: "level".into(), start_modifier: None, end_modifier: None, }), - PartInfo::Split("] ".into()), - PartInfo::Name(NameInfo { + Part::Split("] ".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(" ".into()), - PartInfo::Name(NameInfo { + Part::Split(" ".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p2".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, @@ -1099,13 +993,13 @@ mod tests { ( "%{&p1}:%{*p1}", vec![ - PartInfo::Name(NameInfo { + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapVal), end_modifier: None, }), - PartInfo::Split(":".into()), - PartInfo::Name(NameInfo { + Part::Split(":".into()), + Part::Name(Name { name: "p1".into(), start_modifier: Some(StartModifier::MapKey), end_modifier: None, @@ -1115,7 +1009,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let p: PatternInfo = pattern.parse().unwrap(); + let p: Pattern = pattern.parse().unwrap(); assert_eq!(p.parts, expected); } } @@ -1195,7 +1089,7 @@ mod tests { ]; for (pattern, expected) in cases.into_iter() { - let err = pattern.parse::().unwrap_err(); + let err = pattern.parse::().unwrap_err(); assert_eq!(err.to_string(), expected); } } diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index a6ffa86d1689..e6988c773f52 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,21 +18,20 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; -use ahash::{HashSet, HashSetExt}; use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; +use super::IntermediateStatus; use crate::etl::error::{ Error, KeyMustBeStringSnafu, ProcessorExpectStringSnafu, ProcessorMissingFieldSnafu, RegexNamedGroupNotFoundSnafu, RegexNoValidFieldSnafu, RegexNoValidPatternSnafu, RegexSnafu, Result, }; -use crate::etl::field::{Fields, InputField, OneInputMultiOutputField}; 
-use crate::etl::find_key_index; +use crate::etl::field::Fields; use crate::etl::processor::{ - yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, - ProcessorBuilder, ProcessorKind, FIELDS_NAME, FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, + yaml_bool, yaml_new_field, yaml_new_fields, yaml_string, yaml_strings, Processor, FIELDS_NAME, + FIELD_NAME, IGNORE_MISSING_NAME, PATTERN_NAME, }; use crate::etl::value::Value; @@ -83,113 +82,7 @@ impl std::str::FromStr for GroupRegex { } } -#[derive(Debug, Default)] -pub struct RegexProcessorBuilder { - fields: Fields, - patterns: Vec, - ignore_missing: bool, - output_keys: HashSet, -} - -impl ProcessorBuilder for RegexProcessorBuilder { - fn output_keys(&self) -> HashSet<&str> { - self.output_keys.iter().map(|k| k.as_str()).collect() - } - - fn input_keys(&self) -> HashSet<&str> { - self.fields.iter().map(|f| f.input_field()).collect() - } - - fn build(self, intermediate_keys: &[String]) -> Result { - self.build(intermediate_keys).map(ProcessorKind::Regex) - } -} - -impl RegexProcessorBuilder { - fn check(self) -> Result { - if self.fields.is_empty() { - return RegexNoValidFieldSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - if self.patterns.is_empty() { - return RegexNoValidPatternSnafu { - processor: PROCESSOR_REGEX, - } - .fail(); - } - - Ok(self) - } - - fn build_group_output_info( - group_regex: &GroupRegex, - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result> { - group_regex - .groups - .iter() - .map(|g| { - let key = generate_key(om_field.target_prefix(), g); - let index = find_key_index(intermediate_keys, &key, "regex"); - index.map(|index| OutPutInfo { - final_key: key, - group_name: g.to_string(), - index, - }) - }) - .collect::>>() - } - - fn build_group_output_infos( - patterns: &[GroupRegex], - om_field: &OneInputMultiOutputField, - intermediate_keys: &[String], - ) -> Result>> { - patterns - .iter() - .map(|group_regex| { - Self::build_group_output_info(group_regex, om_field, intermediate_keys) - }) - .collect::>>() - } - - fn build_output_info( - real_fields: &[OneInputMultiOutputField], - patterns: &[GroupRegex], - intermediate_keys: &[String], - ) -> Result { - let inner = real_fields - .iter() - .map(|om_field| Self::build_group_output_infos(patterns, om_field, intermediate_keys)) - .collect::>>(); - inner.map(|inner| RegexProcessorOutputInfo { inner }) - } - - fn build(self, intermediate_keys: &[String]) -> Result { - let mut real_fields = vec![]; - for field in self.fields.into_iter() { - let input_index = find_key_index(intermediate_keys, field.input_field(), "regex")?; - let input_field_info = InputField::new(field.input_field(), input_index); - - let input = OneInputMultiOutputField::new(input_field_info, field.target_field); - real_fields.push(input); - } - let output_info = Self::build_output_info(&real_fields, &self.patterns, intermediate_keys)?; - Ok(RegexProcessor { - // fields: Fields::one(Field::new("test".to_string())), - fields: real_fields, - patterns: self.patterns, - output_info, - ignore_missing: self.ignore_missing, - }) - } -} - -impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { +impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessor { type Error = Error; fn try_from(value: &yaml_rust::yaml::Hash) -> Result { @@ -226,61 +119,44 @@ impl TryFrom<&yaml_rust::yaml::Hash> for RegexProcessorBuilder { } } - let pattern_output_keys = patterns - .iter() - .flat_map(|pattern| pattern.groups.iter()) - .collect::>(); - let mut 
output_keys = HashSet::new(); - for field in fields.iter() { - for x in pattern_output_keys.iter() { - output_keys.insert(generate_key(field.target_or_input_field(), x)); - } - } - - let processor_builder = RegexProcessorBuilder { + let processor_builder = RegexProcessor { fields, patterns, ignore_missing, - output_keys, }; processor_builder.check() } } -#[derive(Debug, Default)] -struct OutPutInfo { - final_key: String, - group_name: String, - index: usize, -} - -#[derive(Debug, Default)] -struct RegexProcessorOutputInfo { - pub inner: Vec>>, -} - -impl RegexProcessorOutputInfo { - fn get_output_index( - &self, - field_index: usize, - pattern_index: usize, - group_index: usize, - ) -> usize { - self.inner[field_index][pattern_index][group_index].index - } -} /// only support string value /// if no value found from a pattern, the target_field will be ignored #[derive(Debug, Default)] pub struct RegexProcessor { - fields: Vec, - output_info: RegexProcessorOutputInfo, + fields: Fields, patterns: Vec, ignore_missing: bool, } impl RegexProcessor { + fn check(self) -> Result { + if self.fields.is_empty() { + return RegexNoValidFieldSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + if self.patterns.is_empty() { + return RegexNoValidPatternSnafu { + processor: PROCESSOR_REGEX, + } + .fail(); + } + + Ok(self) + } + fn try_with_patterns(&mut self, patterns: Vec) -> Result<()> { let mut rs = vec![]; for pattern in patterns { @@ -291,21 +167,13 @@ impl RegexProcessor { Ok(()) } - fn process( - &self, - val: &str, - gr: &GroupRegex, - index: (usize, usize), - ) -> Result> { + fn process<'a>(&self, val: &str, gr: &'a GroupRegex) -> Result> { let mut result = Vec::new(); if let Some(captures) = gr.regex.captures(val) { - for (group_index, group) in gr.groups.iter().enumerate() { + for group in gr.groups.iter() { if let Some(capture) = captures.name(group) { let value = capture.as_str().to_string(); - let index = self - .output_info - .get_output_index(index.0, index.1, group_index); - result.push((index, Value::String(value))); + result.push((group, Value::String(value))); } } } @@ -322,9 +190,9 @@ impl Processor for RegexProcessor { self.ignore_missing } - fn exec_mut(&self, val: &mut Vec) -> Result<()> { - for (field_index, field) in self.fields.iter().enumerate() { - let index = field.input_index(); + fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { + for field in self.fields.iter() { + let index = field.input_field(); let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { @@ -336,8 +204,8 @@ impl Processor for RegexProcessor { // val[output_index] = result; // } // } - for (gr_index, gr) in self.patterns.iter().enumerate() { - let result = self.process(s.as_str(), gr, (field_index, gr_index))?; + for gr in self.patterns.iter() { + let result = self.process(s.as_str(), gr)?; if !result.is_empty() { match result_list.as_mut() { None => { @@ -354,7 +222,7 @@ impl Processor for RegexProcessor { if !self.ignore_missing { return ProcessorMissingFieldSnafu { processor: self.kind(), - field: field.input_name(), + field: field.input_field(), } .fail(); } @@ -372,7 +240,7 @@ impl Processor for RegexProcessor { None => {} Some(result_list) => { for (output_index, result) in result_list { - val[output_index] = result; + val.insert(generate_key(index, output_index), result); } } } @@ -388,7 +256,7 @@ mod tests { use ahash::{HashMap, HashMapExt}; use itertools::Itertools; - use crate::etl::processor::regex::RegexProcessorBuilder; + use 
crate::etl::processor::regex::RegexProcessor; use crate::etl::value::{Map, Value}; #[test] @@ -402,23 +270,21 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = ["a".to_string(), "a_ar".to_string()]; - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); // single field (with prefix), multiple patterns let result = processor - .process("123", &processor.patterns[0], (0, 0)) + .process("123", &processor.patterns[0]) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (k.to_string(), v)) .collect(); let map = Map { values: result }; let v = Map { - values: vec![("a_ar".to_string(), Value::String("1".to_string()))] + values: vec![("ar".to_string(), Value::String("1".to_string()))] .into_iter() .collect(), }; @@ -464,30 +330,23 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - let intermediate_keys = [ - "breadcrumbs", - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); let mut result = BTreeMap::new(); - for (index, pattern) in processor.patterns.iter().enumerate() { + for pattern in processor.patterns.iter() { let r = processor - .process(&breadcrumbs_str, pattern, (0, index)) + .process(&breadcrumbs_str, pattern) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (k.to_string(), v)) .collect::>(); result.extend(r); } - let map = Map { values: result }; + let map = Map { + values: result + .into_iter() + .map(|(k, v)| (format!("breadcrumbs_{}", k), v)) + .collect(), + }; assert_eq!(temporary_map, map); } @@ -515,67 +374,21 @@ ignore_missing: false"#; .pop() .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); - let builder = RegexProcessorBuilder::try_from(processor_yaml_hash).unwrap(); - - let intermediate_keys = [ - "breadcrumbs_parent", - "breadcrumbs_edge", - "breadcrumbs_origin", - "breadcrumbs_peer", - "breadcrumbs_wrapper", - "edge_ip", - "edge_request_id", - "edge_request_end_time", - "edge_turn_around_time", - "edge_dns_lookup_time", - "edge_geo", - "edge_asn", - "origin_ip", - "origin_request_id", - "origin_request_end_time", - "origin_turn_around_time", - "origin_dns_lookup_time", - "origin_geo", - "origin_asn", - "peer_ip", - "peer_request_id", - "peer_request_end_time", - "peer_turn_around_time", - "peer_dns_lookup_time", - "peer_geo", - "peer_asn", - "parent_ip", - "parent_request_id", - "parent_request_end_time", - "parent_turn_around_time", - "parent_dns_lookup_time", - "parent_geo", - "parent_asn", - "wrapper_ip", - "wrapper_request_id", - "wrapper_request_end_time", - "wrapper_turn_around_time", - "wrapper_dns_lookup_time", - "wrapper_geo", - "wrapper_asn", - ] - .iter() - .map(|k| k.to_string()) - .collect_vec(); - let processor = builder.build(&intermediate_keys).unwrap(); + let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); let mut result = HashMap::new(); - for (field_index, field) in processor.fields.iter().enumerate() { - for 
(pattern_index, pattern) in processor.patterns.iter().enumerate() { + for field in processor.fields.iter() { + for pattern in processor.patterns.iter() { let s = temporary_map - .get(field.input_name()) + .get(field.input_field()) .unwrap() .to_str_value(); + let prefix = field.target_or_input_field(); let r = processor - .process(&s, pattern, (field_index, pattern_index)) + .process(&s, pattern) .unwrap() .into_iter() - .map(|(k, v)| (intermediate_keys[k].clone(), v)) + .map(|(k, v)| (format!("{}_{}", prefix, k), v)) .collect::>(); result.extend(r); } diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index a8a7daaf5c6f..93b0897db0eb 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -93,6 +93,8 @@ transform: assert_eq!(output.schema, *EXPECTED_SCHEMA); + println!("{:?}", output.rows); + assert_eq!( output.rows[0].values[0].value_data, Some(StringValue("123".to_string())) diff --git a/src/servers/src/http/event.rs b/src/servers/src/http/event.rs index 978891078cce..d6d8e89a56ea 100644 --- a/src/servers/src/http/event.rs +++ b/src/servers/src/http/event.rs @@ -573,7 +573,7 @@ fn extract_pipeline_value_by_content_type( ct if ct == *TEXT_CONTENT_TYPE || ct == *TEXT_UTF8_CONTENT_TYPE => payload .lines() .filter(|line| !line.is_empty()) - .map(|line| Value::String(line.to_string())) + .map(|line| json!({"message": line})) .collect(), _ => UnsupportedContentTypeSnafu { content_type }.fail()?, }) diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index d5880b7eae08..4321e2a9d950 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -1663,7 +1663,7 @@ pub async fn test_plain_text_ingestion(store_type: StorageType) { processors: - dissect: fields: - - line + - message patterns: - "%{+ts} %{+ts} %{content}" - date: From 448f94dfc99d5aaf5e2f31c3854e48d502601b55 Mon Sep 17 00:00:00 2001 From: paomian Date: Mon, 27 Jan 2025 11:51:46 +0800 Subject: [PATCH 29/32] chore: fix test --- src/pipeline/src/etl/processor.rs | 2 +- src/pipeline/src/etl/processor/cmcd.rs | 2 +- src/pipeline/src/etl/processor/dissect.rs | 4 ++-- src/pipeline/src/etl/processor/regex.rs | 5 +++-- src/pipeline/src/etl/transform.rs | 4 +--- src/pipeline/tests/regex.rs | 2 -- 6 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/pipeline/src/etl/processor.rs b/src/pipeline/src/etl/processor.rs index 376282afecef..005feca3794e 100644 --- a/src/pipeline/src/etl/processor.rs +++ b/src/pipeline/src/etl/processor.rs @@ -91,7 +91,7 @@ pub trait Processor: std::fmt::Debug + Send + Sync + 'static { pub enum ProcessorKind { Cmcd(CmcdProcessor), Csv(CsvProcessor), - // Dissect(DissectProcessor), + Dissect(DissectProcessor), Gsub(GsubProcessor), Join(JoinProcessor), Letter(LetterProcessor), diff --git a/src/pipeline/src/etl/processor/cmcd.rs b/src/pipeline/src/etl/processor/cmcd.rs index 8d8b546f7216..a5da69d0be42 100644 --- a/src/pipeline/src/etl/processor/cmcd.rs +++ b/src/pipeline/src/etl/processor/cmcd.rs @@ -257,7 +257,7 @@ impl Processor for CmcdProcessor { match val.get(name) { Some(Value::String(s)) => { - let results = self.parse(name, s)?; + let results = self.parse(field.target_or_input_field(), s)?; val.extend(results); } Some(Value::Null) | None => { diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index 5755a0aeb8a5..b35884b82671 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -537,10 +537,10 @@ pub struct 
DissectProcessor { } impl DissectProcessor { - fn process_name_value<'a, 'b>( + fn process_name_value<'a>( name: &'a Name, value: String, - appends: &'b mut HashMap<&'a String, Vec<(String, u32)>>, + appends: &mut HashMap<&'a String, Vec<(String, u32)>>, map: &mut Vec<(&'a String, Value)>, ) { match name.start_modifier { diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index e6988c773f52..fad905479a83 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -193,6 +193,7 @@ impl Processor for RegexProcessor { fn exec_mut(&self, val: &mut IntermediateStatus) -> Result<()> { for field in self.fields.iter() { let index = field.input_field(); + let prefix = field.target_or_input_field(); let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { @@ -239,8 +240,8 @@ impl Processor for RegexProcessor { match result_list { None => {} Some(result_list) => { - for (output_index, result) in result_list { - val.insert(generate_key(index, output_index), result); + for (output_key, result) in result_list { + val.insert(generate_key(prefix, output_key), result); } } } diff --git a/src/pipeline/src/etl/transform.rs b/src/pipeline/src/etl/transform.rs index 7191d272069c..e3039d6c7ac4 100644 --- a/src/pipeline/src/etl/transform.rs +++ b/src/pipeline/src/etl/transform.rs @@ -128,9 +128,7 @@ impl TryFrom<&Vec> for Transforms { all_required_keys.sort(); - Ok(Transforms { - transforms: transforms, - }) + Ok(Transforms { transforms }) } } diff --git a/src/pipeline/tests/regex.rs b/src/pipeline/tests/regex.rs index 93b0897db0eb..a8a7daaf5c6f 100644 --- a/src/pipeline/tests/regex.rs +++ b/src/pipeline/tests/regex.rs @@ -93,8 +93,6 @@ transform: assert_eq!(output.schema, *EXPECTED_SCHEMA); - println!("{:?}", output.rows); - assert_eq!( output.rows[0].values[0].value_data, Some(StringValue("123".to_string())) From b5d2969be831eb66c75a53830e7d1a4330ec1a69 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Mon, 27 Jan 2025 16:56:21 +0800 Subject: [PATCH 30/32] test: add integration test for http pipeline --- tests-integration/tests/http.rs | 194 +++++++++++++++++++++++++++++++- 1 file changed, 193 insertions(+), 1 deletion(-) diff --git a/tests-integration/tests/http.rs b/tests-integration/tests/http.rs index 4321e2a9d950..413a656e6004 100644 --- a/tests-integration/tests/http.rs +++ b/tests-integration/tests/http.rs @@ -93,6 +93,7 @@ macro_rules! 
http_tests { test_plain_text_ingestion, test_identify_pipeline, test_identify_pipeline_with_flatten, + test_pipeline_dispatcher, test_otlp_metrics, test_otlp_traces, @@ -1359,6 +1360,197 @@ pub async fn test_identify_pipeline(store_type: StorageType) { guard.remove_all().await; } +pub async fn test_pipeline_dispatcher(storage_type: StorageType) { + common_telemetry::init_default_ut_logging(); + let (app, mut guard) = + setup_test_http_app_with_frontend(storage_type, "test_pipeline_dispatcher").await; + + // handshake + let client = TestClient::new(app).await; + + let root_pipeline = r#" +processors: + - date: + field: time + formats: + - "%Y-%m-%d %H:%M:%S%.3f" + ignore_missing: true + +dispatcher: + field: type + rules: + - value: http + table_part: http + pipeline: http + - value: db + table_part: db + - value: not_found + table_part: not_found + pipeline: not_found + +transform: + - fields: + - id1, id1_root + - id2, id2_root + type: int32 + - fields: + - type + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + let http_pipeline = r#" +processors: + +transform: + - fields: + - id1, id1_http + - id2, id2_http + type: int32 + - fields: + - log + - logger + type: string + - field: time + type: time + index: timestamp +"#; + + // 1. create pipeline + let res = client + .post("/v1/events/pipelines/root") + .header("Content-Type", "application/x-yaml") + .body(root_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + let res = client + .post("/v1/events/pipelines/http") + .header("Content-Type", "application/x-yaml") + .body(http_pipeline) + .send() + .await; + + assert_eq!(res.status(), StatusCode::OK); + + // 2. write data + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "http", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "db", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "api", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::OK); + + let data_body = r#" +[ + { + "id1": "2436", + "id2": "2528", + "logger": "INTERACT.MANAGER", + "type": "not_found", + "time": "2024-05-25 20:16:37.217", + "log": "ClusterAdapter:enter sendTextDataToCluster\\n" + } +] +"#; + let res = client + .post("/v1/events/logs?db=public&table=logs1&pipeline_name=root") + .header("Content-Type", "application/json") + .body(data_body) + .send() + .await; + assert_eq!(res.status(), StatusCode::BAD_REQUEST); + + // 3. 
verify data + let expected = "[[2436]]"; + validate_data( + "test_dispatcher_pipeline default table", + &client, + "select id1_root from logs1", + expected, + ) + .await; + + let expected = "[[2436]]"; + validate_data( + "test_dispatcher_pipeline http table", + &client, + "select id1_http from logs1_http", + expected, + ) + .await; + + let expected = "[[\"2436\"]]"; + validate_data( + "test_dispatcher_pipeline db table", + &client, + "select id1 from logs1_db", + expected, + ) + .await; + + guard.remove_all().await; +} + pub async fn test_identify_pipeline_with_flatten(store_type: StorageType) { common_telemetry::init_default_ut_logging(); let (app, mut guard) = @@ -2248,7 +2440,7 @@ async fn validate_data(test_name: &str, client: &TestClient, sql: &str, expected .get(format!("/v1/sql?sql={sql}").as_str()) .send() .await; - assert_eq!(res.status(), StatusCode::OK); + assert_eq!(res.status(), StatusCode::OK, "validate {test_name} fail"); let resp = res.text().await; let v = get_rows_from_output(&resp); From fc4b3f1f9801d262a483a6b8eb40bf110698f3f4 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 28 Jan 2025 17:56:41 +0800 Subject: [PATCH 31/32] refactor: improve regex pipeline --- src/pipeline/src/etl/processor/dissect.rs | 119 ---------------------- src/pipeline/src/etl/processor/regex.rs | 107 ++++++------------- 2 files changed, 29 insertions(+), 197 deletions(-) diff --git a/src/pipeline/src/etl/processor/dissect.rs b/src/pipeline/src/etl/processor/dissect.rs index b35884b82671..9ac28f7bf09e 100644 --- a/src/pipeline/src/etl/processor/dissect.rs +++ b/src/pipeline/src/etl/processor/dissect.rs @@ -407,125 +407,6 @@ impl Pattern { } } -// impl std::fmt::Display for PatternInfo { -// fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { -// write!(f, "{}", self.origin) -// } -// } - -// #[derive(Debug, Default)] -// pub struct DissectProcessorBuilder { -// fields: Fields, -// patterns: Vec, -// ignore_missing: bool, -// append_separator: Option, -// output_keys: HashSet, -// } - -// impl DissectProcessorBuilder { -// fn build_output_keys(patterns: &[PatternInfo]) -> HashSet { -// patterns -// .iter() -// .flat_map(|pattern| pattern.iter()) -// .filter_map(|p| match p { -// PartInfo::Name(name) => { -// if !name.is_empty() -// && (name.start_modifier.is_none() -// || name -// .start_modifier -// .as_ref() -// .is_some_and(|x| matches!(x, StartModifier::Append(_)))) -// { -// Some(name.to_string()) -// } else { -// None -// } -// } -// _ => None, -// }) -// .collect() -// } - -// fn part_info_to_part(part_info: PartInfo, intermediate_keys: &[String]) -> Result { -// match part_info { -// PartInfo::Split(s) => Ok(Part::Split(s)), -// PartInfo::Name(n) => match n.start_modifier { -// None | Some(StartModifier::Append(_)) => { -// let index = find_key_index(intermediate_keys, &n.name, "dissect")?; -// Ok(Part::Name(Name { -// name: n.name, -// index, -// start_modifier: n.start_modifier, -// end_modifier: n.end_modifier, -// })) -// } -// _ => Ok(Part::Name(Name { -// name: n.name, -// index: usize::MAX, -// start_modifier: n.start_modifier, -// end_modifier: n.end_modifier, -// })), -// }, -// } -// } - -// fn pattern_info_to_pattern( -// pattern_info: PatternInfo, -// intermediate_keys: &[String], -// ) -> Result { -// let original = pattern_info.origin; -// let pattern = pattern_info -// .parts -// .into_iter() -// .map(|part_info| Self::part_info_to_part(part_info, intermediate_keys)) -// .collect::>>()?; -// Ok(Pattern { -// origin: original, -// parts: pattern, -// }) -// } - 
-// fn build_patterns_from_pattern_infos( -// patterns: Vec, -// intermediate_keys: &[String], -// ) -> Result> { -// patterns -// .into_iter() -// .map(|pattern_info| Self::pattern_info_to_pattern(pattern_info, intermediate_keys)) -// .collect() -// } -// } - -// impl ProcessorBuilder for DissectProcessorBuilder { -// fn output_keys(&self) -> HashSet<&str> { -// self.output_keys.iter().map(|s| s.as_str()).collect() -// } - -// fn input_keys(&self) -> HashSet<&str> { -// self.fields.iter().map(|f| f.input_field()).collect() -// } - -// fn build(self, intermediate_keys: &[String]) -> Result { -// let mut real_fields = vec![]; -// for field in self.fields.into_iter() { -// let input_index = find_key_index(intermediate_keys, field.input_field(), "dissect")?; - -// let input_field_info = InputField::new(field.input_field(), input_index); - -// let real_field = OneInputMultiOutputField::new(input_field_info, field.target_field); -// real_fields.push(real_field); -// } -// let patterns = Self::build_patterns_from_pattern_infos(self.patterns, intermediate_keys)?; -// let processor = DissectProcessor { -// fields: real_fields, -// patterns, -// ignore_missing: self.ignore_missing, -// append_separator: self.append_separator, -// }; -// Ok(ProcessorKind::Dissect(processor)) -// } -// } - #[derive(Debug, Default)] pub struct DissectProcessor { fields: Fields, diff --git a/src/pipeline/src/etl/processor/regex.rs b/src/pipeline/src/etl/processor/regex.rs index fad905479a83..27f30f65d9ae 100644 --- a/src/pipeline/src/etl/processor/regex.rs +++ b/src/pipeline/src/etl/processor/regex.rs @@ -18,6 +18,8 @@ const PATTERNS_NAME: &str = "patterns"; pub(crate) const PROCESSOR_REGEX: &str = "regex"; +use std::collections::BTreeMap; + use lazy_static::lazy_static; use regex::Regex; use snafu::{OptionExt, ResultExt}; @@ -167,13 +169,15 @@ impl RegexProcessor { Ok(()) } - fn process<'a>(&self, val: &str, gr: &'a GroupRegex) -> Result> { - let mut result = Vec::new(); - if let Some(captures) = gr.regex.captures(val) { - for group in gr.groups.iter() { - if let Some(capture) = captures.name(group) { - let value = capture.as_str().to_string(); - result.push((group, Value::String(value))); + fn process(&self, prefix: &str, val: &str) -> Result> { + let mut result = BTreeMap::new(); + for gr in self.patterns.iter() { + if let Some(captures) = gr.regex.captures(val) { + for group in gr.groups.iter() { + if let Some(capture) = captures.name(group) { + let value = capture.as_str().to_string(); + result.insert(generate_key(prefix, group), Value::String(value)); + } } } } @@ -194,30 +198,10 @@ impl Processor for RegexProcessor { for field in self.fields.iter() { let index = field.input_field(); let prefix = field.target_or_input_field(); - let mut result_list = None; match val.get(index) { Some(Value::String(s)) => { - // we get rust borrow checker error here - // for (gr_index, gr) in self.patterns.iter().enumerate() { - // let result_list = self.process(s.as_str(), gr, (field_index, gr_index))?; - // for (output_index, result) in result_list { - //cannot borrow `*val` as mutable because it is also borrowed as immutable mutable borrow occurs here - // val[output_index] = result; - // } - // } - for gr in self.patterns.iter() { - let result = self.process(s.as_str(), gr)?; - if !result.is_empty() { - match result_list.as_mut() { - None => { - result_list = Some(result); - } - Some(result_list) => { - result_list.extend(result); - } - } - } - } + let result = self.process(prefix, s)?; + val.extend(result); } 
Some(Value::Null) | None => { if !self.ignore_missing { @@ -236,15 +220,6 @@ impl Processor for RegexProcessor { .fail(); } } - // safety here - match result_list { - None => {} - Some(result_list) => { - for (output_key, result) in result_list { - val.insert(generate_key(prefix, output_key), result); - } - } - } } Ok(()) @@ -275,17 +250,12 @@ ignore_missing: false"#; // single field (with prefix), multiple patterns - let result = processor - .process("123", &processor.patterns[0]) - .unwrap() - .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect(); + let result = processor.process("a", "123").unwrap(); let map = Map { values: result }; let v = Map { - values: vec![("ar".to_string(), Value::String("1".to_string()))] + values: vec![("a_ar".to_string(), Value::String("1".to_string()))] .into_iter() .collect(), }; @@ -302,7 +272,7 @@ ignore_missing: false"#; let cw = "[c=w,n=US_CA_SANJOSE,o=55155]"; let breadcrumbs_str = [cc, cg, co, cp, cw].iter().join(","); - let values = [ + let temporary_map: BTreeMap = [ ("breadcrumbs_parent", Value::String(cc.to_string())), ("breadcrumbs_edge", Value::String(cg.to_string())), ("breadcrumbs_origin", Value::String(co.to_string())), @@ -312,7 +282,6 @@ ignore_missing: false"#; .into_iter() .map(|(k, v)| (k.to_string(), v)) .collect(); - let temporary_map = Map { values }; { // single field (with prefix), multiple patterns @@ -332,23 +301,10 @@ ignore_missing: false"#; .unwrap(); let processor_yaml_hash = processor_yaml.as_hash().unwrap(); let processor = RegexProcessor::try_from(processor_yaml_hash).unwrap(); - let mut result = BTreeMap::new(); - for pattern in processor.patterns.iter() { - let r = processor - .process(&breadcrumbs_str, pattern) - .unwrap() - .into_iter() - .map(|(k, v)| (k.to_string(), v)) - .collect::>(); - result.extend(r); - } - let map = Map { - values: result - .into_iter() - .map(|(k, v)| (format!("breadcrumbs_{}", k), v)) - .collect(), - }; - assert_eq!(temporary_map, map); + + let result = processor.process("breadcrumbs", &breadcrumbs_str).unwrap(); + + assert_eq!(temporary_map, result); } { @@ -379,20 +335,15 @@ ignore_missing: false"#; let mut result = HashMap::new(); for field in processor.fields.iter() { - for pattern in processor.patterns.iter() { - let s = temporary_map - .get(field.input_field()) - .unwrap() - .to_str_value(); - let prefix = field.target_or_input_field(); - let r = processor - .process(&s, pattern) - .unwrap() - .into_iter() - .map(|(k, v)| (format!("{}_{}", prefix, k), v)) - .collect::>(); - result.extend(r); - } + let s = temporary_map + .get(field.input_field()) + .unwrap() + .to_str_value(); + let prefix = field.target_or_input_field(); + + let r = processor.process(prefix, &s).unwrap(); + + result.extend(r); } let new_values = vec![ From 592f2f452ba76e552d562b9ec3cd63c052191588 Mon Sep 17 00:00:00 2001 From: Ning Sun Date: Tue, 28 Jan 2025 23:35:16 +0800 Subject: [PATCH 32/32] refactor: improve required field check --- src/pipeline/src/etl/processor/gsub.rs | 44 ++++++--------------- src/pipeline/src/etl/processor/join.rs | 22 +++-------- src/pipeline/src/etl/processor/json_path.rs | 32 +++++++-------- 3 files changed, 33 insertions(+), 65 deletions(-) diff --git a/src/pipeline/src/etl/processor/gsub.rs b/src/pipeline/src/etl/processor/gsub.rs index dbdb9c5c3047..7f0f601f44f3 100644 --- a/src/pipeline/src/etl/processor/gsub.rs +++ b/src/pipeline/src/etl/processor/gsub.rs @@ -32,35 +32,17 @@ pub(crate) const PROCESSOR_GSUB: &str = "gsub"; const REPLACEMENT_NAME: &str = "replacement"; /// A 
processor to replace all matches of a pattern in string by a replacement, only support string value, and array string value -#[derive(Debug, Default)] +#[derive(Debug)] pub struct GsubProcessor { fields: Fields, - pattern: Option<Regex>, - replacement: Option<String>, + pattern: Regex, + replacement: String, ignore_missing: bool, } impl GsubProcessor { - fn check(self) -> Result<Self> { - if self.pattern.is_none() { - return GsubPatternRequiredSnafu.fail(); - } - - if self.replacement.is_none() { - return GsubReplacementRequiredSnafu.fail(); - } - - Ok(self) - } - fn process_string(&self, val: &str) -> Result<Value> { - let replacement = self.replacement.as_ref().unwrap(); - let new_val = self - .pattern - .as_ref() - .unwrap() - .replace_all(val, replacement) - .to_string(); + fn process_string(&self, val: &str) -> Result<Value> { + let new_val = self.pattern.replace_all(val, &self.replacement).to_string(); let val = Value::String(new_val); Ok(val) @@ -118,14 +100,12 @@ impl TryFrom<&yaml_rust::yaml::Hash> for GsubProcessor { } } - let builder = GsubProcessor { + Ok(GsubProcessor { fields, - pattern, - replacement, + pattern: pattern.context(GsubPatternRequiredSnafu)?, + replacement: replacement.context(GsubReplacementRequiredSnafu)?, ignore_missing, - }; - - builder.check() + }) } } @@ -164,15 +144,17 @@ impl crate::etl::processor::Processor for GsubProcessor { #[cfg(test)] mod tests { + use super::*; use crate::etl::processor::gsub::GsubProcessor; use crate::etl::value::Value; #[test] fn test_string_value() { let processor = GsubProcessor { - pattern: Some(regex::Regex::new(r"\d+").unwrap()), - replacement: Some("xxx".to_string()), - ..Default::default() + fields: Fields::default(), + pattern: regex::Regex::new(r"\d+").unwrap(), + replacement: "xxx".to_string(), + ignore_missing: false, }; let val = Value::String("123".to_string()); diff --git a/src/pipeline/src/etl/processor/join.rs b/src/pipeline/src/etl/processor/join.rs index 6913a5428873..72fafdbf7dd1 100644 --- a/src/pipeline/src/etl/processor/join.rs +++ b/src/pipeline/src/etl/processor/join.rs @@ -32,29 +32,20 @@ pub(crate) const PROCESSOR_JOIN: &str = "join"; #[derive(Debug, Default)] pub struct JoinProcessor { fields: Fields, - separator: Option<String>, + separator: String, ignore_missing: bool, } impl JoinProcessor { fn process(&self, arr: &Array) -> Result<Value> { - let sep = self.separator.as_ref().unwrap(); let val = arr .iter() .map(|v| v.to_str_value()) .collect::<Vec<String>>() - .join(sep); + .join(&self.separator); Ok(Value::String(val)) } - - fn check(self) -> Result<Self> { - if self.separator.is_none() { - return JoinSeparatorRequiredSnafu.fail(); - } - - Ok(self) - } } impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { @@ -87,12 +78,11 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JoinProcessor { } } - let builder = JoinProcessor { + Ok(JoinProcessor { fields, - separator, + separator: separator.context(JoinSeparatorRequiredSnafu)?, ignore_missing, - }; - builder.check() + }) } } @@ -146,7 +136,7 @@ mod tests { #[test] fn test_join_processor() { let processor = JoinProcessor { - separator: Some("-".to_string()), + separator: "-".to_string(), ..Default::default() }; diff --git a/src/pipeline/src/etl/processor/json_path.rs b/src/pipeline/src/etl/processor/json_path.rs index c7b4210e83f1..92916263e4e9 100644 --- a/src/pipeline/src/etl/processor/json_path.rs +++ b/src/pipeline/src/etl/processor/json_path.rs @@ -67,22 +67,18 @@ impl TryFrom<&yaml_rust::yaml::Hash> for JsonPathProcessor { _ => {} } } - if let Some(json_path) = json_path { - let processor = JsonPathProcessor { - fields, - json_path, - ignore_missing, -
result_idex, - }; - - Ok(processor) - } else { - ProcessorMissingFieldSnafu { + + let processor = JsonPathProcessor { + fields, + json_path: json_path.context(ProcessorMissingFieldSnafu { processor: PROCESSOR_JSON_PATH, field: JSON_PATH_NAME, - } - .fail() - } + })?, + ignore_missing, + result_index: result_idex, + }; + + Ok(processor) } } @@ -91,7 +87,7 @@ pub struct JsonPathProcessor { fields: Fields, json_path: JsonPath, ignore_missing: bool, - result_idex: Option<usize>, + result_index: Option<usize>, } impl Default for JsonPathProcessor { @@ -100,7 +96,7 @@ impl Default for JsonPathProcessor { fields: Fields::default(), json_path: JsonPath::try_from("$").unwrap(), ignore_missing: false, - result_idex: None, + result_index: None, } } } @@ -110,7 +106,7 @@ impl JsonPathProcessor { let processed = self.json_path.find(val); match processed { Value::Array(arr) => { - if let Some(index) = self.result_idex { + if let Some(index) = self.result_index { Ok(arr.get(index).cloned().unwrap_or(Value::Null)) } else { Ok(Value::Array(arr)) @@ -166,15 +162,21 @@ mod test { let json_path = JsonPath::try_from("$.hello").unwrap(); let processor = JsonPathProcessor { json_path, - result_idex: Some(0), + result_index: Some(0), ..Default::default() };
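// ---------------------------------------------------------------------------
// Editor's aside (illustrative sketch, not part of the patch series above):
// the "improve required field check" commit replaces a late `check()` pass on
// `Option` fields with an early `snafu::OptionExt::context(...)` call while
// the processor is built from YAML, so a missing required key fails at parse
// time and no `Option` survives into the struct. The minimal, self-contained
// example below shows that pattern; `SeparatorRequiredError`,
// `SeparatorRequiredSnafu` and `JoinConfig` are hypothetical names used only
// for this sketch and are not identifiers from the GreptimeDB source tree.
use snafu::{OptionExt, Snafu};

#[derive(Debug, Snafu)]
#[snafu(display("separator is required for the join processor"))]
pub struct SeparatorRequiredError;

#[derive(Debug)]
pub struct JoinConfig {
    separator: String,
}

impl JoinConfig {
    // `context` converts `None` into the typed error right here, so the
    // constructed value always holds a concrete `String` and no separate
    // `check()` step is needed afterwards.
    pub fn from_parsed(separator: Option<String>) -> Result<Self, SeparatorRequiredError> {
        Ok(JoinConfig {
            separator: separator.context(SeparatorRequiredSnafu)?,
        })
    }
}

fn main() {
    // Missing key: construction fails immediately with the typed error.
    assert!(JoinConfig::from_parsed(None).is_err());
    // Present key: the field is stored directly, no Option left in the struct.
    assert_eq!(JoinConfig::from_parsed(Some(",".into())).unwrap().separator, ",");
}
// ---------------------------------------------------------------------------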