OpenCSGs
diff --git a/‎frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/components/dataSourceInfo.vue‎
Lines changed: 1 addition & 1 deletion b/‎frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/components/dataSourceInfo.vue‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/newDataSource.vue‎
Lines changed: 38 additions & 38 deletions b/‎frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/newDataSource.vue‎
Lines changed: 38 additions & 38 deletions
diff --git a/‎frontend/src/locales/en_js/datapipelines.js‎
Lines changed: 3 additions & 2 deletions b/‎frontend/src/locales/en_js/datapipelines.js‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎frontend/src/locales/en_js/operator_en.json‎
Lines changed: 61 additions & 2 deletions b/‎frontend/src/locales/en_js/operator_en.json‎
Lines changed: 61 additions & 2 deletions
diff --git a/‎frontend/src/locales/zh_hant_js/datapipelines.js‎
Lines changed: 3 additions & 2 deletions b/‎frontend/src/locales/zh_hant_js/datapipelines.js‎
Lines changed: 3 additions & 2 deletions
@@ -141,7 +141,7 @@ onMounted(() => {
 
 const getDataFlow = (extra_config = '{}') => {
   const dataFlow = JSON.parse(extra_config);
-  return `${dataFlow.csg_hub_dataset_id || ''}${dataFlow.csg_hub_dataset_default_branch ? ` > ${dataFlow.csg_hub_dataset_default_branch}` : ''}`;
+  return `${dataFlow.csg_hub_dataset_id || ''}${dataFlow.csg_hub_dataset_branch ? ` > ${dataFlow.csg_hub_dataset_branch}` : ''}`;
 }
 
 /**
 
@@ -254,39 +254,6 @@
             </el-form-item>
           </el-col>
 
-          <el-col
-            v-if="typeVal === 'Mysql' || typeVal === 'Hive'"
-            :xs="24"
-            :sm="24"
-            :md="12"
-            :lg="12"
-            :xl="12"
-          >
-            <el-form-item prop="password" class="mt-[12px]">
-              <template #label>
-                <p class="text-gray-500 text-xs">
-                  {{ t("dataPipelines.password") }}
-                </p>
-              </template>
-              <el-input
-                v-model="formInline.password"
-                :placeholder="`${t('dataPipelines.toInput')}${t(
-                  'dataPipelines.password'
-                )}`"
-                type="password"
-                show-password
-                clearable
-              >
-                <template #prefix>
-                  <SvgIcon
-                    class="w-5 h-5 mx-2 text-gray-500"
-                    name="auth-icon"
-                  />
-                </template>
-              </el-input>
-            </el-form-item>
-          </el-col>
-
           <el-col
             :xs="24"
             :sm="24"
@@ -310,20 +277,53 @@
               >
                 <el-option
                   :label="t('dataPipelines.authType_option_NONE')"
-                  value="NONE"
+                  value="NOSASL"
                 ></el-option>
                 <el-option
                   :label="t('dataPipelines.authType_option_LDAP')"
                   value="LDAP"
                 ></el-option>
-                <el-option
+                <!-- <el-option
                   :label="t('dataPipelines.authType_option_KERBEROS')"
                   value="KERBEROS"
-                ></el-option>
+                ></el-option> -->
               </el-select>
-              <span class="text-gray-400 text-xs font-light mt-[10px]">{{
+              <!-- <span class="text-gray-400 text-xs font-light mt-[10px]">{{
                 t("dataPipelines.authType_placeholder")
-              }}</span>
+              }}</span> -->
+            </el-form-item>
+          </el-col>
+
+          <el-col
+            v-if="typeVal === 'Mysql' || (typeVal === 'Hive' && formInline.auth_type !== 'NOSASL')"
+            :xs="24"
+            :sm="24"
+            :md="12"
+            :lg="12"
+            :xl="12"
+          >
+            <el-form-item prop="password" class="mt-[12px]">
+              <template #label>
+                <p class="text-gray-500 text-xs">
+                  {{ t("dataPipelines.password") }}
+                </p>
+              </template>
+              <el-input
+                v-model="formInline.password"
+                :placeholder="`${t('dataPipelines.toInput')}${t(
+                  'dataPipelines.password'
+                )}`"
+                type="password"
+                show-password
+                clearable
+              >
+                <template #prefix>
+                  <SvgIcon
+                    class="w-5 h-5 mx-2 text-gray-500"
+                    name="auth-icon"
+                  />
+                </template>
+              </el-input>
             </el-form-item>
           </el-col>
 
 
@@ -13,7 +13,6 @@ export const dataPipelines = {
   "targetFormat": "Target Format",
   "dataFlowBranch": "Data Flow Branch",
   "startExecution": "Start Execution",
-  "inProgress": "In Progress",
   "searchTaskName": "Search Task Name",
   "confirmTermination": "Confirm Termination",
   "terminate": "Terminate",
@@ -40,7 +39,6 @@ export const dataPipelines = {
     }
   },
   "testingConnection": "Testing connection",
-  "submitting": "Submitting",
   "pleaseSelectAnExecutionTime": "Please select an execution time",
   "deletingTask": "Deleting task",
   "terminatingTask": "Terminating task",
@@ -206,6 +204,7 @@ export const dataPipelines = {
   "dataCleaning": "Data Cleaning",
   "processingStatus": "Processing Status",
   "processingText": "Processing Text",
+  "inProgress": "In Progress",
   "completed": "Completed",
   "dataSource": "Data Source",
   "dataSourceBranch": "Data Source Branch",
@@ -258,6 +257,7 @@ export const dataPipelines = {
   "uploadFailedTips2": "The icon size cannot exceed 10MB.",
   "uploadFailedTips3": "Upload failed, please try again",
   "networkError": "Network error, please check the connection and try again",
+  "submitting": "Submitting",
   "algorithmTemplateDescription": "The algorithm template allows users to build workflows using various model operators, enabling tasks such as data cleaning, automated data augmentation, and analysis.",
   "taskTemplate": "Task Template",
   "searchTemplate": "Search Template",
@@ -383,6 +383,7 @@ export const dataPipelines = {
   "fineweb_edu_chinese_common_internal": "fineweb edu chinese common",
   "smoltalk_chinese_common_internal": "smoltalk chinese common",
   "cosmopedia_chinese_preprocess_internal": "cosmopedia chinese preprocess",
+  "md_to_jsonl_preprocess_internal": "md to jsonl tool",
 
 
   "analysis_common_internal_dec": "This analyzer class is used to analyze specific datasets. It calculates statistics for all filtering operations in the configuration file, applies various analyses (such as overall analysis, column-by-column analysis, etc.) to these statistics, and generates analysis results (statistical tables, distribution charts, etc.) to help users better understand the input dataset.",
 
@@ -175,6 +175,7 @@
     "extract_qa_mapper": {
         "name": "QA pair extractor",
         "description": "\n    Mapper to extract question and answer pair from text samples.\n    Recommended model list: [\n        'alibaba-pai/pai-llama3-8b-doc2qa',\n        'alibaba-pai/pai-baichuan2-7b-doc2qa',\n        'alibaba-pai/pai-qwen1_5-4b-doc2qa',\n        'alibaba-pai/pai-qwen1_5-7b-doc2qa',\n        'alibaba-pai/pai-qwen1_5-1b8-doc2qa',\n        'alibaba-pai/pai-qwen1_5-0b5-doc2qa'\n    ]\n    These recommended models are all trained with Chinese data\n    and are suitable for Chinese.\n    ",
+        "operator_description": "Convert text into JSON-format Q&A pairs, such as: [\r\n    {\r\n      \"messages\": [\r\n        {\"role\": \"user\", \"content\": \"Question 1\"},\r\n        {\"role\": \"assistant\", \"content\": \"Answer 1\"}\r\n      ]\r\n    }\r\n  ]",
         "type": "Mapper",
         "group": "",
         "samples": {
@@ -192,6 +193,18 @@
                     }
                 ],
                 "value": "alibaba-pai/pai-qwen1_5-7b-doc2qa"
+            },
+            {
+                "name": "model_url",
+                "type": "LIST",
+                "option_values": null,
+                "value": "https://api.deepseek.com/chat/completions"
+            },
+            {
+                "name": "auth_token",
+                "type": "LIST",
+                "option_values": null,
+                "value": ""
             }
         ]
     },
@@ -358,6 +371,18 @@
                     }
                 ],
                 "value": "alibaba-pai/Qwen2-7B-Instruct-Refine"
+            },
+            {
+                "name": "model_url",
+                "type": "LIST",
+                "option_values": null,
+                "value": "https://api.deepseek.com/chat/completions"
+            },
+            {
+                "name": "auth_token",
+                "type": "LIST",
+                "option_values": null,
+                "value": ""
             }
         ]
     },
@@ -625,6 +650,7 @@
     "sentence_split_mapper": {
         "name": "Sentence Spliter",
         "description": "Mapper to split text samples to sentences.",
+        "operator_description": "Split a paragraph of text into multiple sentences",
         "type": "Mapper",
         "group": "",
         "samples": {
@@ -857,6 +883,7 @@
     "perplexity_filter": {
         "name": "Perplexity Score Filter",
         "description": "Filter to keep samples with perplexity score less than a specific max\n    value.",
+        "operator_description": "Delete low-quality text",
         "type": "Filter",
         "group": "",
         "samples": {
@@ -914,6 +941,7 @@
     "specified_field_filter": {
         "name": "Specified Field Information Filter",
         "description": "\n    Filter based on specified field information.\n\n    If the specified field information in the sample is not within the\n    specified target value, the sample will be filtered.\n    ",
+        "operator_description": "Filter based on a certain field value of the data",
         "type": "Filter",
         "group": "",
         "samples": {
@@ -1267,6 +1295,7 @@
     "document_deduplicator": {
         "name": "Document Deduplicator(MD5 Hash)",
         "description": "\n    Deduplicator to deduplicate samples at document-level using exact matching.\n\n    Using md5 hash to deduplicate samples.\n    ",
+        "operator_description": "\n    Calculate and delete identical documents based on MD5.\n    ",
         "type": "Deduplicator",
         "group": "",
         "samples": {
@@ -1278,19 +1307,22 @@
                 "name": "lowercase",
                 "type": "BOOLEAN",
                 "option_values": null,
-                "value": true
+                "value": true,
+                "config_description": "Whether to ignore differences between uppercase and lowercase letters"
             },
             {
                 "name": "ignore_non_character",
                 "type": "BOOLEAN",
                 "option_values": null,
-                "value": true
+                "value": true,
+                "config_description": "Whether to ignore spaces, numbers, and punctuation marks"
             }
         ]
     },
     "document_minhash_deduplicator": {
         "name": "Document Deduplicator(MinHashLSH)",
         "description": "\n    Deduplicator to deduplicate samples at document-level using MinHashLSH.\n\n    Different from simhash, minhash is stored as bytes, so they won't be\n    kept in the final dataset.\n    ",
+        "operator_description": "Well suited to deleting similar documents that contain a mix of multiple languages",
         "type": "Deduplicator",
         "group": "",
         "samples": {
@@ -1374,6 +1406,7 @@
     "document_simhash_deduplicator": {
         "name": "Document Deduplicator(SimHash)",
         "description": "Deduplicator to deduplicate samples at document-level using SimHash.",
+        "operator_description": "Delete documents with similar but not identical content",
         "type": "Deduplicator",
         "group": "",
         "samples": {
@@ -1471,6 +1504,7 @@
     "random_selector": {
         "name": "Random Selector",
         "description": "Selector to random select samples. ",
+        "operator_description": "Randomly select a portion from the data",
         "type": "Selector",
         "group": "",
         "samples": {
@@ -1727,6 +1761,7 @@
     "dedup_and_save_deduplicator": {
         "name": "dedup_and_save_deduplicator",
         "description": "A deduplicator based on graph connectivity. It constructs a similarity graph by connecting samples with similarity scores above the threshold, then keeps only one sample (with minimum index) from each connected component. Suitable for datasets with pre-computed nearest neighbor similarity information.",
+        "operator_description": "Deduplicates samples using pre-computed similarity data and moves some large fields (such as vectors and text) into the statistics information to save space",
         "type": "Deduplicator",
         "group": "",
         "samples": {
@@ -1771,5 +1806,29 @@
                 "value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
             }
         ]
+    },
+    "multi_keyword_filter": {
+        "name": "multi_keyword_filter",
+        "description": "",
+        "type": "Filter",
+        "group": "",
+        "samples": {
+            "before": "",
+            "after": ""
+        },
+        "params": [
+            {
+                "name": "keywords",
+                "type": "LIST",
+                "option_values": null,
+                "value": "危险,help,Text"
+            },
+            {
+                "name": "case_sensitive",
+                "type": "BOOLEAN",
+                "option_values": null,
+                "value": false
+            }
+        ]
     }
 }
@@ -42,7 +42,6 @@ export const dataPipelines = {
   "pleaseSelectAnExecutionTime": "請選擇執行時間",
   "deletingTask": "正在刪除任務",
   "terminatingTask": "正在終止任務",
-  "createTask": "創建任務",
   "addDataSource": "添加數據源",
   "fileFormat": "文件格式",
   "connectionStatus": "連接狀態",
@@ -96,6 +95,7 @@ export const dataPipelines = {
   "loading": "加載中",
   "taskCategories": "任務分類",
   "allCategories": "全部分類",
+  "createTask": "創建任務",
   "taskList": "任務列表",
   "taskName": "任務名稱",
   "DatabaseName": "數據庫名稱",
@@ -227,7 +227,6 @@ export const dataPipelines = {
   "execute": "執行",
   "cancelExecute": "取消執行",
   "executeConfirm": "確認執行",
-  "confirm": "確認",
   "reset": "替換",
   "details": "詳情",
   "authorize": "授权",
@@ -338,6 +337,7 @@ export const dataPipelines = {
   "creationCompleted": "創建完成",
   "updateTemplate": "更新模板",
   "cancel": "取消",
+  "confirm": "確定",
   "delTemplateTitle": "刪除模板",
   "delSuccess": "删除成功",
   "delFailed": "刪除失敗",
@@ -381,6 +381,7 @@ export const dataPipelines = {
   "fineweb_edu_chinese_common_internal": "文本價值評估",
   "smoltalk_chinese_common_internal": "高質量對話生成",
   "cosmopedia_chinese_preprocess_internal": "增強文本描述工具",
+  "md_to_jsonl_preprocess_internal": "MD轉JSONL工具",
 
   "analysis_common_internal_dec": "此分析器類用於分析特定數據集。它會為配置文件中的所有過濾操作計算統計數據，對這些統計數據應用多種分析（如整體分析、逐列分析等），並生成分析結果（統計表、分佈圖等），幫助用戶更好地理解輸入數據集。",
   "dataset_spliter_by_language_preprocess_internal_dec": "從源目錄加載數據集，然後使用名為 LanguageIDScoreFilter 的操作過濾器進行語言識別，最後按語言分割數據集並保存。",
Original file line number	Diff line number	Diff line change
`@@ -141,7 +141,7 @@ onMounted(() => {`
`141`	`141`
`142`	`142`	`const getDataFlow = (extra_config = '{}') => {`
`143`	`143`	`const dataFlow = JSON.parse(extra_config);`
`144`		- return `${dataFlow.csg_hub_dataset_id \|\| ''}${dataFlow.csg_hub_dataset_default_branch ? ` > ${dataFlow.csg_hub_dataset_default_branch}` : ''}`;
	`144`	+ return `${dataFlow.csg_hub_dataset_id \|\| ''}${dataFlow.csg_hub_dataset_branch ? ` > ${dataFlow.csg_hub_dataset_branch}` : ''}`;
`145`	`145`	`}`
`146`	`146`
`147`	`147`	`/**`