Skip to content

Commit 114cda7

Browse files
z275748353 and 张龙彬 authored
fix data-flow bugs (#1433)
* add dataflow
* add dataflow
* remove antv/x6/lib/registry package
* Fix bugs
* Fix bugs
* add package
* add package
* add zhHantOps
* 1.Adjust the image path for operator management 2.Add permission judgment on whether to display the menu in operator management
* Operator Management: Modification of dataflow/operator/ interface
* 1.Add the cancellation of execution for internationalization and status verification 2.If a user has no organization or no authorized operators, all public operators will be queried by default
* 1.Add the cancellation of execution for internationalization and status verification 2.If a user has no organization or no authorized operators, all public operators will be queried by default
* 1.Add the cancellation of execution for internationalization and status verification 2.If a user has no organization or no authorized operators, all public operators will be queried by default
* Add newly developed operators, internationalize tools, and supplement the internationalization of statistics.
* Add newly developed operators, internationalize tools, and supplement the internationalization of statistics.
* 1.Add template deletion function 2.Celery node deletion (admin only, offline status) 3.Block the option of creating tool operators, set tools as default 4.Internationalization update
* Fix the bug of dataflow with ID #36
* Fix the bug of dataflow with ID OpenCSGs/csghub-dataflow#31
* Fix the bug of dataflow with ID OpenCSGs/csghub-dataflow#43

---------

Co-authored-by: 张龙彬 <[email protected]>
1 parent b3ce78c commit 114cda7

File tree

10 files changed

+340
-61
lines changed

10 files changed

+340
-61
lines changed

frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/components/dataSourceInfo.vue

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ onMounted(() => {
141141
142142
const getDataFlow = (extra_config = '{}') => {
143143
const dataFlow = JSON.parse(extra_config);
144-
return `${dataFlow.csg_hub_dataset_id || ''}${dataFlow.csg_hub_dataset_default_branch ? ` > ${dataFlow.csg_hub_dataset_default_branch}` : ''}`;
144+
return `${dataFlow.csg_hub_dataset_id || ''}${dataFlow.csg_hub_dataset_branch ? ` > ${dataFlow.csg_hub_dataset_branch}` : ''}`;
145145
}
146146
147147
/**

frontend/src/components/dataflow_config/dataAcquisition/dataSourceManagement/newDataSource.vue

Lines changed: 38 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -254,39 +254,6 @@
254254
</el-form-item>
255255
</el-col>
256256

257-
<el-col
258-
v-if="typeVal === 'Mysql' || typeVal === 'Hive'"
259-
:xs="24"
260-
:sm="24"
261-
:md="12"
262-
:lg="12"
263-
:xl="12"
264-
>
265-
<el-form-item prop="password" class="mt-[12px]">
266-
<template #label>
267-
<p class="text-gray-500 text-xs">
268-
{{ t("dataPipelines.password") }}
269-
</p>
270-
</template>
271-
<el-input
272-
v-model="formInline.password"
273-
:placeholder="`${t('dataPipelines.toInput')}${t(
274-
'dataPipelines.password'
275-
)}`"
276-
type="password"
277-
show-password
278-
clearable
279-
>
280-
<template #prefix>
281-
<SvgIcon
282-
class="w-5 h-5 mx-2 text-gray-500"
283-
name="auth-icon"
284-
/>
285-
</template>
286-
</el-input>
287-
</el-form-item>
288-
</el-col>
289-
290257
<el-col
291258
:xs="24"
292259
:sm="24"
@@ -310,20 +277,53 @@
310277
>
311278
<el-option
312279
:label="t('dataPipelines.authType_option_NONE')"
313-
value="NONE"
280+
value="NOSASL"
314281
></el-option>
315282
<el-option
316283
:label="t('dataPipelines.authType_option_LDAP')"
317284
value="LDAP"
318285
></el-option>
319-
<el-option
286+
<!-- <el-option
320287
:label="t('dataPipelines.authType_option_KERBEROS')"
321288
value="KERBEROS"
322-
></el-option>
289+
></el-option> -->
323290
</el-select>
324-
<span class="text-gray-400 text-xs font-light mt-[10px]">{{
291+
<!-- <span class="text-gray-400 text-xs font-light mt-[10px]">{{
325292
t("dataPipelines.authType_placeholder")
326-
}}</span>
293+
}}</span> -->
294+
</el-form-item>
295+
</el-col>
296+
297+
<el-col
298+
v-if="typeVal === 'Mysql' || (typeVal === 'Hive' && formInline.auth_type !== 'NOSASL')"
299+
:xs="24"
300+
:sm="24"
301+
:md="12"
302+
:lg="12"
303+
:xl="12"
304+
>
305+
<el-form-item prop="password" class="mt-[12px]">
306+
<template #label>
307+
<p class="text-gray-500 text-xs">
308+
{{ t("dataPipelines.password") }}
309+
</p>
310+
</template>
311+
<el-input
312+
v-model="formInline.password"
313+
:placeholder="`${t('dataPipelines.toInput')}${t(
314+
'dataPipelines.password'
315+
)}`"
316+
type="password"
317+
show-password
318+
clearable
319+
>
320+
<template #prefix>
321+
<SvgIcon
322+
class="w-5 h-5 mx-2 text-gray-500"
323+
name="auth-icon"
324+
/>
325+
</template>
326+
</el-input>
327327
</el-form-item>
328328
</el-col>
329329

frontend/src/locales/en_js/datapipelines.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ export const dataPipelines = {
1313
"targetFormat": "Target Format",
1414
"dataFlowBranch": "Data Flow Branch",
1515
"startExecution": "Start Execution",
16-
"inProgress": "In Progress",
1716
"searchTaskName": "Search Task Name",
1817
"confirmTermination": "Confirm Termination",
1918
"terminate": "Terminate",
@@ -40,7 +39,6 @@ export const dataPipelines = {
4039
}
4140
},
4241
"testingConnection": "Testing connection",
43-
"submitting": "Submitting",
4442
"pleaseSelectAnExecutionTime": "Please select an execution time",
4543
"deletingTask": "Deleting task",
4644
"terminatingTask": "Terminating task",
@@ -206,6 +204,7 @@ export const dataPipelines = {
206204
"dataCleaning": "Data Cleaning",
207205
"processingStatus": "Processing Status",
208206
"processingText": "Processing Text",
207+
"inProgress": "In Progress",
209208
"completed": "Completed",
210209
"dataSource": "Data Source",
211210
"dataSourceBranch": "Data Source Branch",
@@ -258,6 +257,7 @@ export const dataPipelines = {
258257
"uploadFailedTips2": "The icon size cannot exceed 10MB.",
259258
"uploadFailedTips3": "Upload failed, please try again",
260259
"networkError": "Network error, please check the connection and try again",
260+
"submitting": "Submitting",
261261
"algorithmTemplateDescription": "The algorithm template allows users to build workflows using various model operators, enabling tasks such as data cleaning, automated data augmentation, and analysis.",
262262
"taskTemplate": "Task Template",
263263
"searchTemplate": "Search Template",
@@ -383,6 +383,7 @@ export const dataPipelines = {
383383
"fineweb_edu_chinese_common_internal": "fineweb edu chinese common",
384384
"smoltalk_chinese_common_internal": "smoltalk chinese common",
385385
"cosmopedia_chinese_preprocess_internal": "cosmopedia chinese preprocess",
386+
"md_to_jsonl_preprocess_internal": "md to jsonl tool",
386387

387388

388389
"analysis_common_internal_dec": "This analyzer class is used to analyze specific datasets. It calculates statistics for all filtering operations in the configuration file, applies various analyses (such as overall analysis, column-by-column analysis, etc.) to these statistics, and generates analysis results (statistical tables, distribution charts, etc.) to help users better understand the input dataset.",

frontend/src/locales/en_js/operator_en.json

Lines changed: 61 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,7 @@
175175
"extract_qa_mapper": {
176176
"name": "QA pair extractor",
177177
"description": "\n Mapper to extract question and answer pair from text samples.\n Recommended model list: [\n 'alibaba-pai/pai-llama3-8b-doc2qa',\n 'alibaba-pai/pai-baichuan2-7b-doc2qa',\n 'alibaba-pai/pai-qwen1_5-4b-doc2qa',\n 'alibaba-pai/pai-qwen1_5-7b-doc2qa',\n 'alibaba-pai/pai-qwen1_5-1b8-doc2qa',\n 'alibaba-pai/pai-qwen1_5-0b5-doc2qa'\n ]\n These recommended models are all trained with Chinese data\n and are suitable for Chinese.\n ",
178+
"operator_description": "Convert a text into a JSON format Q&A pair, such as:[\r\n {\r\n \"messages\": [\r\n {\"role\": \"user\", \"content\": \"Question 1\"},\r\n {\"role\": \"assistant\", \"content\": \"Answer 1\"}\r\n ]\r\n }\r\n ]",
178179
"type": "Mapper",
179180
"group": "",
180181
"samples": {
@@ -192,6 +193,18 @@
192193
}
193194
],
194195
"value": "alibaba-pai/pai-qwen1_5-7b-doc2qa"
196+
},
197+
{
198+
"name": "model_url",
199+
"type": "LIST",
200+
"option_values": null,
201+
"value": "https://api.deepseek.com/chat/completions"
202+
},
203+
{
204+
"name": "auth_token",
205+
"type": "LIST",
206+
"option_values": null,
207+
"value": ""
195208
}
196209
]
197210
},
@@ -358,6 +371,18 @@
358371
}
359372
],
360373
"value": "alibaba-pai/Qwen2-7B-Instruct-Refine"
374+
},
375+
{
376+
"name": "model_url",
377+
"type": "LIST",
378+
"option_values": null,
379+
"value": "https://api.deepseek.com/chat/completions"
380+
},
381+
{
382+
"name": "auth_token",
383+
"type": "LIST",
384+
"option_values": null,
385+
"value": ""
361386
}
362387
]
363388
},
@@ -625,6 +650,7 @@
625650
"sentence_split_mapper": {
626651
"name": "Sentence Spliter",
627652
"description": "Mapper to split text samples to sentences.",
653+
"operator_description": "Split a paragraph of text into multiple sentences",
628654
"type": "Mapper",
629655
"group": "",
630656
"samples": {
@@ -857,6 +883,7 @@
857883
"perplexity_filter": {
858884
"name": "Perplexity Score Filter",
859885
"description": "Filter to keep samples with perplexity score less than a specific max\n value.",
886+
"operator_description": "Delete low-quality text",
860887
"type": "Filter",
861888
"group": "",
862889
"samples": {
@@ -914,6 +941,7 @@
914941
"specified_field_filter": {
915942
"name": "Specified Field Information Filter",
916943
"description": "\n Filter based on specified field information.\n\n If the specified field information in the sample is not within the\n specified target value, the sample will be filtered.\n ",
944+
"operator_description": "Filter based on a certain field value of the data",
917945
"type": "Filter",
918946
"group": "",
919947
"samples": {
@@ -1267,6 +1295,7 @@
12671295
"document_deduplicator": {
12681296
"name": "Document Deduplicator(MD5 Hash)",
12691297
"description": "\n Deduplicator to deduplicate samples at document-level using exact matching.\n\n Using md5 hash to deduplicate samples.\n ",
1298+
"operator_description": "\n Calculate and delete identical documents based on MD5.\n ",
12701299
"type": "Deduplicator",
12711300
"group": "",
12721301
"samples": {
@@ -1278,19 +1307,22 @@
12781307
"name": "lowercase",
12791308
"type": "BOOLEAN",
12801309
"option_values": null,
1281-
"value": true
1310+
"value": true,
1311+
"config_description": "Do you ignore uppercase and lowercase letters"
12821312
},
12831313
{
12841314
"name": "ignore_non_character",
12851315
"type": "BOOLEAN",
12861316
"option_values": null,
1287-
"value": true
1317+
"value": true,
1318+
"config_description": "Do you ignore spaces, numbers, punctuation marks"
12881319
}
12891320
]
12901321
},
12911322
"document_minhash_deduplicator": {
12921323
"name": "Document Deduplicator(MinHashLSH)",
12931324
"description": "\n Deduplicator to deduplicate samples at document-level using MinHashLSH.\n\n Different from simhash, minhash is stored as bytes, so they won't be\n kept in the final dataset.\n ",
1325+
"operator_description": "Proficient in deleting similar documents mixed with multiple languages",
12941326
"type": "Deduplicator",
12951327
"group": "",
12961328
"samples": {
@@ -1374,6 +1406,7 @@
13741406
"document_simhash_deduplicator": {
13751407
"name": "Document Deduplicator(SimHash)",
13761408
"description": "Deduplicator to deduplicate samples at document-level using SimHash.",
1409+
"operator_description": "Delete documents with similar but not identical content",
13771410
"type": "Deduplicator",
13781411
"group": "",
13791412
"samples": {
@@ -1471,6 +1504,7 @@
14711504
"random_selector": {
14721505
"name": "Random Selector",
14731506
"description": "Selector to random select samples. ",
1507+
"operator_description": "Randomly select a portion from the data",
14741508
"type": "Selector",
14751509
"group": "",
14761510
"samples": {
@@ -1727,6 +1761,7 @@
17271761
"dedup_and_save_deduplicator": {
17281762
"name": "dedup_and_save_deduplicator",
17291763
"description": "A deduplicator based on graph connectivity. It constructs a similarity graph by connecting samples with similarity scores above the threshold, then keeps only one sample (with minimum index) from each connected component. Suitable for datasets with pre-computed nearest neighbor similarity information.",
1764+
"operator_description": "Based on the similarity calculated data, deduplication is performed and some large fields (such as vectors and text) are moved to the statistical information to save space",
17301765
"type": "Deduplicator",
17311766
"group": "",
17321767
"samples": {
@@ -1771,5 +1806,29 @@
17711806
"value": "https://dashscope.aliyuncs.com/compatible-mode/v1"
17721807
}
17731808
]
1809+
},
1810+
"multi_keyword_filter": {
1811+
"name": "multi_keyword_filter",
1812+
"description": "",
1813+
"type": "Filter",
1814+
"group": "",
1815+
"samples": {
1816+
"before": "",
1817+
"after": ""
1818+
},
1819+
"params": [
1820+
{
1821+
"name": "keywords",
1822+
"type": "LIST",
1823+
"option_values": null,
1824+
"value": "危险,help,Text"
1825+
},
1826+
{
1827+
"name": "case_sensitive",
1828+
"type": "BOOLEAN",
1829+
"option_values": null,
1830+
"value": false
1831+
}
1832+
]
17741833
}
17751834
}

frontend/src/locales/zh_hant_js/datapipelines.js

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ export const dataPipelines = {
4242
"pleaseSelectAnExecutionTime": "請選擇執行時間",
4343
"deletingTask": "正在刪除任務",
4444
"terminatingTask": "正在終止任務",
45-
"createTask": "創建任務",
4645
"addDataSource": "添加數據源",
4746
"fileFormat": "文件格式",
4847
"connectionStatus": "連接狀態",
@@ -96,6 +95,7 @@ export const dataPipelines = {
9695
"loading": "加載中",
9796
"taskCategories": "任務分類",
9897
"allCategories": "全部分類",
98+
"createTask": "創建任務",
9999
"taskList": "任務列表",
100100
"taskName": "任務名稱",
101101
"DatabaseName": "數據庫名稱",
@@ -227,7 +227,6 @@ export const dataPipelines = {
227227
"execute": "執行",
228228
"cancelExecute": "取消執行",
229229
"executeConfirm": "確認執行",
230-
"confirm": "確認",
231230
"reset": "替換",
232231
"details": "詳情",
233232
"authorize": "授权",
@@ -338,6 +337,7 @@ export const dataPipelines = {
338337
"creationCompleted": "創建完成",
339338
"updateTemplate": "更新模板",
340339
"cancel": "取消",
340+
"confirm": "確定",
341341
"delTemplateTitle": "刪除模板",
342342
"delSuccess": "删除成功",
343343
"delFailed": "刪除失敗",
@@ -381,6 +381,7 @@ export const dataPipelines = {
381381
"fineweb_edu_chinese_common_internal": "文本價值評估",
382382
"smoltalk_chinese_common_internal": "高質量對話生成",
383383
"cosmopedia_chinese_preprocess_internal": "增強文本描述工具",
384+
"md_to_jsonl_preprocess_internal": "MD轉JSONL工具",
384385

385386
"analysis_common_internal_dec": "此分析器類用於分析特定數據集。它會為配置文件中的所有過濾操作計算統計數據,對這些統計數據應用多種分析(如整體分析、逐列分析等),並生成分析結果(統計表、分佈圖等),幫助用戶更好地理解輸入數據集。",
386387
"dataset_spliter_by_language_preprocess_internal_dec": "從源目錄加載數據集,然後使用名為 LanguageIDScoreFilter 的操作過濾器進行語言識別,最後按語言分割數據集並保存。",

0 commit comments

Comments
 (0)