1
- root : ' /raid/fayw/semdedup '
2
- sample : 10
1
+ root : ' /ads_ds3/data/SemDeDup_BenchMark '
2
+ sample : 12 # TODO: rename to num_samples
3
3
id_col :
4
- name : ' id' # 'adlr_id'
5
- type : ' int' # 'str'
6
-
4
+ name : ' adlr_id' # 'id' #'adlr_id'
5
+ type : ' str' # 'int' #'str'
7
6
8
7
# Embeddings
9
8
embeddings :
10
9
input_column : ' text'
11
- datapath : ' datasets/c4/realnewslike/modified/'
12
- # datapath: 'datasets/prospector-lm/cleaned_exact_dedup_all_cc'
13
- emb_parquet_path : " emb_fbopt125_c4_crossfit_10" # "emb_sentmpnet_c4_crossfit_5" # #
14
- model_name : ' facebook/opt-125m' # 'sentence-transformers/all-mpnet-base-v2' # '/raid/fayw/semdedup/models/facebook-opt-125m/'
15
-
10
+ input_data_dir : ' /datasets/prospector-lm/cleaned_exact_dedup_all_cc'
11
+ output_data_dir : ' /raid/vjawa/prospector-lm/embeddings_crossfit_fb_c4_10'
12
+ path_or_name : ' facebook/opt-125m'
16
13
emb_size : 768
17
- batch_size : 32
18
- num_workers : 8
19
- pool_size : ' 10GB'
14
+ batch_size : 512
20
15
21
16
clustering :
22
- save_loc : " results_fbopt125_c4_crossfit_10" # "results_fbopt125_cc_crossfit_5"
23
- num_clusters : 12 # -- number of clusters
17
+ save_loc : " results_fbopt_nc_11_crossfit" # "results_fbopt125_cc_crossfit_5"
18
+ # save_loc: "results_crossfit_fb_c4_10a"
19
+ num_clusters : 1000 # -- number of clusters
24
20
seed : 1234
25
- niter : 1000
26
- Kmeans_with_cos_dist : True
21
+ niter : 100
22
+ # With cuML, K-means can only be run with L2 distance.
23
+ Kmeans_with_cos_dist : False # Only False allowed
27
24
28
25
29
26
semdedup :
@@ -33,9 +30,8 @@ semdedup:
33
30
# cluster size is larger than it, we will divide the cluster into small
34
31
# clusters and process each one separately.
35
32
largest_cluster_size_to_process : 100000
36
- sim_metric : ' cosine'
37
- keep_hard : True # True for hard examples
33
+ sim_metric : " cosine" # Only cosine is allowed.
38
34
39
35
extract_dedup :
40
- eps : ' 0.01 ' # '0.01', '0.001', '0.0001', '1e-05', '1e-06'
41
-
36
+ use_eps_from_yml : True
37
+ eps : 0.01 0.001
0 commit comments