Skip to content

Commit d7c7b74

Browse files
committed
Cleanup compute_embeddings_crossfit.py
Signed-off-by: Vibhu Jawa <[email protected]>
1 parent 04f7efc commit d7c7b74

File tree

2 files changed

+19
-22
lines changed

2 files changed

+19
-22
lines changed
Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,26 @@
1-
root: '/raid/fayw/semdedup'
2-
sample: 10
1+
root: '/ads_ds3/data/SemDeDup_BenchMark'
2+
sample: 12 # TODO: rename this key to num_samples
33
id_col:
4-
name: 'id' #'adlr_id'
5-
type: 'int' #'str'
6-
4+
name: 'adlr_id' # 'id' #'adlr_id'
5+
type: 'str' # 'int' #'str'
76

87
# Embeddings
98
embeddings:
109
input_column: 'text'
11-
datapath: 'datasets/c4/realnewslike/modified/'
12-
#datapath: 'datasets/prospector-lm/cleaned_exact_dedup_all_cc'
13-
emb_parquet_path: "emb_fbopt125_c4_crossfit_10" #"emb_sentmpnet_c4_crossfit_5" # #
14-
model_name: 'facebook/opt-125m' #'sentence-transformers/all-mpnet-base-v2' # '/raid/fayw/semdedup/models/facebook-opt-125m/'
15-
10+
input_data_dir: '/datasets/prospector-lm/cleaned_exact_dedup_all_cc'
11+
output_data_dir: '/raid/vjawa/prospector-lm/embeddings_crossfit_fb_c4_10'
12+
path_or_name: 'facebook/opt-125m'
1613
emb_size: 768
17-
batch_size: 32
18-
num_workers: 8
19-
pool_size: '10GB'
14+
batch_size: 512
2015

2116
clustering:
22-
save_loc: "results_fbopt125_c4_crossfit_10" #"results_fbopt125_cc_crossfit_5"
23-
num_clusters: 12 # -- number of clusters
17+
save_loc: "results_fbopt_nc_11_crossfit" #"results_fbopt125_cc_crossfit_5"
18+
#save_loc: "results_crossfit_fb_c4_10a"
19+
num_clusters: 1000 # -- number of clusters
2420
seed: 1234
25-
niter: 1000
26-
Kmeans_with_cos_dist: True
21+
niter: 100
22+
# With cuML, K-means can only be run using the L2 (Euclidean) distance.
23+
Kmeans_with_cos_dist: False # Only False allowed
2724

2825

2926
semdedup:
@@ -33,9 +30,8 @@ semdedup:
3330
# cluster size is larger than it, we will divide the cluster into small
3431
# clusters and process each one separately.
3532
largest_cluster_size_to_process: 100000
36-
sim_metric: 'cosine'
37-
keep_hard: True # True for hard examples
33+
sim_metric: "cosine" # Only cosine is allowed.
3834

3935
extract_dedup:
40-
eps: '0.01' # '0.01', '0.001', '0.0001', '1e-05', '1e-06'
41-
36+
use_eps_from_yml: True
37+
eps: 0.01 0.001

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,8 @@
6060
"presidio-anonymizer==2.2.351",
6161
"usaddress==0.5.10",
6262
"nemo_toolkit[nlp]>=1.23.0",
63-
"crossfit @ git+https://github.com/rapidsai/crossfit.git@1ee3de4",
63+
"Cython",
64+
"crossfit @ git+https://github.com/rapidsai/[email protected]",
6465
# justext installation breaks without lxml[html_clean]
6566
# due to this: https://github.com/miso-belica/jusText/issues/47
6667
"lxml[html_clean]",

0 commit comments

Comments
 (0)