Skip to content

Commit ee22993

Browse files
committed
add WebVision dataset
1 parent 3936578 commit ee22993

24 files changed

+1484
-704
lines changed

README.md

+24-3
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,15 @@ Raw dataset should be downloaded in to local folder before data-build process. T
138138
│   │    ├── t10k-images-idx3-ubyte
139139
│   │    ...
140140
│   │    └── train-labels-idx1-ubyte.gz
141+
│ ├── webvision1.0/
142+
│ │ ├── info
143+
│ │ ├── google
144+
│ │ ├── README.txt
145+
│ │ └── val_images_256
146+
│ ├── imagenet_data/
147+
│ │ └── ILSVRC2012
148+
│ │    ├── train
149+
│ │    └── val
141150
│ ├── clothing1M/
142151
│ │ ├── category_names_chn.txt
143152
│ │ ├── category_names_eng.txt
@@ -182,6 +191,8 @@ Raw dataset should be downloaded in to local folder before data-build process. T
182191

183192
- To download __[Clothing1M](https://github.com/Cysu/noisy_label)__, please contact *tong.xiao.work[at]gmail[dot]com* to get the download link. Untar the images and unzip the annotations under `rawdata/clothing1M`.
184193

194+
- To download __[WebVision1.0](https://data.vision.ee.ethz.ch/cvl/webvision/download.html)__, use only the "Resized Images (small version)" package. Unzip the data under `rawdata/webvision1.0`.
195+
185196

186197

187198
### Build dataset with noisy label
@@ -216,10 +227,13 @@ $ python build_dataset_fed.py --dataset cifar10 \
216227
- `--min_noise_ratio 0.3 --max_noise_ratio 0.5 --noise_mode sym` for localized symmetric noise $\varepsilon_k \sim \mathcal{U}(0.3, 0.5)$
217228
- `--min_noise_ratio 0.3 --max_noise_ratio 0.5 --noise_mode asym` for localized asymmetric noise $\varepsilon_k \sim \mathcal{U}(0.3, 0.5)$
218229

219-
- Real noise (only works for Clothing1M): `--dataset clothing1m --globalize --noise_mode real --num_sampels 64000`
220-
221-
- `--num_samples` is for specifying number of training samples used for Clothing1M, the default is 64000
230+
- Real noise:
222231

232+
- `--dataset clothing1m` for Clothing1M; use `--num_samples` to set the number of training samples drawn from Clothing1M (default: 64000)
233+
- `--dataset webvision` for WebVision1.0; set `--raw_imagenet_dir` to the directory that holds the raw ImageNet validation set
234+
235+
> Once `--dataset` is set to `clothing1m` or `webvision`, the script automatically applies `--globalize --noise_mode real` for real-world noise.
236+
223237

224238

225239

@@ -244,6 +258,12 @@ $ python build_dataset_fed.py --dataset cifar10 \
244258
- Non-IID quantity skew: `--partition noniid-quantity --num_clients 10 --dir_alpha 0.1`
245259
- Non-IID Dirichlet-based label skew: `--partition noniid-labeldir --dir_alpha 0.1 --num_clients 10`
246260
- Non-IID quantity-based label skew: `--partition noniid-#label --major_classes_num 5 --num_clients 10`
261+
- WebVision: `--dataset webvision`
262+
- IID: `--partition iid --num_clients 10`
263+
- Non-IID quantity skew: `--partition noniid-quantity --num_clients 10 --dir_alpha 0.1`
264+
- Non-IID Dirichlet-based label skew: `--partition noniid-labeldir --dir_alpha 0.6 --num_clients 10`
265+
- Non-IID quantity-based label skew: `--partition noniid-#label --major_classes_num 20 --num_clients 10`
266+
247267

248268

249269

@@ -544,6 +564,7 @@ For more scripts, please check [scripts](./scripts/) folder.
544564
### Datasets
545565

546566
- [ ] Include mini-ImageNet (synthetic noise)
567+
- [x] Include WebVision (real-world noise)
547568
- [ ] Include Food-101N (real-world noise)
548569
- [ ] Include ANIMAL-10N (real-world noise)
549570

build_dataset_fed.py

+102-4
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
- Non-IID-xxx
4545
- Non-IID-xxx
4646
"""
47+
4748
import argparse
4849

4950
# from progress.bar import Bar as Bar
@@ -58,7 +59,10 @@
5859
FedNLLMNIST,
5960
FedNLLSVHN,
6061
FedNLLClothing1M,
62+
FedNLLWebVision,
63+
FedNLLSynthetic,
6164
)
65+
from fednoisy.data.NLLData import functional as nllF
6266

6367

6468
def read_args():
@@ -74,9 +78,25 @@ def read_args():
7478
"--partition",
7579
default="iid",
7680
type=str,
77-
choices=["iid", "noniid-#label", "noniid-labeldir", "noniid-quantity"],
81+
choices=[
82+
"iid",
83+
"noniid",
84+
"noniid-#label",
85+
"noniid-labeldir",
86+
"noniid-quantity",
87+
],
7888
help="Data partition scheme for federated setting.",
7989
)
90+
parser.add_argument(
91+
"--personalize",
92+
action="store_true",
93+
help="Whether use personalized local test set for each client. If True, then each client's class ratio of local test set is same as the training set",
94+
)
95+
parser.add_argument(
96+
"--balance",
97+
action="store_true",
98+
help="whether use balance partition for Synthetic dataset.",
99+
)
80100
parser.add_argument(
81101
"--num_clients",
82102
default=10,
@@ -140,24 +160,56 @@ def read_args():
140160
"--num_samples",
141161
default=32 * 2 * 1000,
142162
type=int,
143-
help="Number of samples used for Clothing1M training. Defaults as 64000.",
163+
help="Number of samples used for Clothing1M/Synthetic data training. Defaults as 64000.",
164+
)
165+
166+
parser.add_argument(
167+
"--num_test_samples",
168+
default=1000,
169+
type=int,
170+
help="Number of test samples for synthetic dataset.",
171+
)
172+
parser.add_argument(
173+
"--feature_dim",
174+
type=int,
175+
default=100,
176+
help="Feature dimension for synthetic dataset.",
177+
)
178+
parser.add_argument(
179+
"--use_bias",
180+
action="store_true",
181+
help="Whether to use bias in synthetic data generation. If True, Y = Xw + b + ε; otherwise Y = Xw + ε.",
144182
)
145183

146184
# ----Dataset path options----
147185
parser.add_argument(
148186
"--dataset",
149187
default="cifar10",
150188
type=str,
151-
choices=["mnist", "cifar10", "cifar100", "svhn", "clothing1m", "webvision"],
189+
choices=[
190+
"mnist",
191+
"cifar10",
192+
"cifar100",
193+
"svhn",
194+
"clothing1m",
195+
"webvision",
196+
"synthetic",
197+
],
152198
help="Dataset for experiment. Current support: ['mnist', 'cifar10', "
153-
"'cifar100', 'svhn', 'clothing1m', 'webvision']",
199+
"'cifar100', 'svhn', 'clothing1m', 'webvision', 'synthetic]",
154200
)
155201
parser.add_argument(
156202
"--raw_data_dir",
157203
default="../data",
158204
type=str,
159205
help="Directory for raw dataset download",
160206
)
207+
parser.add_argument(
208+
"--raw_imagenet_dir",
209+
default="../rawdata/imagenet",
210+
type=str,
211+
help="Directory for raw dataset download",
212+
)
161213
parser.add_argument(
162214
"--data_dir",
163215
default="../noisy_label_data",
@@ -242,9 +294,11 @@ def read_args():
242294
max_noise_ratio=args.max_noise_ratio,
243295
root_dir=args.raw_data_dir,
244296
out_dir=args.data_dir,
297+
personalize=args.personalize,
245298
)
246299
nll_cifar10.create_nll_scene(seed=args.seed)
247300
nll_cifar10.save_nll_scene()
301+
248302
elif args.dataset == "cifar100":
249303
nll_cifar100 = FedNLLCIFAR100(
250304
globalize=args.globalize,
@@ -258,9 +312,11 @@ def read_args():
258312
max_noise_ratio=args.max_noise_ratio,
259313
root_dir=args.raw_data_dir,
260314
out_dir=args.data_dir,
315+
personalize=args.personalize,
261316
)
262317
nll_cifar100.create_nll_scene(seed=args.seed)
263318
nll_cifar100.save_nll_scene()
319+
264320
elif args.dataset == "mnist":
265321
nll_mnist = FedNLLMNIST(
266322
globalize=args.globalize,
@@ -274,6 +330,7 @@ def read_args():
274330
max_noise_ratio=args.max_noise_ratio,
275331
root_dir=args.raw_data_dir,
276332
out_dir=args.data_dir,
333+
personalize=args.personalize,
277334
)
278335
nll_mnist.create_nll_scene(seed=args.seed)
279336
nll_mnist.save_nll_scene()
@@ -291,11 +348,15 @@ def read_args():
291348
max_noise_ratio=args.max_noise_ratio,
292349
root_dir=args.raw_data_dir,
293350
out_dir=args.data_dir,
351+
personalize=args.personalize,
294352
)
295353
nll_svhn.create_nll_scene(seed=args.seed)
296354
nll_svhn.save_nll_scene()
297355

298356
elif args.dataset == "clothing1m":
357+
args.noise_mode = "real"
358+
args.globalize = True
359+
args.noise_ratio = 0.39
299360
nll_clothing1m = FedNLLClothing1M(
300361
root_dir=args.raw_data_dir,
301362
out_dir=args.data_dir,
@@ -308,5 +369,42 @@ def read_args():
308369
nll_clothing1m.create_nll_scene(seed=args.seed)
309370
nll_clothing1m.save_nll_scene()
310371

372+
elif args.dataset == "webvision":
373+
args.noise_mode = "real"
374+
args.globalize = True
375+
args.noise_ratio = 0.20
376+
nll_webvision = FedNLLWebVision(
377+
root_dir=args.raw_data_dir,
378+
imagenet_root_dir=args.raw_imagenet_dir,
379+
out_dir=args.data_dir,
380+
partition=args.partition,
381+
num_clients=args.num_clients,
382+
dir_alpha=args.dir_alpha,
383+
major_classes_num=args.major_classes_num,
384+
)
385+
nll_webvision.create_nll_scene(seed=args.seed)
386+
nll_webvision.save_nll_scene()
387+
388+
elif args.dataset == "synthetic":
389+
nll_synthetic = FedNLLSynthetic(
390+
out_dir=args.data_dir,
391+
num_clients=args.num_clients,
392+
init_mu=0,
393+
init_sigma=1,
394+
partition=args.partition,
395+
balance=args.balance,
396+
train_sample_num=args.num_samples,
397+
test_sample_num=args.num_test_samples,
398+
feature_dim=args.feature_dim,
399+
use_bias=args.use_bias,
400+
dir_alpha=args.dir_alpha,
401+
)
402+
args.init_mu = 0
403+
args.init_sigma = 1
404+
nll_synthetic.create_nll_scene(seed=args.seed)
405+
nll_synthetic.save_nll_scene()
406+
nll_name = nllF.FedNLL_name(**vars(args))
407+
print(f"{nll_name}")
408+
311409
else:
312410
raise ValueError(f"dataset='{args.dataset}' is not supported!")

fednoisy/algorithms/fedavg/client.py

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@ def train(self, model_parameters, train_loader):
107107
data_size = len(train_loader.dataset)
108108

109109
for epoch in range(self.epochs):
110+
self._model.train() # TODO
110111
self._LOGGER.info(
111112
f"Round {self.round} client-{self.cur_cid} local train epoch [{epoch}/{self.epochs}]"
112113
)
@@ -137,6 +138,8 @@ def evaluate(self):
137138
multimodel=multimodel,
138139
)
139140

141+
# TODO: add ImageNet evaluation code
142+
140143
return loss_, acc_
141144

142145

fednoisy/algorithms/fedavg/main.py

+4
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,10 @@
5555
args.noise_mode = "real"
5656
args.globalize = True
5757
args.noise_ratio = 0.39
58+
elif args.dataset == "webvision":
59+
args.noise_mode = "real"
60+
args.globalize = True
61+
args.noise_ratio = 0.20
5862

5963
nll_name = nllF.FedNLL_name(**vars(args))
6064
exp_name = make_exp_name("fedavg", args)

fednoisy/algorithms/fedavg/misc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def read_fednll_args():
2929
parser.add_argument("--batch_size", type=int, default=128)
3030
parser.add_argument("--epochs", type=int, default=2)
3131
parser.add_argument("--lr", type=float, default=0.01)
32-
parser.add_argument("--weight_decay", type=float, default=1e-3)
32+
parser.add_argument("--weight_decay", type=float, default=5e-4)
3333
parser.add_argument("--momentum", type=float, default=0.9)
3434

3535
# ==== FedNLL data args ====

fednoisy/algorithms/singleset/__init__.py

Whitespace-only changes.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
"""Train a model on one client's noisy-label split and report test accuracy.

The script reads its configuration from ``read_singlenll_args()``, builds the
training loader for ``args.client_id`` from ``FedNLLClientDataset``, trains
with plain SGD on the *noisy* labels, and evaluates on a test loader after
every epoch.
"""
import torch
import argparse
import sys
import os
from copy import deepcopy
from typing import Dict, Tuple, List, Optional

from torch import nn
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms

from fedlab.contrib.dataset.basic_dataset import FedDataset
from fedlab.utils.logger import Logger

# `evaluate` was called below without ever being imported, which raised
# NameError at the end of the first epoch.
# NOTE(review): FedLab's helper matches the call signature
# (model, criterion, test_loader) -> (loss, acc); confirm it is the intended one.
from fedlab.utils.functional import evaluate

# Make the `fednoisy` package importable when the script is launched from the
# repository root.
sys.path.append(os.getcwd())
from fednoisy.data.NLLData import functional as nllF
from fednoisy.data import (
    CLASS_NUM,
    TRAIN_SAMPLE_NUM,
    TEST_SAMPLE_NUM,
    CIFAR10_TRANSITION_MATRIX,
    NORM_VALUES,
)

# from fednoisy.utils import misc as misc
from fednoisy.data.dataset import FedNLLClientDataset
from fednoisy.utils.misc import setup_seed, make_dirs, make_exp_name, AverageMeter
from fednoisy.models.build_model import build_model
from fednoisy.algorithms.singleset.misc import read_singlenll_args


args = read_singlenll_args()
args.cuda = torch.cuda.is_available()

setup_seed(args.seed)
nll_name = nllF.FedNLL_name(**vars(args))

# ==== Data loader
test_transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(*NORM_VALUES[args.dataset]),
    ]
)
# NOTE(review): the test set is always CIFAR-10 even though the normalisation
# values and the class count follow `args.dataset`; any other dataset choice
# would be evaluated against the wrong data. Confirm this script is meant to
# be CIFAR-10 only.
test_dataset = torchvision.datasets.CIFAR10(
    train=False, root=args.raw_data_dir, transform=test_transform
)
test_loader = DataLoader(
    dataset=test_dataset, batch_size=args.batch_size, shuffle=False
)
dataset = FedNLLClientDataset(args)
train_loader = dataset.get_dataloader(args.client_id, args.batch_size)

# ==== Get model
model = build_model(args.model, CLASS_NUM[args.dataset])
model = model.to(args.device)

# === Optimizer
# NOTE(review): momentum is not passed to SGD here — confirm that ignoring
# `--momentum` is intentional.
optimizer = torch.optim.SGD(model.parameters(), args.lr, weight_decay=args.weight_decay)
criterion = torch.nn.CrossEntropyLoss()

# ==== Setup log
logger = Logger(log_name="SingleSetTrainer")

# ==== Training
for epoch in range(args.epochs):
    logger.info(f"Epoch [{epoch}/{args.epochs}] Client-{args.client_id} local training")
    # Re-enter train mode every epoch; presumably `evaluate` switches the
    # model to eval mode (as FedLab's helper does) — TODO confirm.
    model.train()
    # Each batch is a 3-tuple; only the images and the noisy labels are used,
    # the middle element (named `labels` originally) is ignored.
    for imgs, _clean_labels, noisy_labels in train_loader:
        if args.cuda:
            imgs = imgs.cuda(args.device)
            noisy_labels = noisy_labels.cuda(args.device)

        output = model(imgs)
        loss = criterion(output, noisy_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    loss_, acc_ = evaluate(model, nn.CrossEntropyLoss(), test_loader)
    logger.info(
        f"Epoch [{epoch}/{args.epochs}] Client-{args.client_id} test accuracy: {acc_*100}%"
    )

0 commit comments

Comments
 (0)