Commit b7f1b05

Lint whole repo

Author: Neel Kant
Parent: c99fa80


63 files changed: +1103 / -990 lines
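The changes below are the kind a PEP 8 autoformatter produces: spaces added around arithmetic operators, inline-comment spacing normalized, bare except clauses made explicit, and stray blank lines removed. The commit does not record which tool was used; as a hedged illustration only, a whole-repo pass with autopep8 (an assumption, not something stated in the commit) could look like this:

from pathlib import Path

import autopep8  # assumption: autopep8 is one common tool for this kind of clean-up

# Hedged sketch of a whole-repo lint pass; the actual tool and flags behind this
# commit are not recorded, so treat this as an illustration only.
for path in Path('megatron').rglob('*.py'):
    original = path.read_text()
    fixed = autopep8.fix_code(original)  # normalises operator spacing, comments, blank lines
    if fixed != original:
        path.write_text(fixed)
        print(f'reformatted {path}')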

megatron/arguments.py
Lines changed: 0 additions & 3 deletions

@@ -357,7 +357,6 @@ def _add_gpt2_args(parser):
     return parser
 
 
-
 def add_data_args_(parser):
     """Train/valid/test data arguments."""
 
@@ -367,6 +366,4 @@ def add_data_args_(parser):
                        choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                        help='Which data loader to use. Default varies by model.')
 
-
     return parser
-
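For context, the choices/help lines kept in the second hunk belong to an argparse option. A minimal self-contained sketch follows; the flag name --data-loader and its default are hypothetical reconstructions for illustration, only the choices list and help text appear in the diff.

import argparse

# Hypothetical reconstruction of the option whose choices/help lines appear above;
# the flag name and default are assumptions, not taken from the diff.
parser = argparse.ArgumentParser()
parser.add_argument('--data-loader', type=str, default=None,
                    choices=['raw', 'lazy', 'tfrecords', 'numpy', 'binary'],
                    help='Which data loader to use. Default varies by model.')

args = parser.parse_args(['--data-loader', 'binary'])
print(args.data_loader)  # -> binary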

megatron/checkpointing.py
Lines changed: 3 additions & 3 deletions

@@ -67,7 +67,7 @@ def get_checkpoint_name(checkpoints_path, iteration,
     directory = 'iter_{:07d}'.format(iteration)
     return os.path.join(checkpoints_path, directory,
                         'mp_rank_{:02d}'.format(
-                            mpu.get_model_parallel_rank() if mp_rank is None \
+                            mpu.get_model_parallel_rank() if mp_rank is None
                             else mp_rank),
                         'model_optim_rng.pt')
 
@@ -179,7 +179,7 @@ def load_checkpoint(model, optimizer, lr_scheduler):
                'megatron.fp16.loss_scaler']
        state_dict = torch.load(checkpoint_name, map_location='cpu')
        sys.modules.pop('fp16.loss_scaler', None)
-    except:
+    except BaseException:
        print_rank_0('could not load the checkpoint')
        sys.exit()
 
@@ -190,7 +190,7 @@ def load_checkpoint(model, optimizer, lr_scheduler):
    try:
        iteration = state_dict['iteration']
    except KeyError:
-        try: # Backward compatible with older checkpoints
+        try:  # Backward compatible with older checkpoints
            iteration = state_dict['total_iters']
        except KeyError:
            print_rank_0('A metadata file exists but unable to load '
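The bare except: replaced above is what flake8/pycodestyle reports as E722. except BaseException: is behaviourally identical; it just spells out that the handler also catches SystemExit and KeyboardInterrupt. A minimal sketch of that reach:

# Minimal illustration: `except BaseException:` catches the same exceptions a
# bare `except:` would, including KeyboardInterrupt, which a narrower
# `except Exception:` would let propagate.
def interrupted():
    raise KeyboardInterrupt

try:
    interrupted()
except BaseException as exc:   # equivalent reach to a bare `except:`
    print(type(exc).__name__)  # -> KeyboardInterrupt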

megatron/data/__init__.py
Lines changed: 0 additions & 2 deletions

@@ -1,3 +1 @@
 from . import indexed_dataset
-
-

megatron/data/bert_dataset.py
Lines changed: 4 additions & 6 deletions

@@ -47,6 +47,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
 
     # Print stats about the splits.
     print_rank_0(' > dataset split:')
+
     def print_split_stats(name, index):
         print_rank_0(' {}:'.format(name))
         print_rank_0(' document indices in [{}, {}) total of {} '
@@ -113,7 +114,6 @@ def __init__(self, name, indexed_dataset, data_prefix,
         # Dataset.
         self.indexed_dataset = indexed_dataset
 
-
         # Build the samples mapping.
         self.samples_mapping = get_samples_mapping_(self.indexed_dataset,
                                                     data_prefix,
@@ -133,11 +133,9 @@ def __init__(self, name, indexed_dataset, data_prefix,
         self.mask_id = tokenizer.mask
         self.pad_id = tokenizer.pad
 
-
     def __len__(self):
         return self.samples_mapping.shape[0]
 
-
     def __getitem__(self, idx):
 
         start_index, end_index, seq_length = self.samples_mapping[idx]
@@ -148,7 +146,7 @@ def __getitem__(self, idx):
         # python randint is inclusive whereas the numpy one is exclusive.
         np_rng = np.random.RandomState(seed=(self.seed + idx))
         return build_training_sample(sample, seq_length,
-                                     self.max_seq_length, # needed for padding
+                                     self.max_seq_length,  # needed for padding
                                      self.vocab_id_list,
                                      self.vocab_id_to_token_dict,
                                      self.cls_id, self.sep_id,
@@ -192,7 +190,7 @@ def get_train_valid_test_split_(splits_string, size):
     splits = splits[:3]
     splits_sum = sum(splits)
     assert splits_sum > 0.0
-    splits = [split/splits_sum for split in splits]
+    splits = [split / splits_sum for split in splits]
     splits_index = [0]
     for index, split in enumerate(splits):
         splits_index.append(splits_index[index] +
@@ -254,7 +252,7 @@ def get_samples_mapping_(indexed_dataset,
                                          indexed_dataset.sizes,
                                          num_epochs,
                                          max_num_samples,
-                                         max_seq_length-3, # account for added tokens
+                                         max_seq_length - 3,  # account for added tokens
                                          short_seq_prob,
                                          seed,
                                          verbose)
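The normalisation reformatted in get_train_valid_test_split_ divides each split weight by their sum, so the three splits become fractions of the document count before being turned into index boundaries. A standalone sketch of that arithmetic (an illustration, not the repository's exact helper):

# Hedged sketch of the split arithmetic around the reformatted line: weights are
# normalised to fractions and then mapped onto document-index boundaries.
def split_boundaries(weights, size):
    total = sum(weights)
    assert total > 0.0
    fractions = [w / total for w in weights]      # e.g. [0.949, 0.05, 0.001]
    bounds, acc = [0], 0.0
    for frac in fractions:
        acc += frac
        bounds.append(int(round(acc * size)))
    return bounds

print(split_boundaries([949, 50, 1], 10000))  # -> [0, 9490, 9990, 10000]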

megatron/data/gpt2_dataset.py
Lines changed: 8 additions & 10 deletions

@@ -42,6 +42,7 @@ def build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
 
     # Print stats about the splits.
     print_rank_0(' > dataset split:')
+
     def print_split_stats(name, index):
         print_rank_0(' {}:'.format(name))
         print_rank_0(' document indices in [{}, {}) total of {} '
@@ -54,7 +55,7 @@ def print_split_stats(name, index):
     def build_dataset(index, name):
         dataset = None
         if splits[index + 1] > splits[index]:
-            documents = np.arange(start=splits[index], stop=splits[index+1],
+            documents = np.arange(start=splits[index], stop=splits[index + 1],
                                   step=1, dtype=np.int32)
             dataset = GPT2Dataset(name, data_prefix,
                                   documents, indexed_dataset,
@@ -102,21 +103,19 @@ def __init__(self, name, data_prefix, documents, indexed_dataset,
             self.name, data_prefix, documents, self.indexed_dataset.sizes,
             num_samples, seq_length, seed)
 
-
     def __len__(self):
         # -1 is due to data structure used to retieve the index:
         # sample i --> [sample_idx[i], sample_idx[i+1])
         return self.sample_idx.shape[0] - 1
 
-
     def __getitem__(self, idx):
         # Get the shuffled index.
         idx = self.shuffle_idx[idx]
         # Start and end documents and offsets.
         doc_index_f = self.sample_idx[idx][0]
-        doc_index_l = self.sample_idx[idx+1][0]
+        doc_index_l = self.sample_idx[idx + 1][0]
         offset_f = self.sample_idx[idx][1]
-        offset_l = self.sample_idx[idx+1][1]
+        offset_l = self.sample_idx[idx + 1][1]
         # If we are within the same document, just extract the chunk.
         if doc_index_f == doc_index_l:
             sample = self.indexed_dataset.get(self.doc_idx[doc_index_f],
@@ -127,18 +126,17 @@ def __getitem__(self, idx):
             sample_list = [self.indexed_dataset.get(self.doc_idx[doc_index_f],
                                                     offset=offset_f)]
             # Loop over all in between documents and add the entire document.
-            for i in range(doc_index_f+1, doc_index_l):
+            for i in range(doc_index_f + 1, doc_index_l):
                 sample_list.append(self.indexed_dataset.get(self.doc_idx[i]))
             # And finally add the relevant portion of last document.
             sample_list.append(self.indexed_dataset.get(
                 self.doc_idx[doc_index_l],
-                length=offset_l+1))
+                length=offset_l + 1))
             sample = np.concatenate(sample_list)
 
         return {'text': np.array(sample, dtype=np.int64)}
 
 
-
 def _build_index_mappings(name, data_prefix, documents, sizes,
                           num_samples, seq_length, seed):
     """Build doc-idx, sample-idx, and shuffle-idx.
@@ -185,7 +183,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         assert sizes.dtype == np.int32
         sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                               num_epochs, tokens_per_epoch)
-        #sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
+        # sample_idx = _build_sample_idx(sizes, doc_idx, seq_length,
         #                               num_epochs, tokens_per_epoch)
         np.save(sample_idx_filename, sample_idx, allow_pickle=True)
         print_rank_0(' > elasped time to build and save sample-idx mapping '
@@ -194,7 +192,7 @@ def _build_index_mappings(name, data_prefix, documents, sizes,
         start_time = time.time()
         # -1 is due to data structure used to retieve the index:
         # sample i --> [sample_idx[i], sample_idx[i+1])
-        shuffle_idx = _build_shuffle_idx(sample_idx.shape[0]-1, np_rng)
+        shuffle_idx = _build_shuffle_idx(sample_idx.shape[0] - 1, np_rng)
         np.save(shuffle_idx_filename, shuffle_idx, allow_pickle=True)
         print_rank_0(' > elasped time to build and save shuffle-idx mapping'
                      ' (seconds): {:4f}'.format(time.time() - start_time))
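The __getitem__ hunks above rely on sample_idx holding a (document index, offset) pair per sample boundary, so sample i spans everything from sample_idx[i] up to and including sample_idx[i + 1]. A toy illustration of that stitching, with made-up data rather than the repo's code:

import numpy as np

# Toy data: three "documents" of token ids and hand-written sample boundaries.
docs = [np.array([0, 1, 2, 3]), np.array([4, 5]), np.array([6, 7, 8])]
sample_idx = np.array([[0, 0],   # sample 0 starts at doc 0, offset 0
                       [1, 1],   # sample 1 starts at doc 1, offset 1
                       [2, 2]])  # sentinel marking the end of sample 1

def get_sample(idx):
    doc_f, offset_f = sample_idx[idx]
    doc_l, offset_l = sample_idx[idx + 1]
    if doc_f == doc_l:
        return docs[doc_f][offset_f:offset_l + 1]
    pieces = [docs[doc_f][offset_f:]]                     # rest of the first doc
    pieces += [docs[i] for i in range(doc_f + 1, doc_l)]  # whole in-between docs
    pieces.append(docs[doc_l][:offset_l + 1])             # head of the last doc
    return np.concatenate(pieces)

print(get_sample(0))  # [0 1 2 3 4 5] -- crosses from doc 0 into doc 1
print(get_sample(1))  # [5 6 7 8]     -- shares its boundary token with sample 0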

megatron/data/indexed_dataset.py
Lines changed: 13 additions & 5 deletions

@@ -20,6 +20,7 @@
 import torch
 from megatron import print_rank_0
 
+
 def __best_fitting_dtype(vocab_size=None):
     if vocab_size is not None and vocab_size < 65500:
         return np.uint16
@@ -109,13 +110,15 @@ def index_file_path(prefix_path):
 def data_file_path(prefix_path):
     return prefix_path + '.bin'
 
+
 def create_doc_idx(sizes):
     doc_idx = [0]
     for i, s in enumerate(sizes):
         if s == 0:
-            doc_idx.append(i+1)
+            doc_idx.append(i + 1)
     return doc_idx
 
+
 class IndexedDataset(torch.utils.data.Dataset):
     """Loader for IndexedDataset"""
     _HDR_MAGIC = b'TNTIDX\x00\x00'
@@ -155,7 +158,7 @@ def __del__(self):
         if self.data_file:
             self.data_file.close()
 
-    #@lru_cache(maxsize=8)
+    # @lru_cache(maxsize=8)
     def __getitem__(self, idx):
         if not self.data_file:
             self.read_data(self.path)
@@ -235,7 +238,7 @@ def prefetch(self, indices):
            self.data_file.close()
            self.data_file = None
 
-    #@lru_cache(maxsize=8)
+    # @lru_cache(maxsize=8)
    def __getitem__(self, idx):
        if isinstance(idx, int):
            i = idx
@@ -399,13 +402,18 @@ def __init__(self, path, skip_warmup=False):
            self._bin_buffer_mmap = np.memmap(path, mode='r', order='C')
            self._bin_buffer = memoryview(self._bin_buffer_mmap)
            print_rank_0(" reading sizes...")
-            self._sizes = np.frombuffer(self._bin_buffer, dtype=np.int32, count=self._len, offset=offset)
+            self._sizes = np.frombuffer(
+                self._bin_buffer,
+                dtype=np.int32,
+                count=self._len,
+                offset=offset)
            print_rank_0(" reading pointers...")
            self._pointers = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._len,
                                           offset=offset + self._sizes.nbytes)
            print_rank_0(" reading document index...")
            self._doc_idx = np.frombuffer(self._bin_buffer, dtype=np.int64, count=self._doc_count,
                                          offset=offset + self._sizes.nbytes + self._pointers.nbytes)
+
        def __del__(self):
            self._bin_buffer_mmap._mmap.close()
            del self._bin_buffer_mmap
@@ -464,7 +472,7 @@ def __del__(self):
     def __len__(self):
         return len(self._index)
 
-    #@lru_cache(maxsize=8)
+    # @lru_cache(maxsize=8)
     def __getitem__(self, idx):
         if isinstance(idx, int):
             ptr, size = self._index[idx]
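The __best_fitting_dtype helper touched above picks np.uint16 whenever the vocabulary fits in 16 bits, which keeps the .bin token payload small. A short sketch of the effect; the int32 fallback here is an assumption for illustration:

import numpy as np

# Sketch of the dtype choice: token ids for a GPT-2-sized vocabulary (50257)
# fit in uint16, so each stored token costs 2 bytes instead of 4 or 8.
def best_fitting_dtype(vocab_size=None):
    if vocab_size is not None and vocab_size < 65500:
        return np.uint16
    return np.int32  # assumed fallback; the real helper's default may differ

tokens = np.array([10, 200, 50000], dtype=best_fitting_dtype(50257))
print(tokens.dtype, tokens.nbytes)  # -> uint16 6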

megatron/data/samplers.py
Lines changed: 4 additions & 3 deletions

@@ -81,6 +81,7 @@ class DistributedBatchSampler(data.sampler.BatchSampler):
     sampler level. This allows wrapping of arbitrary data samplers
     (sequential, random, WeightedRandomSampler, etc.) with this batch
     sampler."""
+
     def __init__(self, sampler, batch_size, drop_last, rank=-1,
                  world_size=2, wrap_last=False):
         super(DistributedBatchSampler, self).__init__(sampler, batch_size,
@@ -120,7 +121,7 @@ def __iter__(self):
     def data_iterator(self, _iter, wrap_around=False):
         """iterates through data and handles wrap around"""
         for i, idx in enumerate(_iter):
-            if i < self.wrap_around%self.batch_size:
+            if i < self.wrap_around % self.batch_size:
                 continue
             if wrap_around:
                 self.wrap_around += 1
@@ -129,6 +130,6 @@ def data_iterator(self, _iter, wrap_around=False):
 
     def _batch(self, batch):
         """extracts samples only pertaining to this worker's batch"""
-        start = self.rank*self.batch_size//self.world_size
-        end = (self.rank+1)*self.batch_size//self.world_size
+        start = self.rank * self.batch_size // self.world_size
+        end = (self.rank + 1) * self.batch_size // self.world_size
         return batch[start:end]
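The _batch slice reformatted above carves one worker's share out of the global batch. A worked example with illustrative numbers (a batch of 8 split across two ranks):

# Worked example of the slice arithmetic in _batch; the values are illustrative.
batch = list(range(8))        # a "global" batch of 8 sample indices
batch_size, world_size = 8, 2

for rank in range(world_size):
    start = rank * batch_size // world_size
    end = (rank + 1) * batch_size // world_size
    print(rank, batch[start:end])
# 0 [0, 1, 2, 3]
# 1 [4, 5, 6, 7]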

megatron/data/test/test_indexed_dataset.py
Lines changed: 13 additions & 10 deletions

@@ -2,6 +2,8 @@
 # put some code used during development and manual testing of
 # indexed_dataset.
 
+from megatron.data import indexed_dataset
+from megatron.tokenizer import build_tokenizer
 import argparse
 import os
 import sys
@@ -11,8 +13,6 @@
 script_dir = os.path.dirname(os.path.realpath(__file__))
 sys.path.append(os.path.join(script_dir, "../../../"))
 
-from megatron.tokenizer import build_tokenizer
-from megatron.data import indexed_dataset
 
 def test_indexed_dataset(args):
     ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
@@ -23,12 +23,12 @@ def test_indexed_dataset(args):
     if ds.supports_prefetch:
         # just prefetch the whole thing in test (so assume it is small)
         ds.prefetch(range(len(ds)))
-    if args.count > len(ds.doc_idx)-1:
-        args.count = len(ds.doc_idx)-1
+    if args.count > len(ds.doc_idx) - 1:
+        args.count = len(ds.doc_idx) - 1
 
     for i in range(args.count):
         start = ds.doc_idx[i]
-        end = ds.doc_idx[i+1]
+        end = ds.doc_idx[i + 1]
         ids = ds[start:end]
         print(f"Document {i}:")
         print("--------------")
@@ -39,26 +39,27 @@ def test_indexed_dataset(args):
         print(text)
         print("---")
 
+
 def test_indexed_dataset_get(args):
     ds = indexed_dataset.make_dataset(args.data, args.dataset_impl)
     tokenizer = build_tokenizer(args)
     size = ds.sizes[0]
     print(f"size: {size}")
     full = ds.get(0)
     print(full)
-    #print(tokenizer.detokenize(full.data.tolist()))
+    # print(tokenizer.detokenize(full.data.tolist()))
     print("---")
-    end = ds.get(0, offset=size-10)
+    end = ds.get(0, offset=size - 10)
    print(end)
-    #print(tokenizer.detokenize(end.data.tolist()))
+    # print(tokenizer.detokenize(end.data.tolist()))
 
     start = ds.get(0, length=10)
     print(start)
-    #print(tokenizer.detokenize(start.data.tolist()))
+    # print(tokenizer.detokenize(start.data.tolist()))
 
     part = ds.get(0, offset=2, length=8)
     print(part)
-    #print(tokenizer.detokenize(part.data.tolist()))
+    # print(tokenizer.detokenize(part.data.tolist()))
 
 # def test_albert_dataset(args):
 #     # tokenizer = FullBertTokenizer(args.vocab, do_lower_case=True)
@@ -77,6 +78,7 @@ def test_indexed_dataset_get(args):
 #         if i >= args.count-1:
 #             exit()
 
+
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument('--data', type=str, help='prefix to data files')
@@ -118,5 +120,6 @@ def main():
     # test_albert_dataset(args)
     test_indexed_dataset_get(args)
 
+
 if __name__ == "__main__":
     main()
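Condensed sketch of what test_indexed_dataset() above exercises, with a hypothetical data prefix my-corpus and the 'lazy' implementation name; both are placeholders for illustration, not values recorded in this commit:

from megatron.data import indexed_dataset

# Hypothetical prefix and implementation name, mirroring the test's flow.
ds = indexed_dataset.make_dataset('my-corpus', 'lazy')
if ds.supports_prefetch:
    ds.prefetch(range(len(ds)))           # small test data: prefetch everything

count = min(3, len(ds.doc_idx) - 1)       # doc_idx delimits documents, as in the test
for i in range(count):
    start, end = ds.doc_idx[i], ds.doc_idx[i + 1]
    ids = ds[start:end]
    print(f"Document {i}: {len(ids)} segment(s)")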
