
Merge pull request #14 from codertimo/alpha0.0.1a4
alpha-0.0.1a4 version released
codertimo authored Oct 23, 2018
2 parents 7efd2b5 + 427373c commit 0d076e0
Showing 9 changed files with 137 additions and 48 deletions.
2 changes: 1 addition & 1 deletion LICENSE
@@ -186,7 +186,7 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.

Copyright 2018 Junseong Kim, Scatter Labs, BERT contributors
Copyright 2018 Junseong Kim, Scatter Lab, BERT contributors

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
2 changes: 1 addition & 1 deletion README.md
@@ -60,7 +60,7 @@ bert-vocab -c data/corpus.small -o data/vocab.small

### 2. Train your own BERT model
```shell
bert -c data/dataset.small -v data/vocab.small -o output/bert.model
bert -c data/corpus.small -v data/vocab.small -o output/bert.model
```

## Language Model Pre-training
54 changes: 29 additions & 25 deletions bert_pytorch/__main__.py
@@ -10,27 +10,30 @@
def train():
parser = argparse.ArgumentParser()

parser.add_argument("-c", "--train_dataset", required=True, type=str)
parser.add_argument("-t", "--test_dataset", type=str, default=None)
parser.add_argument("-v", "--vocab_path", required=True, type=str)
parser.add_argument("-o", "--output_path", required=True, type=str)

parser.add_argument("-hs", "--hidden", type=int, default=256)
parser.add_argument("-l", "--layers", type=int, default=8)
parser.add_argument("-a", "--attn_heads", type=int, default=8)
parser.add_argument("-s", "--seq_len", type=int, default=20)

parser.add_argument("-b", "--batch_size", type=int, default=64)
parser.add_argument("-e", "--epochs", type=int, default=10)
parser.add_argument("-w", "--num_workers", type=int, default=5)
parser.add_argument("--with_cuda", type=bool, default=True)
parser.add_argument("--log_freq", type=int, default=10)
parser.add_argument("--corpus_lines", type=int, default=None)

parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--adam_weight_decay", type=float, default=0.01)
parser.add_argument("--adam_beta1", type=float, default=0.9)
parser.add_argument("--adam_beta2", type=float, default=0.999)
parser.add_argument("-c", "--train_dataset", required=True, type=str, help="train dataset for train bert")
parser.add_argument("-t", "--test_dataset", type=str, default=None, help="test set for evaluate train set")
parser.add_argument("-v", "--vocab_path", required=True, type=str, help="built vocab model path with bert-vocab")
parser.add_argument("-o", "--output_path", required=True, type=str, help="ex)output/bert.model")

parser.add_argument("-hs", "--hidden", type=int, default=256, help="hidden size of transformer model")
parser.add_argument("-l", "--layers", type=int, default=8, help="number of layers")
parser.add_argument("-a", "--attn_heads", type=int, default=8, help="number of attention heads")
parser.add_argument("-s", "--seq_len", type=int, default=20, help="maximum sequence len")

parser.add_argument("-b", "--batch_size", type=int, default=64, help="number of batch_size")
parser.add_argument("-e", "--epochs", type=int, default=10, help="number of epochs")
parser.add_argument("-w", "--num_workers", type=int, default=5, help="dataloader worker size")

parser.add_argument("--with_cuda", type=bool, default=True, help="training with CUDA: true, or false")
parser.add_argument("--log_freq", type=int, default=10, help="printing loss every n iter: setting n")
parser.add_argument("--corpus_lines", type=int, default=None, help="total number of lines in corpus")
parser.add_argument("--cuda_devices", type=int, nargs='+', default=None, help="CUDA device ids")
parser.add_argument("--on_memory", type=bool, default=True, help="Loading on memory: true or false")

parser.add_argument("--lr", type=float, default=1e-3, help="learning rate of adam")
parser.add_argument("--adam_weight_decay", type=float, default=0.01, help="weight_decay of adam")
parser.add_argument("--adam_beta1", type=float, default=0.9, help="adam first beta value")
parser.add_argument("--adam_beta2", type=float, default=0.999, help="adam first beta value")

args = parser.parse_args()

@@ -39,11 +42,12 @@ def train():
print("Vocab Size: ", len(vocab))

print("Loading Train Dataset", args.train_dataset)
train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len, corpus_lines=args.corpus_lines)
train_dataset = BERTDataset(args.train_dataset, vocab, seq_len=args.seq_len,
corpus_lines=args.corpus_lines, on_memory=args.on_memory)

print("Loading Test Dataset", args.test_dataset)
test_dataset = BERTDataset(args.test_dataset, vocab,
seq_len=args.seq_len) if args.test_dataset is not None else None
test_dataset = BERTDataset(args.test_dataset, vocab, seq_len=args.seq_len, on_memory=args.on_memory) \
if args.test_dataset is not None else None

print("Creating Dataloader")
train_data_loader = DataLoader(train_dataset, batch_size=args.batch_size, num_workers=args.num_workers)
@@ -56,7 +60,7 @@
print("Creating BERT Trainer")
trainer = BERTTrainer(bert, len(vocab), train_dataloader=train_data_loader, test_dataloader=test_data_loader,
lr=args.lr, betas=(args.adam_beta1, args.adam_beta2), weight_decay=args.adam_weight_decay,
with_cuda=args.with_cuda, log_freq=args.log_freq)
with_cuda=args.with_cuda, cuda_devices=args.cuda_devices, log_freq=args.log_freq)

print("Training Start")
for epoch in range(args.epochs):
70 changes: 59 additions & 11 deletions bert_pytorch/dataset/dataset.py
@@ -5,19 +5,37 @@


class BERTDataset(Dataset):
def __init__(self, corpus_path, vocab, seq_len, encoding="utf-8", corpus_lines=None):
def __init__(self, corpus_path, vocab, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
self.vocab = vocab
self.seq_len = seq_len

self.on_memory = on_memory
self.corpus_lines = corpus_lines
self.corpus_path = corpus_path
self.encoding = encoding

with open(corpus_path, "r", encoding=encoding) as f:
self.datas = [line[:-1].split("\t")
for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)]
if self.corpus_lines is None and not on_memory:
for _ in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines):
self.corpus_lines += 1

if on_memory:
self.lines = [line[:-1].split("\t")
for line in tqdm.tqdm(f, desc="Loading Dataset", total=corpus_lines)]
self.corpus_lines = len(self.lines)

if not on_memory:
self.file = open(corpus_path, "r", encoding=encoding)
self.random_file = open(corpus_path, "r", encoding=encoding)

for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)):
self.random_file.__next__()

def __len__(self):
return len(self.datas)
return self.corpus_lines

def __getitem__(self, item):
t1, (t2, is_next_label) = self.datas[item][0], self.random_sent(item)
t1, t2, is_next_label = self.random_sent(item)
t1_random, t1_label = self.random_word(t1)
t2_random, t2_label = self.random_word(t2)

@@ -49,16 +67,18 @@ def random_word(self, sentence):
for i, token in enumerate(tokens):
prob = random.random()
if prob < 0.15:
# 80% randomly change token to make token
if prob < prob * 0.8:
prob /= 0.15

# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = self.vocab.mask_index

# 10% randomly change token to random token
elif prob * 0.8 <= prob < prob * 0.9:
elif prob < 0.9:
tokens[i] = random.randrange(len(self.vocab))

# 10% randomly change token to current token
elif prob >= prob * 0.9:
else:
tokens[i] = self.vocab.stoi.get(token, self.vocab.unk_index)

output_label.append(self.vocab.stoi.get(token, self.vocab.unk_index))
@@ -70,8 +90,36 @@ def random_word(self, sentence):
return tokens, output_label

def random_sent(self, index):
t1, t2 = self.get_corpus_line(index)

# output_text, label(isNotNext:0, isNext:1)
if random.random() > 0.5:
return self.datas[index][1], 1
return t1, t2, 1
else:
return t1, self.get_random_line(), 0

def get_corpus_line(self, item):
if self.on_memory:
return self.lines[item][0], self.lines[item][1]
else:
return self.datas[random.randrange(len(self.datas))][1], 0
line = self.file.__next__()
if line is None:
self.file.close()
self.file = open(self.corpus_path, "r", encoding=self.encoding)
line = self.file.__next__()

t1, t2 = line[:-1].split("\t")
return t1, t2

def get_random_line(self):
if self.on_memory:
return self.lines[random.randrange(len(self.lines))][1]

line = self.file.__next__()
if line is None:
self.file.close()
self.file = open(self.corpus_path, "r", encoding=self.encoding)
for _ in range(random.randint(self.corpus_lines if self.corpus_lines < 1000 else 1000)):
self.random_file.__next__()
line = self.random_file.__next__()
return line[:-1].split("\t")[1]
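
The rewritten `random_word` draws a single probability per token and renormalizes it by 0.15, so the same draw also selects the 80/10/10 corruption branch (the old comparisons such as `prob < prob * 0.8` could never pick the mask branch for a positive `prob`). A standalone sketch of that decision rule, using a hypothetical mask id and vocab size in place of the repo's `vocab` object:

```python
import random

MASK_ID, VOCAB_SIZE = 4, 30000   # hypothetical mask index and vocab size

def mask_token_id(token_id):
    """Return (possibly corrupted id, label id or None) per BERT's 15% / 80-10-10 rule."""
    prob = random.random()
    if prob < 0.15:
        prob /= 0.15                 # rescale the draw to [0, 1) inside the 15% branch
        if prob < 0.8:               # 80% of the 15%: replace with the mask token
            return MASK_ID, token_id
        if prob < 0.9:               # 10%: replace with a random vocabulary id
            return random.randrange(VOCAB_SIZE), token_id
        return token_id, token_id    # 10%: keep the token but still predict it
    return token_id, None            # remaining 85%: untouched, no MLM target
```

In the dataset itself this rule runs inside `random_word`, which records the original token id as the masked-LM prediction target for corrupted positions.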
2 changes: 1 addition & 1 deletion bert_pytorch/model/embedding/position.py
@@ -13,7 +13,7 @@ def __init__(self, d_model, max_len=512):
pe.require_grad = False

position = torch.arange(0, max_len).float().unsqueeze(1)
div_term = (torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)).float().exp()
div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()

pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
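
The change to `div_term` casts the index tensor to float before scaling it; both forms are meant to encode 10000^(-2i/d_model), the standard sinusoidal frequency term. A minimal sanity check of that closed form (a sketch, assuming only `torch` and `math`):

```python
import math
import torch

d_model = 16
# form used after this change: cast to float before scaling
div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp()
# closed form it encodes: 1 / 10000^(2i / d_model)
expected = 1.0 / torch.pow(torch.tensor(10000.0), torch.arange(0, d_model, 2).float() / d_model)
print(torch.allclose(div_term, expected))  # True
```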
35 changes: 35 additions & 0 deletions bert_pytorch/trainer/optim_schedule.py
@@ -0,0 +1,35 @@
'''A wrapper class for optimizer '''
import numpy as np


class ScheduledOptim():
'''A simple wrapper class for learning rate scheduling'''

def __init__(self, optimizer, d_model, n_warmup_steps):
self._optimizer = optimizer
self.n_warmup_steps = n_warmup_steps
self.n_current_steps = 0
self.init_lr = np.power(d_model, -0.5)

def step_and_update_lr(self):
"Step with the inner optimizer"
self._update_learning_rate()
self._optimizer.step()

def zero_grad(self):
"Zero out the gradients by the inner optimizer"
self._optimizer.zero_grad()

def _get_lr_scale(self):
return np.min([
np.power(self.n_current_steps, -0.5),
np.power(self.n_warmup_steps, -1.5) * self.n_current_steps])

def _update_learning_rate(self):
''' Learning rate scheduling per step '''

self.n_current_steps += 1
lr = self.init_lr * self._get_lr_scale()

for param_group in self._optimizer.param_groups:
param_group['lr'] = lr
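
`ScheduledOptim` implements the Transformer warmup schedule, lr = d_model^(-0.5) * min(step^(-0.5), step * warmup^(-1.5)), and is stepped in place of the raw optimizer (see the pretrain.py changes below). A minimal usage sketch; the linear layer is a stand-in for the real model, and the import path simply mirrors the file location above:

```python
import torch
from torch.optim import Adam
from bert_pytorch.trainer.optim_schedule import ScheduledOptim

model = torch.nn.Linear(256, 2)            # stand-in for BERTLM
optim = Adam(model.parameters(), lr=1e-3)  # the wrapper overwrites lr on every step
schedule = ScheduledOptim(optim, d_model=256, n_warmup_steps=10000)

loss = model(torch.randn(8, 256)).pow(2).mean()
schedule.zero_grad()
loss.backward()
schedule.step_and_update_lr()              # updates lr, then steps the inner Adam
print(optim.param_groups[0]["lr"])         # d_model^-0.5 * warmup^-1.5 after step 1
```

The learning rate rises linearly until `n_warmup_steps` is reached and then decays as step^(-0.5).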
14 changes: 8 additions & 6 deletions bert_pytorch/trainer/pretrain.py
@@ -4,6 +4,7 @@
from torch.utils.data import DataLoader

from ..model import BERTLM, BERT
from .optim_schedule import ScheduledOptim

import tqdm

@@ -21,8 +22,8 @@ class BERTTrainer:

def __init__(self, bert: BERT, vocab_size: int,
train_dataloader: DataLoader, test_dataloader: DataLoader = None,
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01,
with_cuda: bool = True, log_freq: int = 10):
lr: float = 1e-4, betas=(0.9, 0.999), weight_decay: float = 0.01, warmup_steps=10000,
with_cuda: bool = True, cuda_devices=None, log_freq: int = 10):
"""
:param bert: BERT model which you want to train
:param vocab_size: total word vocab size
@@ -45,16 +46,17 @@ def __init__(self, bert: BERT, vocab_size: int,
self.model = BERTLM(bert, vocab_size).to(self.device)

# Distributed GPU training if CUDA can detect more than 1 GPU
if torch.cuda.device_count() > 1:
if with_cuda and torch.cuda.device_count() > 1:
print("Using %d GPUS for BERT" % torch.cuda.device_count())
self.model = nn.DataParallel(self.model)
self.model = nn.DataParallel(self.model, device_ids=cuda_devices)

# Setting the train and test data loader
self.train_data = train_dataloader
self.test_data = test_dataloader

# Setting the Adam optimizer with hyper-param
self.optim = Adam(self.model.parameters(), lr=lr, betas=betas, weight_decay=weight_decay)
self.optim_schedule = ScheduledOptim(self.optim, self.bert.hidden, n_warmup_steps=warmup_steps)

# Using Negative Log Likelihood Loss function for predicting the masked_token
self.criterion = nn.NLLLoss(ignore_index=0)
@@ -110,9 +112,9 @@ def iteration(self, epoch, data_loader, train=True):

# 3. backward and optimization only in train
if train:
self.optim.zero_grad()
self.optim_schedule.zero_grad()
loss.backward()
self.optim.step()
self.optim_schedule.step_and_update_lr()

# next sentence prediction accuracy
correct = next_sent_output.argmax(dim=-1).eq(data["is_next"]).sum().item()
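
Two behavioral changes here: the optimizer is now driven through `ScheduledOptim` (warmup plus decay instead of a fixed Adam learning rate), and `nn.DataParallel` is applied only when CUDA is requested, with the new `cuda_devices` list passed through as `device_ids`. A small sketch of that DataParallel pattern in isolation; the linear layer and the device list are placeholders:

```python
import torch
import torch.nn as nn

model = nn.Linear(256, 2)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    # replicate the model only onto the listed GPUs, mirroring --cuda_devices
    model = nn.DataParallel(model, device_ids=[0, 1])
model = model.to(device)
```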
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,3 +1,3 @@
tqdm
numpy
torch>=0.4.0
torch>=0.4.0
4 changes: 2 additions & 2 deletions setup.py
@@ -3,10 +3,10 @@
import os
import sys

__version__ = "0.0.1a3"
__version__ = "0.0.1a4"

with open("requirements.txt") as f:
require_packages = [line[:-1] for line in f]
require_packages = [line[:-1] if line[-1] == "\n" else line for line in f]

with open("README.md", "r", encoding="utf-8") as f:
long_description = f.read()
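
The setup.py tweak handles a requirements.txt whose final line lacks a trailing newline; the old `line[:-1]` would otherwise drop the last character of the last requirement. A short illustration, with a list literal standing in for iterating the file:

```python
lines = ["tqdm\n", "numpy\n", "torch>=0.4.0"]  # last line without a trailing newline
before = [line[:-1] for line in lines]
after = [line[:-1] if line[-1] == "\n" else line for line in lines]
print(before)  # ['tqdm', 'numpy', 'torch>=0.4.']  -- trailing '0' lost
print(after)   # ['tqdm', 'numpy', 'torch>=0.4.0']
```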
