
Commit 2f687e4

Revert "change g2p and other fix" (fishaudio#40)
1 parent c3df0ba · commit 2f687e4

File tree: 11 files changed, +253 −75 lines


bert/bert-base-japanese-v3/README.md

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+---
+license: apache-2.0
+datasets:
+- cc100
+- wikipedia
+language:
+- ja
+widget:
+- text: 東北大学で[MASK]の研究をしています。
+---
+
+# BERT base Japanese (unidic-lite with whole word masking, CC-100 and jawiki-20230102)
+
+This is a [BERT](https://github.com/google-research/bert) model pretrained on texts in the Japanese language.
+
+This version of the model processes input texts with word-level tokenization based on the Unidic 2.1.2 dictionary (available in [unidic-lite](https://pypi.org/project/unidic-lite/) package), followed by the WordPiece subword tokenization.
+Additionally, the model is trained with the whole word masking enabled for the masked language modeling (MLM) objective.
+
+The codes for the pretraining are available at [cl-tohoku/bert-japanese](https://github.com/cl-tohoku/bert-japanese/).
+
+## Model architecture
+
+The model architecture is the same as the original BERT base model; 12 layers, 768 dimensions of hidden states, and 12 attention heads.
+
+## Training Data
+
+The model is trained on the Japanese portion of [CC-100 dataset](https://data.statmt.org/cc-100/) and the Japanese version of Wikipedia.
+For Wikipedia, we generated a text corpus from the [Wikipedia Cirrussearch dump file](https://dumps.wikimedia.org/other/cirrussearch/) as of January 2, 2023.
+The corpus files generated from CC-100 and Wikipedia are 74.3GB and 4.9GB in size and consist of approximately 392M and 34M sentences, respectively.
+
+For the purpose of splitting texts into sentences, we used [fugashi](https://github.com/polm/fugashi) with [mecab-ipadic-NEologd](https://github.com/neologd/mecab-ipadic-neologd) dictionary (v0.0.7).
+
+## Tokenization
+
+The texts are first tokenized by MeCab with the Unidic 2.1.2 dictionary and then split into subwords by the WordPiece algorithm.
+The vocabulary size is 32768.
+
+We used [fugashi](https://github.com/polm/fugashi) and [unidic-lite](https://github.com/polm/unidic-lite) packages for the tokenization.
+
+## Training
+
+We trained the model first on the CC-100 corpus for 1M steps and then on the Wikipedia corpus for another 1M steps.
+For training of the MLM (masked language modeling) objective, we introduced whole word masking in which all of the subword tokens corresponding to a single word (tokenized by MeCab) are masked at once.
+
+For training of each model, we used a v3-8 instance of Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/).
+
+## Licenses
+
+The pretrained models are distributed under the Apache License 2.0.
+
+## Acknowledgments
+
+This model is trained with Cloud TPUs provided by [TPU Research Cloud](https://sites.research.google/trc/about/) program.

bert/bert-large-japanese-v2/config.json renamed to bert/bert-base-japanese-v3/config.json

Lines changed: 4 additions & 4 deletions
@@ -5,14 +5,14 @@
   "attention_probs_dropout_prob": 0.1,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
-  "hidden_size": 1024,
+  "hidden_size": 768,
   "initializer_range": 0.02,
-  "intermediate_size": 4096,
+  "intermediate_size": 3072,
   "layer_norm_eps": 1e-12,
   "max_position_embeddings": 512,
   "model_type": "bert",
-  "num_attention_heads": 16,
-  "num_hidden_layers": 24,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
   "pad_token_id": 0,
   "type_vocab_size": 2,
   "vocab_size": 32768
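The rename swaps the large-model hyperparameters for the base ones (hidden_size 768, 12 heads, 12 layers), which is why the ja_bert buffers and the ja_bert_proj input channels change from 1024 to 768 in data_utils.py and models.py below. A small sanity-check sketch, not part of the commit (path assumed):

```python
# Hedged sketch: confirm the vendored Japanese BERT config matches the
# 768-channel buffers used elsewhere in this commit.
from transformers import AutoConfig

ja_cfg = AutoConfig.from_pretrained("./bert/bert-base-japanese-v3")  # assumed path
assert ja_cfg.hidden_size == 768
assert ja_cfg.num_attention_heads == 12
assert ja_cfg.num_hidden_layers == 12
```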

bert/bert-large-japanese-v2/tokenizer_config.json

Lines changed: 0 additions & 10 deletions
This file was deleted.

data_utils.py

Lines changed: 4 additions & 10 deletions
@@ -154,13 +154,13 @@ def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
 
         if language_str == "ZH":
             bert = bert
-            ja_bert = torch.zeros(1024, len(phone))
+            ja_bert = torch.zeros(768, len(phone))
         elif language_str == "JA":
             ja_bert = bert
             bert = torch.zeros(1024, len(phone))
         else:
             bert = torch.zeros(1024, len(phone))
-            ja_bert = torch.zeros(1024, len(phone))
+            ja_bert = torch.zeros(768, len(phone))
         assert bert.shape[-1] == len(phone), (
             bert.shape,
             len(phone),
@@ -208,13 +208,7 @@ def __call__(self, batch):
             torch.LongTensor([x[1].size(1) for x in batch]), dim=0, descending=True
         )
 
-        max_text_len = max(
-            [
-                batch[ids_sorted_decreasing[i]][7].size(1)
-                for i in range(len(ids_sorted_decreasing))
-            ]
-            + [len(x[0]) for x in batch]
-        )
+        max_text_len = max([len(x[0]) for x in batch])
         max_spec_len = max([x[1].size(1) for x in batch])
         max_wav_len = max([x[2].size(1) for x in batch])
 
@@ -227,7 +221,7 @@ def __call__(self, batch):
         tone_padded = torch.LongTensor(len(batch), max_text_len)
         language_padded = torch.LongTensor(len(batch), max_text_len)
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
-        ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
+        ja_bert_padded = torch.FloatTensor(len(batch), 768, max_text_len)
 
         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
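For context on the collate change: per-sample ja_bert features are now (768, T) instead of (1024, T), and max_text_len is taken from the phone sequences alone. Below is a minimal, self-contained sketch of the left-aligned zero-padding pattern the collate function relies on; it uses toy shapes and is not the project's real batch layout.

```python
# Hedged sketch of the zero-padding applied to ja_bert features.
import torch

def pad_ja_bert(features):
    """features: list of (768, T_i) tensors; returns (B, 768, max_T)."""
    max_len = max(f.size(1) for f in features)
    padded = torch.zeros(len(features), 768, max_len)
    for i, f in enumerate(features):
        padded[i, :, : f.size(1)] = f  # copy into the left-aligned slice
    return padded

batch = [torch.randn(768, 7), torch.randn(768, 11)]
print(pad_ja_bert(batch).shape)  # torch.Size([2, 768, 11])
```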

models.py

Lines changed: 1 addition & 1 deletion
@@ -340,7 +340,7 @@ def __init__(
         self.language_emb = nn.Embedding(num_languages, hidden_channels)
         nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
         self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
-        self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        self.ja_bert_proj = nn.Conv1d(768, hidden_channels, 1)
 
         self.encoder = attentions.Encoder(
             hidden_channels,
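The 1×1 Conv1d acts as a per-phone linear projection, so its input channels must match the BERT hidden size feeding it: 1024 for bert_proj and now 768 for ja_bert_proj. A standalone sketch of that shape relationship (hidden_channels = 192 is an assumption for illustration, not read from this diff):

```python
# Hedged sketch: project frame-aligned BERT features to the encoder width.
import torch
import torch.nn as nn

hidden_channels = 192  # assumed value, for illustration only
ja_bert_proj = nn.Conv1d(768, hidden_channels, 1)

ja_bert = torch.randn(2, 768, 50)   # (batch, bert hidden size, phone length)
print(ja_bert_proj(ja_bert).shape)  # torch.Size([2, 192, 50])
```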

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ pypinyin
 cn2an
 gradio
 av
-pyopenjtalk
+mecab-python3
 loguru
 unidic-lite
 cmudict

text/__init__.py

Lines changed: 1 addition & 6 deletions
@@ -11,12 +11,7 @@ def cleaned_text_to_sequence(cleaned_text, tones, language):
     Returns:
         List of integers corresponding to the symbols in the text
     """
-    phones = []  # _symbol_to_id[symbol] for symbol in cleaned_text
-    for symbol in cleaned_text:
-        try:
-            phones.append(_symbol_to_id[symbol])
-        except KeyError:
-            phones.append(0)  # symbol not found in ID map, use 0('_') by default
+    phones = [_symbol_to_id[symbol] for symbol in cleaned_text]
     tone_start = language_tone_start_map[language]
     tones = [i + tone_start for i in tones]
     lang_id = language_id_map[language]
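The revert replaces the KeyError fallback with a plain list comprehension, so an out-of-vocabulary symbol now raises instead of silently mapping to ID 0 ('_'). A toy illustration of the difference follows; the symbol table here is hypothetical, not the project's real `symbols` list.

```python
# Hedged sketch with a made-up symbol table; only the lookup behavior differs.
_symbol_to_id = {"_": 0, "a": 1, "b": 2}

def to_ids_strict(cleaned_text):
    # reverted behavior: unknown symbols raise KeyError
    return [_symbol_to_id[symbol] for symbol in cleaned_text]

def to_ids_with_fallback(cleaned_text):
    # removed behavior: unknown symbols fall back to 0 ('_')
    return [_symbol_to_id.get(symbol, 0) for symbol in cleaned_text]

print(to_ids_with_fallback("abx"))  # [1, 2, 0]
try:
    to_ids_strict("abx")
except KeyError as err:
    print("strict lookup raised KeyError for", err)
```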
