forked from TensorSpeech/TensorFlowASR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
generate_vocab_sentencepiece.py
32 lines (21 loc) · 985 Bytes
/
generate_vocab_sentencepiece.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# Script: build a subword vocabulary via SentencePieceFeaturizer.build_from_corpus,
# driven by the decoder section of a model configuration file.
#
# Command-line options:
#   --config   path to the model configuration file
#              (default: "config.yml" in this script's directory)
#   --devices  zero or more integer device ids (default: [0])
#
# NOTE: statement order matters here. Imports are deliberately interleaved
# with setup calls, so do not hoist them all to the top of the file.

import argparse
import os

from tensorflow_asr.utils.env_util import setup_environment, setup_strategy

# Called before TensorFlow is imported; its return value is used as the logger.
# NOTE(review): presumably this configures environment/logging that must be in
# place before the TensorFlow import — confirm against env_util.
logger = setup_environment()

import tensorflow as tf  # noqa: E402  (must follow setup_environment())

# Fallback configuration file living alongside this script.
DEFAULT_YAML = os.path.join(
    os.path.abspath(os.path.dirname(__file__)),
    "config.yml",
)

tf.keras.backend.clear_session()

parser = argparse.ArgumentParser(prog="Vocab Training with SentencePiece")
parser.add_argument(
    "--config",
    default=DEFAULT_YAML,
    type=str,
    help="The file path of model configuration file",
)
parser.add_argument(
    "--devices",
    default=[0],
    nargs="*",
    type=int,
    help="Devices' ids to apply distributed training",
)
args = parser.parse_args()

# The returned strategy is not referenced again in this script; the call is
# kept for whatever device setup it performs (not visible from here).
strategy = setup_strategy(args.devices)

# Imported only after the environment/strategy setup above, as in the
# original statement order.
from tensorflow_asr.configs.config import Config  # noqa: E402
from tensorflow_asr.featurizers.text_featurizers import SentencePieceFeaturizer  # noqa: E402

config = Config(args.config)

logger.info("Generating subwords ...")
# Only the decoder section of the config is handed to the featurizer builder.
# The resulting featurizer is bound but not used further in this script.
text_featurizer = SentencePieceFeaturizer.build_from_corpus(config.decoder_config)