add readme.md

Hiroshiba · Jun 4, 2019 · 55c5280 · 55c5280
1 parent 3104d5a
commit 55c5280
Show file tree

Hide file tree

Showing 7 changed files with 182 additions and 9 deletions.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2019 Kazuyuki Hiroshiba.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/readme.md b/readme.md
@@ -0,0 +1,90 @@
+# yukarin
+ディープラーニング声質変換の第１段階モデルの学習コード。
+
+## 推奨環境
+* Unix系のPython3.6.3
+
+## 準備
+### 必要なライブラリのインストール
+```bash
+pip install -r requirements.txt
+```
+
+### コードの実行方法（予備知識）
+このリポジトリのコードを実行するには、`yukarin`ライブラリをパス（PYTHONPATH）に通す必要があります。
+例えば`scripts/foo.py`を実行するには、以下のように書いて、パスを通します。
+
+```bash
+PYTHONPATH=`pwd` python scripts/foo.py
+```
+
+## データ作成
+1. 音声データを用意する
+入力音声データと、目標音声データを大量に用意し、別々のディレクトリ（例：`input_wav`と`target_wav`）に配置します。
+ファイル名は揃えるか、もしくは[glob](https://docs.python.org/ja/3/library/glob.html)の順序が同じになるようにします。
+
+2. 音響特徴量を切り出す
+入力と目標の音声データそれぞれの音響特徴量ファイルを出力します。
+以下は入力側の例です。目標側も同様に、`target_wav`から`target_feature`へ出力します。
+
+```bash
+python scripts/extract_acoustic_feature.py \
+    -i './input_wav/*' \
+    -o './input_feature/'
+```
+
+3. データを揃える（アライメントする）
+入力と目標の音声データを時間方向に揃えます。
+次の例では、`input_feature`と`target_feature`のアライメントデータを`aligned_indexes`に出力します。
+
+```bash
+python scripts/extract_align_indexes.py \
+    -i1 './input_feature/*' \
+    -i2 './target_feature/*' \
+    -o './aligned_indexes/'
+```
+
+4. 周波数の統計量を求める
+声の高さの変換に必要な、周波数の統計量を入力・目標音声データそれぞれに対して求めます。
+以下は入力側の例です。目標側も同様に、`target_feature`から`target_statistics.npy`を出力します。
+
+```bash
+python scripts/extract_f0_statistics.py \
+    -i './input_feature/*' \
+    -o './input_statistics.npy'
+```
+
+## 学習
+1. 学習用の設定ファイル`config.json`を作る
+`sample_config.json`の`input_glob`、`target_glob`、`indexes_glob`を変更すればとりあえず動きます。
+
+2. 学習する
+
+```bash
+python scripts/train.py \
+    config.json \
+    ./model_stage1/
+```
+
+3. 第２段階モデルを学習する
+[become-yukarin](https://github.com/Hiroshiba/become-yukarin)の[第２段階の学習](https://github.com/Hiroshiba/become-yukarin#%E7%AC%AC%EF%BC%92%E6%AE%B5%E9%9A%8E%E3%81%AE%E5%AD%A6%E7%BF%92)を参考に、
+第２段階モデルを学習します。
+
+## テスト
+テスト用の入力音声データをディレクトリ（例：`test_wav`）に配置し、`voice_change.py`を実行します。
+
+```bash
+python scripts/voice_change.py \
+    --voice_changer_model_dir './model_stage1' \
+    --voice_changer_config './model_stage1/config.json' \
+    --super_resolution_model './model_stage2/' \
+    --super_resolution_config './model_stage2/config.json' \
+    --input_statistics 'input_statistics.npy' \
+    --target_statistics 'target_statistics.npy' \
+    --out_sampling_rate 24000 \
+    --disable_dataset_test \
+    --dataset_target_wave_dir '' \
+    --test_wave_dir './test_wav' \
+    --output_dir './output/'
+```
+
+## License
+[MIT License](./LICENSE)
diff --git a/requiremets.txt b/requiremets.txt
@@ -1,13 +1,14 @@
 numpy
-cupy
-chainer
-librosa
+cupy<6.0.0
+chainer<6.0.0
+librosa<0.7.0
 pysptk
 pyworld
 fastdtw
 matplotlib
 chainerui
 tensorflow
 pillow
+tqdm
 git+https://github.com/neka-nat/tensorboard-chainer
 git+https://github.com/Hiroshiba/become-yukarin
diff --git a/sample_config.json b/sample_config.json
@@ -0,0 +1,62 @@
+{
+  "dataset": {
+    "acoustic_param": {
+      "alpha": 0.410,
+      "dtype": "float32",
+      "f0_ceil": 800,
+      "f0_floor": 71,
+      "fft_length": 1024,
+      "frame_period": 5,
+      "order": 8,
+      "pad_second": 0,
+      "sampling_rate": 24000,
+      "threshold_db": 25
+    },
+    "input_glob": "./input_feature/*.npy",
+    "target_glob": "./target_feature/*.npy",
+    "indexes_glob": "./aligned_indexes/*.npy",
+    "in_features": [
+      "mc"
+    ],
+    "out_features": [
+      "mc"
+    ],
+    "train_crop_size": 512,
+    "input_global_noise": 0.01,
+    "input_local_noise": 0.01,
+    "target_global_noise": 0.01,
+    "target_local_noise": 0.01,
+    "seed": 0,
+    "num_test": 5
+  },
+  "model": {
+    "in_channels": 9,
+    "out_channels": 9,
+    "generator_base_channels": 8,
+    "generator_extensive_layers": 8,
+    "discriminator_base_channels": 1,
+    "discriminator_extensive_layers": 5,
+    "weak_discriminator": true
+  },
+  "loss": {
+    "adversarial": 0,
+    "mse": 100
+  },
+  "project": {
+    "name": "",
+    "tags": []
+  },
+  "train": {
+    "batchsize": 8,
+    "gpu": 0,
+    "log_iteration": 250,
+    "snapshot_iteration": 10000,
+    "stop_iteration": null,
+    "optimizer": {
+      "alpha": 0.0002,
+      "beta1": 0.5,
+      "beta2": 0.999,
+      "name": "Adam"
+    }
+  }
+}
diff --git a/setup.py b/setup.py
@@ -7,19 +7,20 @@
     url='https://github.com/Hiroshiba/yukarin',
     author='Kazuyuki Hiroshiba',
     author_email='[email protected]',
-    description='Everyone become Yuduki Yukari with DeepLearning power.',
+    description='Everyone become Yuzuki Yukari with DeepLearning power.',
     license='MIT License',
     install_requires=[
         'numpy',
-        'chainer',
-        'librosa',
+        'chainer<6.0.0',
+        'librosa<0.7.0',
         'pysptk',
         'pyworld',
         'fastdtw',
     ],
     classifiers=[
         'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
         'License :: OSI Approved :: MIT License',
     ]
 )
diff --git a/yukarin/acoustic_feature.py b/yukarin/acoustic_feature.py
@@ -141,7 +141,7 @@ def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
         mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
         coded_ap = pyworld.code_aperiodicity(ap, fs)
         voiced: numpy.ndarray = ~(f0 == 0)
-            
+
         if len(x) % fft_length > 0:
             f0 = f0[:-1]
             t = t[:-1]
@@ -214,7 +214,7 @@ def save(self, path: Path, ignores: Iterable[str] = None):
 
     @staticmethod
     def load(path: Path):
-        d: Dict = numpy.load(path).item()
+        d: Dict = numpy.load(path, allow_pickle=True).item()
         return AcousticFeature(**d)
 
     @staticmethod

diff --git a/yukarin/align_indexes.py b/yukarin/align_indexes.py
@@ -59,7 +59,7 @@ def save(
 
     @staticmethod
     def load(path: Path):
-        d = numpy.load(path).item()  # type: dict
+        d = numpy.load(path, allow_pickle=True).item()  # type: dict
         feature = AlignIndexes(
             feature1=d['feature1'],
             feature2=d['feature2'],