From b562b12cb912567007e4453112f8a6961a563665 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:10:31 +0900 Subject: [PATCH 01/65] replace i18n --- i18n.py | 2 +- i18n/i18n.py | 28 ++++++++++++++++++++++++++++ {lib/i18n => i18n/locale}/en_US.json | 0 {lib/i18n => i18n/locale}/es_ES.json | 0 {lib/i18n => i18n/locale}/it_IT.json | 0 {lib/i18n => i18n/locale}/ja_JP.json | 0 {lib/i18n => i18n/locale}/ru_RU.json | 0 {lib/i18n => i18n/locale}/tr_TR.json | 0 {lib/i18n => i18n/locale}/zh_CN.json | 0 {lib/i18n => i18n/locale}/zh_HK.json | 0 {lib/i18n => i18n/locale}/zh_SG.json | 0 {lib/i18n => i18n/locale}/zh_TW.json | 0 {lib/i18n => i18n}/locale_diff.py | 2 +- 13 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 i18n/i18n.py rename {lib/i18n => i18n/locale}/en_US.json (100%) rename {lib/i18n => i18n/locale}/es_ES.json (100%) rename {lib/i18n => i18n/locale}/it_IT.json (100%) rename {lib/i18n => i18n/locale}/ja_JP.json (100%) rename {lib/i18n => i18n/locale}/ru_RU.json (100%) rename {lib/i18n => i18n/locale}/tr_TR.json (100%) rename {lib/i18n => i18n/locale}/zh_CN.json (100%) rename {lib/i18n => i18n/locale}/zh_HK.json (100%) rename {lib/i18n => i18n/locale}/zh_SG.json (100%) rename {lib/i18n => i18n/locale}/zh_TW.json (100%) rename {lib/i18n => i18n}/locale_diff.py (98%) diff --git a/i18n.py b/i18n.py index d64f2ea..28b17c7 100644 --- a/i18n.py +++ b/i18n.py @@ -4,7 +4,7 @@ import os def load_language_list(language): - with open(f"./lib/i18n/{language}.json", "r", encoding="utf-8") as f: + with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: language_list = json.load(f) return language_list diff --git a/i18n/i18n.py b/i18n/i18n.py new file mode 100644 index 0000000..28b17c7 --- /dev/null +++ b/i18n/i18n.py @@ -0,0 +1,28 @@ +import locale +import json +import os + + +def load_language_list(language): + with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: + language_list = json.load(f) + return language_list + + +class I18nAuto: + def __init__(self, language=None): + if language in ["Auto", None]: + language = locale.getdefaultlocale()[ + 0 + ] # getlocale can't identify the system's language ((None, None)) + if not os.path.exists(f"./lib/i18n/{language}.json"): + language = "en_US" + self.language = language + # print("Use Language:", language) + self.language_map = load_language_list(language) + + def __call__(self, key): + return self.language_map.get(key, key) + + def print(self): + print("Use Language:", self.language) diff --git a/lib/i18n/en_US.json b/i18n/locale/en_US.json similarity index 100% rename from lib/i18n/en_US.json rename to i18n/locale/en_US.json diff --git a/lib/i18n/es_ES.json b/i18n/locale/es_ES.json similarity index 100% rename from lib/i18n/es_ES.json rename to i18n/locale/es_ES.json diff --git a/lib/i18n/it_IT.json b/i18n/locale/it_IT.json similarity index 100% rename from lib/i18n/it_IT.json rename to i18n/locale/it_IT.json diff --git a/lib/i18n/ja_JP.json b/i18n/locale/ja_JP.json similarity index 100% rename from lib/i18n/ja_JP.json rename to i18n/locale/ja_JP.json diff --git a/lib/i18n/ru_RU.json b/i18n/locale/ru_RU.json similarity index 100% rename from lib/i18n/ru_RU.json rename to i18n/locale/ru_RU.json diff --git a/lib/i18n/tr_TR.json b/i18n/locale/tr_TR.json similarity index 100% rename from lib/i18n/tr_TR.json rename to i18n/locale/tr_TR.json diff --git a/lib/i18n/zh_CN.json b/i18n/locale/zh_CN.json similarity index 100% rename from lib/i18n/zh_CN.json rename to i18n/locale/zh_CN.json diff --git 
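For orientation, a minimal usage sketch of the relocated I18nAuto class added in i18n/i18n.py above; the key passed to the instance is a hypothetical placeholder, and the working directory is assumed to be the repository root so that ./i18n/locale/<lang>.json resolves. Note that the new file's fallback check still probes the old ./lib/i18n/{language}.json path even though load_language_list reads from ./i18n/locale/, so the en_US fallback keys off a path this same patch renames.

# usage sketch (not part of the patch); run from the repository root
from i18n.i18n import I18nAuto

i18n = I18nAuto()   # language=None -> locale.getdefaultlocale(), then "en_US" fallback
i18n.print()        # prints "Use Language: <language>"

# __call__ returns the mapped string, or the key itself when no translation exists
label = i18n("hypothetical.ui.key")
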
a/lib/i18n/zh_HK.json b/i18n/locale/zh_HK.json similarity index 100% rename from lib/i18n/zh_HK.json rename to i18n/locale/zh_HK.json diff --git a/lib/i18n/zh_SG.json b/i18n/locale/zh_SG.json similarity index 100% rename from lib/i18n/zh_SG.json rename to i18n/locale/zh_SG.json diff --git a/lib/i18n/zh_TW.json b/i18n/locale/zh_TW.json similarity index 100% rename from lib/i18n/zh_TW.json rename to i18n/locale/zh_TW.json diff --git a/lib/i18n/locale_diff.py b/i18n/locale_diff.py similarity index 98% rename from lib/i18n/locale_diff.py rename to i18n/locale_diff.py index 2572779..196829b 100644 --- a/lib/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -6,7 +6,7 @@ from collections import OrderedDict standard_file = "zh_CN.json" # Find all JSON files in the directory -dir_path = "./" +dir_path = "i18n/locale" languages = [ f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file ] From 43d109e9c47a2024eac2e2efdbef2f6897827bf6 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sat, 19 Aug 2023 10:17:30 +0000 Subject: [PATCH 02/65] Apply Code Formatter Change --- config.py | 35 +++++++++++++++++++++++++++++------ infer-web.py | 5 ++++- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/config.py b/config.py index 0261b02..53f8135 100644 --- a/config.py +++ b/config.py @@ -148,28 +148,51 @@ class Config: x_max = 32 if self.dml: print("use DirectML instead") - if(os.path.exists("runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll")==False): + if ( + os.path.exists( + "runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll" + ) + == False + ): try: - os.rename("runtime\Lib\site-packages\onnxruntime", "runtime\Lib\site-packages\onnxruntime-cuda") + os.rename( + "runtime\Lib\site-packages\onnxruntime", + "runtime\Lib\site-packages\onnxruntime-cuda", + ) except: pass try: - os.rename("runtime\Lib\site-packages\onnxruntime-dml", "runtime\Lib\site-packages\onnxruntime") + os.rename( + "runtime\Lib\site-packages\onnxruntime-dml", + "runtime\Lib\site-packages\onnxruntime", + ) except: pass import torch_directml + self.device = torch_directml.device(torch_directml.default_device()) self.is_half = False else: if self.instead: print(f"use {self.instead} instead") - if(os.path.exists("runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll")==False): + if ( + os.path.exists( + "runtime\Lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" + ) + == False + ): try: - os.rename("runtime\Lib\site-packages\onnxruntime", "runtime\Lib\site-packages\onnxruntime-dml") + os.rename( + "runtime\Lib\site-packages\onnxruntime", + "runtime\Lib\site-packages\onnxruntime-dml", + ) except: pass try: - os.rename("runtime\Lib\site-packages\onnxruntime-cuda", "runtime\Lib\site-packages\onnxruntime") + os.rename( + "runtime\Lib\site-packages\onnxruntime-cuda", + "runtime\Lib\site-packages\onnxruntime", + ) except: pass return x_pad, x_query, x_center, x_max diff --git a/infer-web.py b/infer-web.py index 742299b..c5fd117 100644 --- a/infer-web.py +++ b/infer-web.py @@ -239,7 +239,10 @@ def vc_single( times[0], times[1], times[2], - ), (resample_sr if resample_sr >= 16000 and tgt_sr != resample_sr else tgt_sr, audio_opt) + ), ( + resample_sr if resample_sr >= 16000 and tgt_sr != resample_sr else tgt_sr, + audio_opt, + ) except: info = traceback.format_exc() print(info) From 78dae1dd096f8a39a4e1a086da0b6699b0c6f1fa Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:36:15 +0900 Subject: [PATCH 03/65] replace configs --- config.py => configs/config.py | 12 
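The reformatted block in config.py activates the matching onnxruntime build by renaming directories under runtime\Lib\site-packages. A condensed sketch of that swap as a standalone helper; the function name and its two parameters are illustrative, not part of the patch:

# hypothetical helper condensing the swap performed in config.py (PATCH 02/65)
import os


def swap_onnxruntime(want: str, probe_dll: str) -> None:
    # want:      "dml" or "cuda" - which bundled onnxruntime flavour to activate
    # probe_dll: DLL whose absence signals that the wrong build is currently active
    base = r"runtime\Lib\site-packages"
    active = os.path.join(base, "onnxruntime")
    if os.path.exists(os.path.join(active, "capi", probe_dll)):
        return  # desired build already active
    other = "cuda" if want == "dml" else "dml"
    for src, dst in (
        (active, f"{active}-{other}"),   # park the currently active build
        (f"{active}-{want}", active),    # promote the desired build
    ):
        try:
            os.rename(src, dst)
        except OSError:
            pass  # mirror the patch's bare try/except: ignore missing directories


# DirectML path:   swap_onnxruntime("dml", "DirectML.dll")
# CUDA/other path: swap_onnxruntime("cuda", "onnxruntime_providers_cuda.dll")
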
++++++------ configs/{ => v1}/32k.json | 2 +- configs/{ => v1}/40k.json | 2 +- configs/{ => v1}/48k.json | 2 +- configs/{32k_v2.json => v2/32k.json} | 2 +- configs/{48k_v2.json => v2/48k.json} | 2 +- 6 files changed, 11 insertions(+), 11 deletions(-) rename config.py => configs/config.py (96%) rename configs/{ => v1}/32k.json (97%) rename configs/{ => v1}/40k.json (97%) rename configs/{ => v1}/48k.json (97%) rename configs/{32k_v2.json => v2/32k.json} (97%) rename configs/{48k_v2.json => v2/48k.json} (97%) diff --git a/config.py b/configs/config.py similarity index 96% rename from config.py rename to configs/config.py index 0261b02..a539478 100644 --- a/config.py +++ b/configs/config.py @@ -7,11 +7,11 @@ from multiprocessing import cpu_count def use_fp32_config(): for config_file in [ - "32k.json", - "40k.json", - "48k.json", - "48k_v2.json", - "32k_v2.json", + "v1/32k.json", + "v1/40k.json", + "v1/48k.json", + "v2/48k.json", + "v2/32k.json", ]: with open(f"configs/{config_file}", "r") as f: strr = f.read().replace("true", "false") @@ -148,7 +148,7 @@ class Config: x_max = 32 if self.dml: print("use DirectML instead") - if(os.path.exists("runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll")==False): + if os.path.exists("runtime\Lib\site-packages\onnxruntime\capi\DirectML.dll")==False: try: os.rename("runtime\Lib\site-packages\onnxruntime", "runtime\Lib\site-packages\onnxruntime-cuda") except: diff --git a/configs/32k.json b/configs/v1/32k.json similarity index 97% rename from configs/32k.json rename to configs/v1/32k.json index d5f16d6..400b6be 100644 --- a/configs/32k.json +++ b/configs/v1/32k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/40k.json b/configs/v1/40k.json similarity index 97% rename from configs/40k.json rename to configs/v1/40k.json index 4ffc87b..cb30b8b 100644 --- a/configs/40k.json +++ b/configs/v1/40k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/48k.json b/configs/v1/48k.json similarity index 97% rename from configs/48k.json rename to configs/v1/48k.json index 2d0e05b..6875991 100644 --- a/configs/48k.json +++ b/configs/v1/48k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 11520, "init_lr_ratio": 1, diff --git a/configs/32k_v2.json b/configs/v2/32k.json similarity index 97% rename from configs/32k_v2.json rename to configs/v2/32k.json index 70e534f..36adb8a 100644 --- a/configs/32k_v2.json +++ b/configs/v2/32k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/48k_v2.json b/configs/v2/48k.json similarity index 97% rename from configs/48k_v2.json rename to configs/v2/48k.json index 75f770c..73ee363 100644 --- a/configs/48k_v2.json +++ b/configs/v2/48k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": true, + "fp16_run": false, "lr_decay": 0.999875, "segment_size": 17280, "init_lr_ratio": 1, From 2e2a72f0e530dd2e5e55a19699ca800137f65e7a Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:40:08 +0900 Subject: [PATCH 04/65] replace infer_pack --- {lib => 
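With the JSONs now keyed by model version, callers resolve configs/<version>/<sample_rate>.json instead of the old flat names. An illustrative resolver under that assumption (the helper is not part of the patch; only the v1 32k/40k/48k and v2 32k/48k files shown in the diffstat are assumed to exist):

# illustrative helper matching the configs/v1, configs/v2 layout from PATCH 03/65
import json
import os


def load_train_config(version: str, sample_rate: str) -> dict:
    # version: "v1" or "v2"; sample_rate: "32k", "40k" (v1 only) or "48k"
    path = os.path.join("configs", version, f"{sample_rate}.json")
    if not os.path.exists(path):
        raise FileNotFoundError(f"no training config for {version}/{sample_rate}")
    with open(path, "r") as f:
        return json.load(f)


# cfg = load_train_config("v2", "48k")
# the checked-in JSONs now ship with "fp16_run": false, matching use_fp32_config()
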
infer/lib}/infer_pack/attentions.py | 6 +++--- {lib => infer/lib}/infer_pack/commons.py | 0 {lib => infer/lib}/infer_pack/models.py | 12 ++++++------ {lib => infer/lib}/infer_pack/models_onnx.py | 12 ++++++------ {lib => infer/lib}/infer_pack/modules.py | 6 +++--- .../infer_pack/modules/F0Predictor/DioF0Predictor.py | 2 +- .../infer_pack/modules/F0Predictor/F0Predictor.py | 0 .../modules/F0Predictor/HarvestF0Predictor.py | 2 +- .../infer_pack/modules/F0Predictor/PMF0Predictor.py | 2 +- .../lib}/infer_pack/modules/F0Predictor/__init__.py | 0 {lib => infer/lib}/infer_pack/onnx_inference.py | 0 {lib => infer/lib}/infer_pack/transforms.py | 0 12 files changed, 21 insertions(+), 21 deletions(-) rename {lib => infer/lib}/infer_pack/attentions.py (96%) rename {lib => infer/lib}/infer_pack/commons.py (100%) rename {lib => infer/lib}/infer_pack/models.py (99%) rename {lib => infer/lib}/infer_pack/models_onnx.py (98%) rename {lib => infer/lib}/infer_pack/modules.py (95%) rename {lib => infer/lib}/infer_pack/modules/F0Predictor/DioF0Predictor.py (94%) rename {lib => infer/lib}/infer_pack/modules/F0Predictor/F0Predictor.py (100%) rename {lib => infer/lib}/infer_pack/modules/F0Predictor/HarvestF0Predictor.py (94%) rename {lib => infer/lib}/infer_pack/modules/F0Predictor/PMF0Predictor.py (95%) rename {lib => infer/lib}/infer_pack/modules/F0Predictor/__init__.py (100%) rename {lib => infer/lib}/infer_pack/onnx_inference.py (100%) rename {lib => infer/lib}/infer_pack/transforms.py (100%) diff --git a/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py similarity index 96% rename from lib/infer_pack/attentions.py rename to infer/lib/infer_pack/attentions.py index 84d5c87..fc3538b 100644 --- a/lib/infer_pack/attentions.py +++ b/infer/lib/infer_pack/attentions.py @@ -5,9 +5,9 @@ import torch from torch import nn from torch.nn import functional as F -from lib.infer_pack import commons -from lib.infer_pack import modules -from lib.infer_pack.modules import LayerNorm +from infer.lib.infer_pack import commons +from infer.lib.infer_pack import modules +from infer.lib.infer_pack.modules import LayerNorm class Encoder(nn.Module): diff --git a/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py similarity index 100% rename from lib/infer_pack/commons.py rename to infer/lib/infer_pack/commons.py diff --git a/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py similarity index 99% rename from lib/infer_pack/models.py rename to infer/lib/infer_pack/models.py index 4749738..8c598cf 100644 --- a/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -3,15 +3,15 @@ from time import time as ttime import torch from torch import nn from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding +from infer.lib.infer_pack import modules +from infer.lib.infer_pack import attentions +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import init_weights, get_padding from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights +from infer.lib.infer_pack.commons import init_weights import numpy as np -from lib.infer_pack import commons +from infer.lib.infer_pack import commons class TextEncoder256(nn.Module): diff --git a/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py similarity 
index 98% rename from lib/infer_pack/models_onnx.py rename to infer/lib/infer_pack/models_onnx.py index 963e67b..f4b2a15 100644 --- a/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -3,15 +3,15 @@ from time import time as ttime import torch from torch import nn from torch.nn import functional as F -from lib.infer_pack import modules -from lib.infer_pack import attentions -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding +from infer.lib.infer_pack import modules +from infer.lib.infer_pack import attentions +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import init_weights, get_padding from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from lib.infer_pack.commons import init_weights +from infer.lib.infer_pack.commons import init_weights import numpy as np -from lib.infer_pack import commons +from infer.lib.infer_pack import commons class TextEncoder256(nn.Module): diff --git a/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py similarity index 95% rename from lib/infer_pack/modules.py rename to infer/lib/infer_pack/modules.py index b54dc47..386f7a2 100644 --- a/lib/infer_pack/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -9,9 +9,9 @@ from torch.nn import functional as F from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d from torch.nn.utils import weight_norm, remove_weight_norm -from lib.infer_pack import commons -from lib.infer_pack.commons import init_weights, get_padding -from lib.infer_pack.transforms import piecewise_rational_quadratic_transform +from infer.lib.infer_pack import commons +from infer.lib.infer_pack.commons import init_weights, get_padding +from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform LRELU_SLOPE = 0.1 diff --git a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py similarity index 94% rename from lib/infer_pack/modules/F0Predictor/DioF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py index b5a8e3e..e82a7fe 100644 --- a/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -1,4 +1,4 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import pyworld import numpy as np diff --git a/lib/infer_pack/modules/F0Predictor/F0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py similarity index 100% rename from lib/infer_pack/modules/F0Predictor/F0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/F0Predictor.py diff --git a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py similarity index 94% rename from lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index f8dae30..eb96c52 100644 --- a/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -1,4 +1,4 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import pyworld import numpy as np diff --git a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py 
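Code outside this diff that imported the old package needs the same one-line path update; before/after, using a class that PATCH 07/65 below imports the same way:

# before the move (old layout)
# from lib.infer_pack.models import SynthesizerTrnMs256NSFsid

# after PATCH 04/65 the package lives under infer/lib/
from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid
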
b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py similarity index 95% rename from lib/infer_pack/modules/F0Predictor/PMF0Predictor.py rename to infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py index b70de29..384ff4c 100644 --- a/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -1,4 +1,4 @@ -from lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor import parselmouth import numpy as np diff --git a/lib/infer_pack/modules/F0Predictor/__init__.py b/infer/lib/infer_pack/modules/F0Predictor/__init__.py similarity index 100% rename from lib/infer_pack/modules/F0Predictor/__init__.py rename to infer/lib/infer_pack/modules/F0Predictor/__init__.py diff --git a/lib/infer_pack/onnx_inference.py b/infer/lib/infer_pack/onnx_inference.py similarity index 100% rename from lib/infer_pack/onnx_inference.py rename to infer/lib/infer_pack/onnx_inference.py diff --git a/lib/infer_pack/transforms.py b/infer/lib/infer_pack/transforms.py similarity index 100% rename from lib/infer_pack/transforms.py rename to infer/lib/infer_pack/transforms.py From d9b23e1e8121317acc974368bb402bd09226f15a Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:43:02 +0900 Subject: [PATCH 05/65] replace uvr5_pack --- .../lib}/uvr5_pack/lib_v5/dataset.py | 0 {lib => infer/lib}/uvr5_pack/lib_v5/layers.py | 0 .../lib}/uvr5_pack/lib_v5/layers_123812KB .py | 0 .../lib}/uvr5_pack/lib_v5/layers_123821KB.py | 0 .../lib}/uvr5_pack/lib_v5/layers_33966KB.py | 0 .../lib}/uvr5_pack/lib_v5/layers_537227KB.py | 0 .../lib}/uvr5_pack/lib_v5/layers_537238KB.py | 0 .../lib}/uvr5_pack/lib_v5/layers_new.py | 0 .../lib}/uvr5_pack/lib_v5/model_param_init.py | 0 .../modelparams/1band_sr16000_hl512.json | 0 .../modelparams/1band_sr32000_hl512.json | 0 .../modelparams/1band_sr33075_hl384.json | 0 .../modelparams/1band_sr44100_hl1024.json | 0 .../modelparams/1band_sr44100_hl256.json | 0 .../modelparams/1band_sr44100_hl512.json | 0 .../modelparams/1band_sr44100_hl512_cut.json | 0 .../lib_v5/modelparams/2band_32000.json | 0 .../lib_v5/modelparams/2band_44100_lofi.json | 0 .../lib_v5/modelparams/2band_48000.json | 0 .../lib_v5/modelparams/3band_44100.json | 0 .../lib_v5/modelparams/3band_44100_mid.json | 0 .../lib_v5/modelparams/3band_44100_msb2.json | 0 .../lib_v5/modelparams/4band_44100.json | 0 .../lib_v5/modelparams/4band_44100_mid.json | 0 .../lib_v5/modelparams/4band_44100_msb.json | 0 .../lib_v5/modelparams/4band_44100_msb2.json | 0 .../modelparams/4band_44100_reverse.json | 0 .../lib_v5/modelparams/4band_44100_sw.json | 0 .../lib_v5/modelparams/4band_v2.json | 0 .../lib_v5/modelparams/4band_v2_sn.json | 0 .../lib_v5/modelparams/4band_v3.json | 0 .../lib_v5/modelparams/ensemble.json | 0 {lib => infer/lib}/uvr5_pack/lib_v5/nets.py | 0 .../lib}/uvr5_pack/lib_v5/nets_123812KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_123821KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_33966KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_537227KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_537238KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_61968KB.py | 0 .../lib}/uvr5_pack/lib_v5/nets_new.py | 0 .../lib}/uvr5_pack/lib_v5/spec_utils.py | 0 {lib => infer/lib}/uvr5_pack/name_params.json | 96 +++++++++---------- {lib => infer/lib}/uvr5_pack/utils.py | 2 +- 43 files changed, 49 insertions(+), 49 deletions(-) rename {lib => infer/lib}/uvr5_pack/lib_v5/dataset.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers.py 
(100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_123812KB .py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_123821KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_33966KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_537227KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_537238KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/layers_new.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/model_param_init.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/2band_32000.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/2band_48000.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/3band_44100.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_v2.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/4band_v3.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/modelparams/ensemble.json (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_123812KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_123821KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_33966KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_537227KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_537238KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_61968KB.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/nets_new.py (100%) rename {lib => infer/lib}/uvr5_pack/lib_v5/spec_utils.py (100%) rename {lib => infer/lib}/uvr5_pack/name_params.json (61%) rename {lib => infer/lib}/uvr5_pack/utils.py (97%) diff --git a/lib/uvr5_pack/lib_v5/dataset.py b/infer/lib/uvr5_pack/lib_v5/dataset.py similarity index 100% rename from lib/uvr5_pack/lib_v5/dataset.py rename to infer/lib/uvr5_pack/lib_v5/dataset.py diff --git a/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers.py rename to infer/lib/uvr5_pack/lib_v5/layers.py diff --git a/lib/uvr5_pack/lib_v5/layers_123812KB .py 
b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_123812KB .py rename to infer/lib/uvr5_pack/lib_v5/layers_123812KB .py diff --git a/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_123821KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_123821KB.py diff --git a/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_33966KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_33966KB.py diff --git a/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_537227KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_537227KB.py diff --git a/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_537238KB.py rename to infer/lib/uvr5_pack/lib_v5/layers_537238KB.py diff --git a/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py similarity index 100% rename from lib/uvr5_pack/lib_v5/layers_new.py rename to infer/lib/uvr5_pack/lib_v5/layers_new.py diff --git a/lib/uvr5_pack/lib_v5/model_param_init.py b/infer/lib/uvr5_pack/lib_v5/model_param_init.py similarity index 100% rename from lib/uvr5_pack/lib_v5/model_param_init.py rename to infer/lib/uvr5_pack/lib_v5/model_param_init.py diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json b/infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json similarity index 100% rename from 
lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_32000.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json b/infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/2band_48000.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json diff --git 
a/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v2.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json b/infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/4band_v3.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json diff --git a/lib/uvr5_pack/lib_v5/modelparams/ensemble.json b/infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json similarity index 100% rename from lib/uvr5_pack/lib_v5/modelparams/ensemble.json rename to infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json diff --git a/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets.py rename to infer/lib/uvr5_pack/lib_v5/nets.py diff --git a/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_123812KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_123812KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_123821KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_123821KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_33966KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_33966KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_537227KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_537227KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_537238KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_537238KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_61968KB.py rename to infer/lib/uvr5_pack/lib_v5/nets_61968KB.py diff --git a/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py similarity index 100% rename from lib/uvr5_pack/lib_v5/nets_new.py rename to infer/lib/uvr5_pack/lib_v5/nets_new.py diff --git a/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py similarity index 100% rename from lib/uvr5_pack/lib_v5/spec_utils.py rename to infer/lib/uvr5_pack/lib_v5/spec_utils.py diff --git a/lib/uvr5_pack/name_params.json b/infer/lib/uvr5_pack/name_params.json similarity index 61% rename from lib/uvr5_pack/name_params.json rename to infer/lib/uvr5_pack/name_params.json index 950adcf..8ed51a6 100644 --- a/lib/uvr5_pack/name_params.json +++ b/infer/lib/uvr5_pack/name_params.json @@ -4,92 +4,92 @@ "model_hash_name" : [ { "hash_name": "47939caf0cfe52a0e81442b85b971dfd", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { 
"hash_name": "4e4ecb9764c50a8c414fee6e10395bbe", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "ca106edd563e034bde0bdec4bb7a4b36", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "e60a1e84803ce4efc0a6551206cc4b71", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "a82f14e75892e55e994376edbf0c8435", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "08611fb99bd59eaa79ad27c58d137727", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "5c7bbca45a187e81abbbd351606164e5", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "d6b2cb685a058a091e5e7098192d3233", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" }, { "hash_name": "c1b9f38170a7c90e96f027992eb7c62b", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "c3448ec923fa0edf3d03a19e633faa53", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "68aa2c8093d0080704b200d140f59e54", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "fdc83be5b798e4bd29fe00fe6600e147", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "2ce34bc92fd57f55db16b7a4def3d745", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid.json" }, { "hash_name": "52fdca89576f06cf4340b74a4730ee5f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "41191165b05d38fc77f072fa9e8e8a30", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100.json" }, { "hash_name": "89e83b511ad474592689e562d5b1f80e", 
- "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000.json" }, { "hash_name": "0b954da81d453b716b114d6d7c95177f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000.json" } @@ -97,47 +97,47 @@ "v4 Models": [ { "hash_name": "6a00461c51c2920fd68937d4609ed6c8", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "0ab504864d20f1bd378fe9c81ef37140", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "80ab74d65e515caa3622728d2de07d23", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr32000_hl512" }, { "hash_name": "edc115e7fc523245062200c00caa847f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "b58090534c52cbc3e9b5104bad666ef2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "0cdab9947f1b0928705f518f3c78ea8f", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "ae702fed0238afb5346db8356fe25f13", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ] @@ -148,113 +148,113 @@ "1 Band": [ { "hash_name": "1band_sr16000_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr32000_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json", "param_name": "1band_sr16000_hl512" }, { "hash_name": "1band_sr33075_hl384", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json", "param_name": "1band_sr33075_hl384" }, { "hash_name": "1band_sr44100_hl256", - 
"model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json", "param_name": "1band_sr44100_hl256" }, { "hash_name": "1band_sr44100_hl512", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json", "param_name": "1band_sr44100_hl512" }, { "hash_name": "1band_sr44100_hl1024", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json", "param_name": "1band_sr44100_hl1024" } ], "2 Band": [ { "hash_name": "2band_44100_lofi", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json", "param_name": "2band_44100_lofi" }, { "hash_name": "2band_32000", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json", "param_name": "2band_32000" }, { "hash_name": "2band_48000", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json", "param_name": "2band_48000" } ], "3 Band": [ { "hash_name": "3band_44100", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json", "param_name": "3band_44100" }, { "hash_name": "3band_44100_mid", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json", "param_name": "3band_44100_mid" }, { "hash_name": "3band_44100_msb2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json", "param_name": "3band_44100_msb2" } ], "4 Band": [ { "hash_name": "4band_44100", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json", "param_name": "4band_44100" }, { "hash_name": "4band_44100_mid", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json", "param_name": "4band_44100_mid" }, { "hash_name": "4band_44100_msb", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json", "param_name": "4band_44100_msb" }, { "hash_name": "4band_44100_msb2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json", "param_name": "4band_44100_msb2" }, { "hash_name": "4band_44100_reverse", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json", "param_name": "4band_44100_reverse" }, { "hash_name": "4band_44100_sw", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json", "param_name": "4band_44100_sw" }, { "hash_name": "4band_v2", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", + "model_params": 
"infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json", "param_name": "4band_v2" }, { "hash_name": "4band_v2_sn", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json", "param_name": "4band_v2_sn" }, { "hash_name": "tmodelparam", - "model_params": "lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", + "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json", "param_name": "User Model Param Set" } ] diff --git a/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py similarity index 97% rename from lib/uvr5_pack/utils.py rename to infer/lib/uvr5_pack/utils.py index 0fafe87..a04c001 100644 --- a/lib/uvr5_pack/utils.py +++ b/infer/lib/uvr5_pack/utils.py @@ -4,7 +4,7 @@ from tqdm import tqdm import json -def load_data(file_name: str = "./lib/uvr5_pack/name_params.json") -> dict: +def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: with open(file_name, "r") as f: data = json.load(f) From 6396af82491edc751f14c4369d393ff0331a5d3c Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:46:08 +0900 Subject: [PATCH 06/65] assets --- {pretrained => assets/pretrained}/.gitignore | 0 {pretrained_v2 => assets/pretrained_v2}/.gitignore | 0 {uvr5_weights => assets/uvr5_weights}/.gitignore | 0 {weights => assets/weights}/.gitignore | 0 4 files changed, 0 insertions(+), 0 deletions(-) rename {pretrained => assets/pretrained}/.gitignore (100%) rename {pretrained_v2 => assets/pretrained_v2}/.gitignore (100%) rename {uvr5_weights => assets/uvr5_weights}/.gitignore (100%) rename {weights => assets/weights}/.gitignore (100%) diff --git a/pretrained/.gitignore b/assets/pretrained/.gitignore similarity index 100% rename from pretrained/.gitignore rename to assets/pretrained/.gitignore diff --git a/pretrained_v2/.gitignore b/assets/pretrained_v2/.gitignore similarity index 100% rename from pretrained_v2/.gitignore rename to assets/pretrained_v2/.gitignore diff --git a/uvr5_weights/.gitignore b/assets/uvr5_weights/.gitignore similarity index 100% rename from uvr5_weights/.gitignore rename to assets/uvr5_weights/.gitignore diff --git a/weights/.gitignore b/assets/weights/.gitignore similarity index 100% rename from weights/.gitignore rename to assets/weights/.gitignore From 2e56c5c600fdbaa5cae93dcf543272273e7f148e Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:56:43 +0900 Subject: [PATCH 07/65] vc modules --- infer/modules/vc/__init__.py | 0 infer/modules/vc/modules.py | 219 +++++++++++++++++ infer/modules/vc/pipeline.py | 456 +++++++++++++++++++++++++++++++++++ infer/modules/vc/utils.py | 42 ++++ 4 files changed, 717 insertions(+) create mode 100644 infer/modules/vc/__init__.py create mode 100644 infer/modules/vc/modules.py create mode 100644 infer/modules/vc/pipeline.py create mode 100644 infer/modules/vc/utils.py diff --git a/infer/modules/vc/__init__.py b/infer/modules/vc/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py new file mode 100644 index 0000000..1d967f7 --- /dev/null +++ b/infer/modules/vc/modules.py @@ -0,0 +1,219 @@ +import traceback + +import torch +import soundfile as sf + +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) +from infer.modules.vc.pipeline import Pipeline +from infer.modules.vc.utils import * + + +class VC: + def __init__(self, 
config): + self.n_spk = None + self.tgt_sr = None + self.net_g = None + self.pipeline = None + self.cpt = None + self.version = None + self.if_f0 = None + self.version = None + self.hubert_model = None + + self.config = config + + def get_vc(self, sid, to_return_protect0, to_return_protect1): + person = f'{os.getenv("weight_root")}/{sid}' + print(f'loading {person}') + + self.cpt = torch.load(person, map_location="cpu") + self.tgt_sr = self.cpt["config"][-1] + self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk + self.if_f0 = self.cpt.get("f0", 1) + self.version = self.cpt.get("version", "v1") + + to_return_protect0 = { + "visible": self.if_f0 != 0, + "value": to_return_protect0 if self.if_f0 != 0 else 0.5, + "__type__": "update", + } + to_return_protect1 = { + "visible": self.if_f0 != 0, + "value": to_return_protect1 if self.if_f0 != 0 else 0.33, + "__type__": "update", + } + + synthesizer_class = { + ("v1", 1): SynthesizerTrnMs256NSFsid, + ("v1", 0): SynthesizerTrnMs256NSFsid_nono, + ("v2", 1): SynthesizerTrnMs768NSFsid, + ("v2", 0): SynthesizerTrnMs768NSFsid_nono + } + + self.net_g = synthesizer_class.get((self.version, self.if_f0), SynthesizerTrnMs256NSFsid)(*self.cpt["config"], is_half=self.config.is_half) + + del self.net_g.enc_q + + self.net_g.load_state_dict(self.cpt["weight"], strict=False) + self.net_g.eval().to(self.config.device) + if self.config.is_half: + self.net_g = self.net_g.half() + else: + self.net_g = self.net_g.float() + + self.pipeline = Pipeline(self.tgt_sr, self.config) + n_spk = self.cpt["config"][-3] + index = { + "value": get_index_path_from_model(sid), + "__type__": "update" + } + + return ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index + ) + + def vc_single(self, sid, input_audio_path, f0_up_key, f0_file, f0_method, file_index, file_index2, index_rate, filter_radius, resample_sr, rms_mix_rate, protect): + if input_audio_path is None: + return "You need to upload an audio", None + f0_up_key = int(f0_up_key) + try: + audio = load_audio(input_audio_path, 16000) + audio_max = np.abs(audio).max() / 0.95 + if audio_max > 1: + audio /= audio_max + times = [0, 0, 0] + + if self.hubert_model is None: + self.hubert_model = load_hubert(self.config) + + file_index = ( + ( + file_index.strip(" ") + .strip('"') + .strip("\n") + .strip('"') + .strip(" ") + .replace("trained", "added") + ) + if file_index != "" + else file_index2 + ) # 防止小白写错,自动帮他替换掉 + + audio_opt = Pipeline.pipeline( + self.hubert_model, + self.net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + index_rate, + self.if_f0, + filter_radius, + self.tgt_sr, + resample_sr, + rms_mix_rate, + self.version, + protect, + f0_file, + ) + if self.tgt_sr != resample_sr >= 16000: + self.tgt_sr = resample_sr + index_info = ( + "Using index:%s." % file_index + if os.path.exists(file_index) + else "Index not used." 
+ ) + return f"Success.\n {index_info}\nTime:\n npy:{times[0]}s, f0:{times[1]}s, infer:{times[2]}s", (self.tgt_sr, audio_opt) + except: + info = traceback.format_exc() + print(info) + return info, (None, None) + + def vc_multi( + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1): + try: + dir_path = ( + dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + os.makedirs(opt_root, exist_ok=True) + try: + if dir_path != "": + paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)] + else: + paths = [path.name for path in paths] + except: + traceback.print_exc() + paths = [path.name for path in paths] + infos = [] + for path in paths: + info, opt = self.vc_single( + sid, + path, + f0_up_key, + None, + f0_method, + file_index, + file_index2, + # file_big_npy, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + ) + if "Success" in info: + try: + tgt_sr, audio_opt = opt + if format1 in ["wav", "flac"]: + sf.write( + "%s/%s.%s" % (opt_root, os.path.basename(path), format1), + audio_opt, + tgt_sr, + ) + else: + path = "%s/%s.wav" % (opt_root, os.path.basename(path)) + sf.write( + path, + audio_opt, + tgt_sr, + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format1) + ) + except: + info += traceback.format_exc() + infos.append("%s->%s" % (os.path.basename(path), info)) + yield "\n".join(infos) + yield "\n".join(infos) + except: + yield traceback.format_exc() + \ No newline at end of file diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py new file mode 100644 index 0000000..54bc41d --- /dev/null +++ b/infer/modules/vc/pipeline.py @@ -0,0 +1,456 @@ +import sys +from time import time as ttime + +import numpy as np +import parselmouth +import torch +import torch.nn.functional as F +import pyworld, os, traceback, faiss, librosa, torchcrepe +from scipy import signal +from functools import lru_cache + +now_dir = os.getcwd() +sys.path.append(now_dir) + +bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) + +input_audio_path2wav = {} + + +@lru_cache +def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): + audio = input_audio_path2wav[input_audio_path] + f0, t = pyworld.harvest( + audio, + fs=fs, + f0_ceil=f0max, + f0_floor=f0min, + frame_period=frame_period, + ) + f0 = pyworld.stonemask(audio, f0, t, fs) + return f0 + + +def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 + # print(data1.max(),data2.max()) + rms1 = librosa.feature.rms( + y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 + ) # 每半秒一个点 + rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) + rms1 = torch.from_numpy(rms1) + rms1 = F.interpolate( + rms1.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.from_numpy(rms2) + rms2 = F.interpolate( + rms2.unsqueeze(0), size=data2.shape[0], mode="linear" + ).squeeze() + rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) + data2 *= ( + torch.pow(rms1, torch.tensor(1 - rate)) + * torch.pow(rms2, torch.tensor(rate - 1)) + ).numpy() + return data2 + + +class Pipeline(object): + def __init__(self, tgt_sr, config): + self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( + config.x_pad, + config.x_query, + 
config.x_center, + config.x_max, + config.is_half, + ) + self.sr = 16000 # hubert输入采样率 + self.window = 160 # 每帧点数 + self.t_pad = self.sr * self.x_pad # 每条前后pad时间 + self.t_pad_tgt = tgt_sr * self.x_pad + self.t_pad2 = self.t_pad * 2 + self.t_query = self.sr * self.x_query # 查询切点前后查询时间 + self.t_center = self.sr * self.x_center # 查询切点位置 + self.t_max = self.sr * self.x_max # 免查询时长阈值 + self.device = config.device + + self.model_rmvpe = None + + def get_f0( + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, + ): + global input_audio_path2wav + time_step = self.window / self.sr * 1000 + f0_min = 50 + f0_max = 1100 + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + if f0_method == "pm": + f0 = ( + parselmouth.Sound(x, self.sr) + .to_pitch_ac( + time_step=time_step / 1000, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=f0_max, + ) + .selected_array["frequency"] + ) + pad_size = (p_len - len(f0) + 1) // 2 + if pad_size > 0 or p_len - len(f0) - pad_size > 0: + f0 = np.pad( + f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" + ) + elif f0_method == "harvest": + input_audio_path2wav[input_audio_path] = x.astype(np.double) + f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) + if filter_radius > 2: + f0 = signal.medfilt(f0, 3) + elif f0_method == "crepe": + model = "full" + # Pick a batch size that doesn't cause memory errors on your gpu + batch_size = 512 + # Compute pitch using first gpu + audio = torch.tensor(np.copy(x))[None].float() + f0, pd = torchcrepe.predict( + audio, + self.sr, + self.window, + f0_min, + f0_max, + model, + batch_size=batch_size, + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + elif f0_method == "rmvpe": + if not hasattr(self, "model_rmvpe"): + from infer.lib.rmvpe import RMVPE + + print("loading rmvpe model") + self.model_rmvpe = RMVPE( + "rmvpe.pt", is_half=self.is_half, device=self.device + ) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + + if "privateuseone" in str(self.device): # clean ortruntime memory + del self.model_rmvpe.model + del self.model_rmvpe + print("cleaning ortruntime memory") + + f0 *= pow(2, f0_up_key / 12) + # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + tf0 = self.sr // self.window # 每秒f0点数 + if inp_f0 is not None: + delta_t = np.round( + (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 + ).astype("int16") + replace_f0 = np.interp( + list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] + ) + shape = f0[self.x_pad * tf0: self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0: self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + :shape + ] + # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak # 1-0 + + def vc( + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + ): # ,file_index,file_big_npy + feats = torch.from_numpy(audio0) + if self.is_half: + feats = feats.half() + else: + feats = feats.float() + if feats.dim() == 2: # double channels + 
feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + feats = feats.view(1, -1) + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + + inputs = { + "source": feats.to(self.device), + "padding_mask": padding_mask, + "output_layer": 9 if version == "v1" else 12, + } + t0 = ttime() + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = model.final_proj(logits[0]) if version == "v1" else logits[0] + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = feats.clone() + if ( + not isinstance(index, type(None)) + and not isinstance(big_npy, type(None)) + and index_rate != 0 + ): + npy = feats[0].cpu().numpy() + if self.is_half: + npy = npy.astype("float32") + + # _, I = index.search(npy, 1) + # npy = big_npy[I.squeeze()] + + score, ix = index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + + if self.is_half: + npy = npy.astype("float16") + feats = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate + + (1 - index_rate) * feats + ) + + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + if protect < 0.5 and pitch is not None and pitchf is not None: + feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( + 0, 2, 1 + ) + t1 = ttime() + p_len = audio0.shape[0] // self.window + if feats.shape[1] < p_len: + p_len = feats.shape[1] + if pitch is not None and pitchf is not None: + pitch = pitch[:, :p_len] + pitchf = pitchf[:, :p_len] + + if protect < 0.5 and pitch is not None and pitchf is not None: + pitchff = pitchf.clone() + pitchff[pitchf > 0] = 1 + pitchff[pitchf < 1] = protect + pitchff = pitchff.unsqueeze(-1) + feats = feats * pitchff + feats0 * (1 - pitchff) + feats = feats.to(feats0.dtype) + p_len = torch.tensor([p_len], device=self.device).long() + with torch.no_grad(): + if pitch is not None and pitchf is not None: + audio1 = ( + (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) + .data.cpu() + .float() + .numpy() + ) + else: + audio1 = ( + (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() + ) + del feats, p_len, padding_mask + if torch.cuda.is_available(): + torch.cuda.empty_cache() + t2 = ttime() + times[0] += t1 - t0 + times[2] += t2 - t1 + return audio1 + + def pipeline( + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + # file_big_npy, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, + ): + print(file_index) + if ( + file_index != "" + # and file_big_npy != "" + # and os.path.exists(file_big_npy) == True + and os.path.exists(file_index) + and index_rate != 0 + ): + try: + index = faiss.read_index(file_index) + # big_npy = np.load(file_big_npy) + big_npy = index.reconstruct_n(0, index.ntotal) + except: + traceback.print_exc() + index = big_npy = None + else: + index = big_npy = None + audio = signal.filtfilt(bh, ah, audio) + audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") + opt_ts = [] + if audio_pad.shape[0] > self.t_max: + audio_sum = np.zeros_like(audio) + for i in range(self.window): + audio_sum += audio_pad[i: i - self.window] + for t in range(self.t_center, audio.shape[0], self.t_center): + opt_ts.append( + t + - self.t_query + + np.where( + np.abs(audio_sum[t - self.t_query: t + self.t_query]) + == np.abs(audio_sum[t - self.t_query: t + 
self.t_query]).min() + )[0][0] + ) + s = 0 + audio_opt = [] + t = None + t1 = ttime() + audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") + p_len = audio_pad.shape[0] // self.window + inp_f0 = None + if hasattr(f0_file, "name"): + try: + with open(f0_file.name, "r") as f: + lines = f.read().strip("\n").split("\n") + inp_f0 = [] + for line in lines: + inp_f0.append([float(i) for i in line.split(",")]) + inp_f0 = np.array(inp_f0, dtype="float32") + except: + traceback.print_exc() + sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() + pitch, pitchf = None, None + if if_f0 == 1: + pitch, pitchf = self.get_f0( + input_audio_path, + audio_pad, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0, + ) + pitch = pitch[:p_len] + pitchf = pitchf[:p_len] + if self.device == "mps": + pitchf = pitchf.astype(np.float32) + pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() + pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() + t2 = ttime() + times[1] += t2 - t1 + for t in opt_ts: + t = t // self.window * self.window + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s: t + self.t_pad2 + self.window], + pitch[:, s // self.window: (t + self.t_pad2) // self.window], + pitchf[:, s // self.window: (t + self.t_pad2) // self.window], + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt: -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[s: t + self.t_pad2 + self.window], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt: -self.t_pad_tgt] + ) + s = t + if if_f0 == 1: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + pitch[:, t // self.window:] if t is not None else pitch, + pitchf[:, t // self.window:] if t is not None else pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt: -self.t_pad_tgt] + ) + else: + audio_opt.append( + self.vc( + model, + net_g, + sid, + audio_pad[t:], + None, + None, + times, + index, + big_npy, + index_rate, + version, + protect, + )[self.t_pad_tgt: -self.t_pad_tgt] + ) + audio_opt = np.concatenate(audio_opt) + if rms_mix_rate != 1: + audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) + if tgt_sr != resample_sr >= 16000: + audio_opt = librosa.resample( + audio_opt, orig_sr=tgt_sr, target_sr=resample_sr + ) + audio_max = np.abs(audio_opt).max() / 0.99 + max_int16 = 32768 + if audio_max > 1: + max_int16 /= audio_max + audio_opt = (audio_opt * max_int16).astype(np.int16) + del pitch, pitchf, sid + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return audio_opt + \ No newline at end of file diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py new file mode 100644 index 0000000..9ba2ea5 --- /dev/null +++ b/infer/modules/vc/utils.py @@ -0,0 +1,42 @@ +import os + +import numpy as np +import ffmpeg +from fairseq import checkpoint_utils + + +def get_index_path_from_model(sid): + return next((f for f in [os.path.join(root, name) for root, dirs, files in os.walk(os.getenv("index_root"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if sid.split(".")[0] in f), "") + + +def load_hubert(config): + models, _, _ = checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(config.device) + if config.is_half: + hubert_model 
= hubert_model.half() + else: + hubert_model = hubert_model.float() + return hubert_model.eval() + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. + file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() \ No newline at end of file From c14721d9c317551f5ae318ee466a5ef5c7b2e2af Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sat, 19 Aug 2023 10:57:09 +0000 Subject: [PATCH 08/65] Apply Code Formatter Change --- infer/modules/vc/modules.py | 103 +++++++++++++++++------------ infer/modules/vc/pipeline.py | 121 +++++++++++++++++------------------ infer/modules/vc/utils.py | 20 ++++-- 3 files changed, 137 insertions(+), 107 deletions(-) diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index 1d967f7..f1a96eb 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -24,19 +24,19 @@ class VC: self.if_f0 = None self.version = None self.hubert_model = None - + self.config = config - + def get_vc(self, sid, to_return_protect0, to_return_protect1): person = f'{os.getenv("weight_root")}/{sid}' - print(f'loading {person}') - + print(f"loading {person}") + self.cpt = torch.load(person, map_location="cpu") self.tgt_sr = self.cpt["config"][-1] self.cpt["config"][-3] = self.cpt["weight"]["emb_g.weight"].shape[0] # n_spk self.if_f0 = self.cpt.get("f0", 1) self.version = self.cpt.get("version", "v1") - + to_return_protect0 = { "visible": self.if_f0 != 0, "value": to_return_protect0 if self.if_f0 != 0 else 0.5, @@ -47,16 +47,18 @@ class VC: "value": to_return_protect1 if self.if_f0 != 0 else 0.33, "__type__": "update", } - + synthesizer_class = { ("v1", 1): SynthesizerTrnMs256NSFsid, ("v1", 0): SynthesizerTrnMs256NSFsid_nono, ("v2", 1): SynthesizerTrnMs768NSFsid, - ("v2", 0): SynthesizerTrnMs768NSFsid_nono + ("v2", 0): SynthesizerTrnMs768NSFsid_nono, } - - self.net_g = synthesizer_class.get((self.version, self.if_f0), SynthesizerTrnMs256NSFsid)(*self.cpt["config"], is_half=self.config.is_half) - + + self.net_g = synthesizer_class.get( + (self.version, self.if_f0), SynthesizerTrnMs256NSFsid + )(*self.cpt["config"], is_half=self.config.is_half) + del self.net_g.enc_q self.net_g.load_state_dict(self.cpt["weight"], strict=False) @@ -65,23 +67,34 @@ class VC: self.net_g = self.net_g.half() else: self.net_g = self.net_g.float() - + self.pipeline = Pipeline(self.tgt_sr, self.config) n_spk = self.cpt["config"][-3] - index = { - "value": get_index_path_from_model(sid), - "__type__": "update" - } - + index = {"value": get_index_path_from_model(sid), "__type__": "update"} + return ( {"visible": True, "maximum": n_spk, "__type__": "update"}, to_return_protect0, to_return_protect1, index, - index + index, ) - - def vc_single(self, sid, input_audio_path, f0_up_key, f0_file, f0_method, file_index, file_index2, index_rate, filter_radius, resample_sr, rms_mix_rate, protect): + + def vc_single( + self, + sid, + input_audio_path, + f0_up_key, + f0_file, + f0_method, + file_index, + file_index2, + index_rate, + 
filter_radius, + resample_sr, + rms_mix_rate, + protect, + ): if input_audio_path is None: return "You need to upload an audio", None f0_up_key = int(f0_up_key) @@ -91,10 +104,10 @@ class VC: if audio_max > 1: audio /= audio_max times = [0, 0, 0] - + if self.hubert_model is None: self.hubert_model = load_hubert(self.config) - + file_index = ( ( file_index.strip(" ") @@ -107,7 +120,7 @@ class VC: if file_index != "" else file_index2 ) # 防止小白写错,自动帮他替换掉 - + audio_opt = Pipeline.pipeline( self.hubert_model, self.net_g, @@ -135,28 +148,32 @@ class VC: if os.path.exists(file_index) else "Index not used." ) - return f"Success.\n {index_info}\nTime:\n npy:{times[0]}s, f0:{times[1]}s, infer:{times[2]}s", (self.tgt_sr, audio_opt) + return ( + f"Success.\n {index_info}\nTime:\n npy:{times[0]}s, f0:{times[1]}s, infer:{times[2]}s", + (self.tgt_sr, audio_opt), + ) except: info = traceback.format_exc() print(info) return info, (None, None) - + def vc_multi( - self, - sid, - dir_path, - opt_root, - paths, - f0_up_key, - f0_method, - file_index, - file_index2, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - format1): + self, + sid, + dir_path, + opt_root, + paths, + f0_up_key, + f0_method, + file_index, + file_index2, + index_rate, + filter_radius, + resample_sr, + rms_mix_rate, + protect, + format1, + ): try: dir_path = ( dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") @@ -165,7 +182,9 @@ class VC: os.makedirs(opt_root, exist_ok=True) try: if dir_path != "": - paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)] + paths = [ + os.path.join(dir_path, name) for name in os.listdir(dir_path) + ] else: paths = [path.name for path in paths] except: @@ -193,7 +212,8 @@ class VC: tgt_sr, audio_opt = opt if format1 in ["wav", "flac"]: sf.write( - "%s/%s.%s" % (opt_root, os.path.basename(path), format1), + "%s/%s.%s" + % (opt_root, os.path.basename(path), format1), audio_opt, tgt_sr, ) @@ -216,4 +236,3 @@ class VC: yield "\n".join(infos) except: yield traceback.format_exc() - \ No newline at end of file diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index 54bc41d..3ac47cd 100644 --- a/infer/modules/vc/pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -11,7 +11,7 @@ from functools import lru_cache now_dir = os.getcwd() sys.path.append(now_dir) - + bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) input_audio_path2wav = {} @@ -71,18 +71,18 @@ class Pipeline(object): self.t_center = self.sr * self.x_center # 查询切点位置 self.t_max = self.sr * self.x_max # 免查询时长阈值 self.device = config.device - + self.model_rmvpe = None def get_f0( - self, - input_audio_path, - x, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0=None, + self, + input_audio_path, + x, + p_len, + f0_up_key, + f0_method, + filter_radius, + inp_f0=None, ): global input_audio_path2wav time_step = self.window / self.sr * 1000 @@ -141,12 +141,12 @@ class Pipeline(object): "rmvpe.pt", is_half=self.is_half, device=self.device ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - + if "privateuseone" in str(self.device): # clean ortruntime memory del self.model_rmvpe.model del self.model_rmvpe print("cleaning ortruntime memory") - + f0 *= pow(2, f0_up_key / 12) # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) tf0 = self.sr // self.window # 每秒f0点数 @@ -157,8 +157,8 @@ class Pipeline(object): replace_f0 = np.interp( list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] ) - shape = f0[self.x_pad * tf0: self.x_pad * tf0 + 
len(replace_f0)].shape[0] - f0[self.x_pad * tf0: self.x_pad * tf0 + len(replace_f0)] = replace_f0[ + shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] + f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ :shape ] # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) @@ -173,19 +173,19 @@ class Pipeline(object): return f0_coarse, f0bak # 1-0 def vc( - self, - model, - net_g, - sid, - audio0, - pitch, - pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, + self, + model, + net_g, + sid, + audio0, + pitch, + pitchf, + times, + index, + big_npy, + index_rate, + version, + protect, ): # ,file_index,file_big_npy feats = torch.from_numpy(audio0) if self.is_half: @@ -275,26 +275,26 @@ class Pipeline(object): return audio1 def pipeline( - self, - model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=None, + self, + model, + net_g, + sid, + audio, + input_audio_path, + times, + f0_up_key, + f0_method, + file_index, + # file_big_npy, + index_rate, + if_f0, + filter_radius, + tgt_sr, + resample_sr, + rms_mix_rate, + version, + protect, + f0_file=None, ): print(file_index) if ( @@ -319,14 +319,14 @@ class Pipeline(object): if audio_pad.shape[0] > self.t_max: audio_sum = np.zeros_like(audio) for i in range(self.window): - audio_sum += audio_pad[i: i - self.window] + audio_sum += audio_pad[i : i - self.window] for t in range(self.t_center, audio.shape[0], self.t_center): opt_ts.append( t - self.t_query + np.where( - np.abs(audio_sum[t - self.t_query: t + self.t_query]) - == np.abs(audio_sum[t - self.t_query: t + self.t_query]).min() + np.abs(audio_sum[t - self.t_query : t + self.t_query]) + == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() )[0][0] ) s = 0 @@ -374,16 +374,16 @@ class Pipeline(object): model, net_g, sid, - audio_pad[s: t + self.t_pad2 + self.window], - pitch[:, s // self.window: (t + self.t_pad2) // self.window], - pitchf[:, s // self.window: (t + self.t_pad2) // self.window], + audio_pad[s : t + self.t_pad2 + self.window], + pitch[:, s // self.window : (t + self.t_pad2) // self.window], + pitchf[:, s // self.window : (t + self.t_pad2) // self.window], times, index, big_npy, index_rate, version, protect, - )[self.t_pad_tgt: -self.t_pad_tgt] + )[self.t_pad_tgt : -self.t_pad_tgt] ) else: audio_opt.append( @@ -391,7 +391,7 @@ class Pipeline(object): model, net_g, sid, - audio_pad[s: t + self.t_pad2 + self.window], + audio_pad[s : t + self.t_pad2 + self.window], None, None, times, @@ -400,7 +400,7 @@ class Pipeline(object): index_rate, version, protect, - )[self.t_pad_tgt: -self.t_pad_tgt] + )[self.t_pad_tgt : -self.t_pad_tgt] ) s = t if if_f0 == 1: @@ -410,15 +410,15 @@ class Pipeline(object): net_g, sid, audio_pad[t:], - pitch[:, t // self.window:] if t is not None else pitch, - pitchf[:, t // self.window:] if t is not None else pitchf, + pitch[:, t // self.window :] if t is not None else pitch, + pitchf[:, t // self.window :] if t is not None else pitchf, times, index, big_npy, index_rate, version, protect, - )[self.t_pad_tgt: -self.t_pad_tgt] + )[self.t_pad_tgt : -self.t_pad_tgt] ) else: audio_opt.append( @@ -435,7 +435,7 @@ class Pipeline(object): index_rate, version, protect, - )[self.t_pad_tgt: -self.t_pad_tgt] + )[self.t_pad_tgt : -self.t_pad_tgt] ) audio_opt = np.concatenate(audio_opt) if 
rms_mix_rate != 1: @@ -453,4 +453,3 @@ class Pipeline(object): if torch.cuda.is_available(): torch.cuda.empty_cache() return audio_opt - \ No newline at end of file diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py index 9ba2ea5..933775f 100644 --- a/infer/modules/vc/utils.py +++ b/infer/modules/vc/utils.py @@ -6,7 +6,19 @@ from fairseq import checkpoint_utils def get_index_path_from_model(sid): - return next((f for f in [os.path.join(root, name) for root, dirs, files in os.walk(os.getenv("index_root"), topdown=False) for name in files if name.endswith(".index") and "trained" not in name] if sid.split(".")[0] in f), "") + return next( + ( + f + for f in [ + os.path.join(root, name) + for root, dirs, files in os.walk(os.getenv("index_root"), topdown=False) + for name in files + if name.endswith(".index") and "trained" not in name + ] + if sid.split(".")[0] in f + ), + "", + ) def load_hubert(config): @@ -21,8 +33,8 @@ def load_hubert(config): else: hubert_model = hubert_model.float() return hubert_model.eval() - - + + def load_audio(file, sr): try: # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 @@ -39,4 +51,4 @@ def load_audio(file, sr): except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - return np.frombuffer(out, np.float32).flatten() \ No newline at end of file + return np.frombuffer(out, np.float32).flatten() From 0de947cf70cc9a3f7108342101d1a3216156787e Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 19:57:30 +0900 Subject: [PATCH 09/65] uvr5 modules --- infer/modules/uvr5/mdxnet.py | 239 +++++++++++++++++++++ infer/modules/uvr5/modules.py | 92 +++++++++ infer/modules/uvr5/preprocess.py | 344 +++++++++++++++++++++++++++++++ 3 files changed, 675 insertions(+) create mode 100644 infer/modules/uvr5/mdxnet.py create mode 100644 infer/modules/uvr5/modules.py create mode 100644 infer/modules/uvr5/preprocess.py diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py new file mode 100644 index 0000000..9ca22e2 --- /dev/null +++ b/infer/modules/uvr5/mdxnet.py @@ -0,0 +1,239 @@ +import os +import warnings + +import soundfile as sf +import librosa +import numpy as np +import onnxruntime as ort +from tqdm import tqdm +import torch + +cpu = torch.device("cpu") + + +class ConvTDFNetTrim: + def __init__( + self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 + ): + super(ConvTDFNetTrim, self).__init__() + + self.dim_f = dim_f + self.dim_t = 2**dim_t + self.n_fft = n_fft + self.hop = hop + self.n_bins = self.n_fft // 2 + 1 + self.chunk_size = hop * (self.dim_t - 1) + self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( + device + ) + self.target_name = target_name + self.blender = "blender" in model_name + + self.dim_c = 4 + out_c = self.dim_c * 4 if target_name == "*" else self.dim_c + self.freq_pad = torch.zeros( + [1, out_c, self.n_bins - self.dim_f, self.dim_t] + ).to(device) + + self.n = L // 2 + + def stft(self, x): + x = x.reshape([-1, self.chunk_size]) + x = torch.stft( + x, + n_fft=self.n_fft, + hop_length=self.hop, + window=self.window, + center=True, + return_complex=True, + ) + x = torch.view_as_real(x) + x = x.permute([0, 3, 1, 2]) + x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( + [-1, self.dim_c, self.n_bins, self.dim_t] + ) + return x[:, :, : self.dim_f] + + def istft(self, x, freq_pad=None): + freq_pad = ( + self.freq_pad.repeat([x.shape[0], 1, 1, 1]) + if freq_pad is None + else freq_pad + ) + x = torch.cat([x, freq_pad], -2) + c = 4 * 2 if 
self.target_name == "*" else 2 + x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( + [-1, 2, self.n_bins, self.dim_t] + ) + x = x.permute([0, 2, 3, 1]) + x = x.contiguous() + x = torch.view_as_complex(x) + x = torch.istft( + x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True + ) + return x.reshape([-1, c, self.chunk_size]) + + +def get_models(device, dim_f, dim_t, n_fft): + return ConvTDFNetTrim( + device=device, + model_name="Conv-TDF", + target_name="vocals", + L=11, + dim_f=dim_f, + dim_t=dim_t, + n_fft=n_fft, + ) + + +class Predictor: + def __init__(self, args): + print(ort.get_available_providers()) + self.args = args + self.model_ = get_models( + device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft + ) + self.model = ort.InferenceSession( + os.path.join(args.onnx, self.model_.target_name + ".onnx"), + providers=["CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"], + ) + print("onnx load done") + + def demix(self, mix): + samples = mix.shape[-1] + margin = self.args.margin + chunk_size = self.args.chunks * 44100 + assert not margin == 0, "margin cannot be zero!" + if margin > chunk_size: + margin = chunk_size + + segmented_mix = {} + + if self.args.chunks == 0 or samples < chunk_size: + chunk_size = samples + + counter = -1 + for skip in range(0, samples, chunk_size): + counter += 1 + + s_margin = 0 if counter == 0 else margin + end = min(skip + chunk_size + margin, samples) + + start = skip - s_margin + + segmented_mix[skip] = mix[:, start:end].copy() + if end == samples: + break + + sources = self.demix_base(segmented_mix, margin_size=margin) + """ + mix:(2,big_sample) + segmented_mix:offset->(2,small_sample) + sources:(1,2,big_sample) + """ + return sources + + def demix_base(self, mixes, margin_size): + chunked_sources = [] + progress_bar = tqdm(total=len(mixes)) + progress_bar.set_description("Processing") + for mix in mixes: + cmix = mixes[mix] + sources = [] + n_sample = cmix.shape[1] + model = self.model_ + trim = model.n_fft // 2 + gen_size = model.chunk_size - 2 * trim + pad = gen_size - n_sample % gen_size + mix_p = np.concatenate( + (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 + ) + mix_waves = [] + i = 0 + while i < n_sample + pad: + waves = np.array(mix_p[:, i : i + model.chunk_size]) + mix_waves.append(waves) + i += gen_size + mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) + with torch.no_grad(): + _ort = self.model + spek = model.stft(mix_waves) + if self.args.denoise: + spec_pred = ( + -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 + + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 + ) + tar_waves = model.istft(torch.tensor(spec_pred)) + else: + tar_waves = model.istft( + torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) + ) + tar_signal = ( + tar_waves[:, :, trim:-trim] + .transpose(0, 1) + .reshape(2, -1) + .numpy()[:, :-pad] + ) + + start = 0 if mix == 0 else margin_size + end = None if mix == list(mixes.keys())[::-1][0] else -margin_size + if margin_size == 0: + end = None + sources.append(tar_signal[:, start:end]) + + progress_bar.update(1) + + chunked_sources.append(sources) + _sources = np.concatenate(chunked_sources, axis=-1) + # del self.model + progress_bar.close() + return _sources + + def prediction(self, m, vocal_root, others_root, format): + os.makedirs(vocal_root, exist_ok=True) + os.makedirs(others_root, exist_ok=True) + basename = os.path.basename(m) + mix, rate = librosa.load(m, mono=False, sr=44100) + if 
mix.ndim == 1: + mix = np.asfortranarray([mix, mix]) + mix = mix.T + sources = self.demix(mix.T) + opt = sources[0].T + if format in ["wav", "flac"]: + sf.write( + "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate + ) + sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) + else: + path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) + path_other = "%s/%s_others.wav" % (others_root, basename) + sf.write(path_vocal, mix - opt, rate) + sf.write(path_other, opt, rate) + if os.path.exists(path_vocal): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path_vocal, path_vocal[:-4] + ".%s" % format) + ) + if os.path.exists(path_other): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path_other, path_other[:-4] + ".%s" % format) + ) + + +class MDXNetDereverb: + def __init__(self, chunks, device): + self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy" + self.shifts = 10 # 'Predict with randomised equivariant stabilisation' + self.mixing = "min_mag" # ['default','min_mag','max_mag'] + self.chunks = chunks + self.margin = 44100 + self.dim_t = 9 + self.dim_f = 3072 + self.n_fft = 6144 + self.denoise = True + self.pred = Predictor(self) + self.device = device + + def path_audio(self, input, vocal_root, others_root, format): + self.pred.prediction(input, vocal_root, others_root, format) \ No newline at end of file diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py new file mode 100644 index 0000000..385bd13 --- /dev/null +++ b/infer/modules/uvr5/modules.py @@ -0,0 +1,92 @@ +import os +import traceback + +import torch +import ffmpeg + +from configs.config import Config +from infer.modules.uvr5.preprocess import AudioPre, AudioPreDeEcho +from infer.modules.uvr5.mdxnet import MDXNetDereverb + +config = Config() + + +def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): + infos = [] + try: + inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + save_root_vocal = ( + save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + save_root_ins = ( + save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) + if model_name == "onnx_dereverb_By_FoxJoy": + pre_fun = MDXNetDereverb(15, config.device) + else: + func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho + pre_fun = func( + agg=int(agg), + model_path=os.path.join(os.getenv("weight_uvr5_root"), model_name + ".pth"), + device=config.device, + is_half=config.is_half, + ) + if inp_root != "": + paths = [os.path.join(inp_root, name) for name in os.listdir(inp_root)] + else: + paths = [path.name for path in paths] + for path in paths: + inp_path = os.path.join(inp_root, path) + need_reformat = 1 + done = 0 + try: + info = ffmpeg.probe(inp_path, cmd="ffprobe") + if ( + info["streams"][0]["channels"] == 2 + and info["streams"][0]["sample_rate"] == "44100" + ): + need_reformat = 0 + pre_fun._path_audio_( + inp_path, save_root_ins, save_root_vocal, format0 + ) + done = 1 + except: + need_reformat = 1 + traceback.print_exc() + if need_reformat == 1: + tmp_path = "%s/%s.reformatted.wav" % (os.path.join("tmp"), os.path.basename(inp_path)) + os.system( + "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" + % (inp_path, tmp_path) + ) + inp_path = tmp_path + try: + if done == 0: + pre_fun.path_audio( + inp_path, save_root_ins, save_root_vocal, format0 + ) + infos.append("%s->Success" % (os.path.basename(inp_path))) + yield "\n".join(infos) + except: + infos.append( + "%s->%s" % 
(os.path.basename(inp_path), traceback.format_exc()) + ) + yield "\n".join(infos) + except: + infos.append(traceback.format_exc()) + yield "\n".join(infos) + finally: + try: + if model_name == "onnx_dereverb_By_FoxJoy": + del pre_fun.pred.model + del pre_fun.pred.model_ + else: + del pre_fun.model + del pre_fun + except: + traceback.print_exc() + print("clean_empty_cache") + if torch.cuda.is_available(): + torch.cuda.empty_cache() + yield "\n".join(infos) + \ No newline at end of file diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/preprocess.py new file mode 100644 index 0000000..b8dafdc --- /dev/null +++ b/infer/modules/uvr5/preprocess.py @@ -0,0 +1,344 @@ +import os +import torch + +import librosa +import numpy as np +import soundfile as sf + +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.utils import inference +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters + +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets + + +class AudioPre: + def __init__(self, agg, model_path, device, is_half): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": False, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + model = Nets.CascadedASPPNet(mp.param["bins"] * 2) + cpk = torch.load(model_path, map_location="cpu") + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + + def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"): + if ins_root is None and vocal_root is None: + return "No save root." 
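+ # Processing outline for the steps below:
+ # 1. load/resample the input once per frequency band defined in the model params,
+ # 2. take each band's STFT and merge them into one combined spectrogram,
+ # 3. run the network to predict the instrumental magnitude; y_spec_m is the
+ # instrumental spectrogram and v_spec_m = X_spec_m - y_spec_m the vocal one,
+ # 4. reconstruct both waveforms and write them out, re-encoding with ffmpeg for
+ # formats other than wav/flac.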
+ name = os.path.basename(music_file) + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.core.resample( + X_wave[d + 1], + self.mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + print("%s instruments done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + ins_root, + "instrument_{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) # + else: + path = os.path.join( + ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format) + ) + if vocal_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + print("%s vocals done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + vocal_root, + "vocal_{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + else: + path = os.path.join( + vocal_root, 
"vocal_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format) + ) + + +class AudioPreDeEcho: + def __init__(self, agg, model_path, device, is_half): + self.model_path = model_path + self.device = device + self.data = { + # Processing Options + "postprocess": False, + "tta": False, + # Constants + "window_size": 512, + "agg": agg, + "high_end_process": "mirroring", + } + mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") + nout = 64 if "DeReverb" in model_path else 48 + model = CascadedNet(mp.param["bins"] * 2, nout) + cpk = torch.load(model_path, map_location="cpu") + model.load_state_dict(cpk) + model.eval() + if is_half: + model = model.half().to(device) + else: + model = model.to(device) + + self.mp = mp + self.model = model + + def _path_audio_( + self, music_file, vocal_root=None, ins_root=None, format="flac" + ): # 3个VR模型vocal和ins是反的 + if ins_root is None and vocal_root is None: + return "No save root." + name = os.path.basename(music_file) + if ins_root is not None: + os.makedirs(ins_root, exist_ok=True) + if vocal_root is not None: + os.makedirs(vocal_root, exist_ok=True) + X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} + bands_n = len(self.mp.param["band"]) + # print(bands_n) + for d in range(bands_n, 0, -1): + bp = self.mp.param["band"][d] + if d == bands_n: # high-end band + ( + X_wave[d], + _, + ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 + music_file, + bp["sr"], + False, + dtype=np.float32, + res_type=bp["res_type"], + ) + if X_wave[d].ndim == 1: + X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) + else: # lower bands + X_wave[d] = librosa.core.resample( + X_wave[d + 1], + self.mp.param["band"][d + 1]["sr"], + bp["sr"], + res_type=bp["res_type"], + ) + # Stft of wave source + X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( + X_wave[d], + bp["hl"], + bp["n_fft"], + self.mp.param["mid_side"], + self.mp.param["mid_side_b2"], + self.mp.param["reverse"], + ) + # pdb.set_trace() + if d == bands_n and self.data["high_end_process"] != "none": + input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( + self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] + ) + input_high_end = X_spec_s[d][ + :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : + ] + + X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) + aggresive_set = float(self.data["agg"] / 100) + aggressiveness = { + "value": aggresive_set, + "split_bin": self.mp.param["band"][1]["crop_stop"], + } + with torch.no_grad(): + pred, X_mag, X_phase = inference( + X_spec_m, self.device, self.model, aggressiveness, self.data + ) + # Postprocess + if self.data["postprocess"]: + pred_inv = np.clip(X_mag - pred, 0, np.inf) + pred = spec_utils.mask_silence(pred, pred_inv) + y_spec_m = pred * X_phase + v_spec_m = X_spec_m - y_spec_m + + if ins_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], y_spec_m, input_high_end, self.mp + ) + wav_instrument = spec_utils.cmb_spectrogram_to_wave( + y_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) + print("%s instruments done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + ins_root, + 
"instrument_{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) # + else: + path = os.path.join( + ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_instrument) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format) + ) + if vocal_root is not None: + if self.data["high_end_process"].startswith("mirroring"): + input_high_end_ = spec_utils.mirroring( + self.data["high_end_process"], v_spec_m, input_high_end, self.mp + ) + wav_vocals = spec_utils.cmb_spectrogram_to_wave( + v_spec_m, self.mp, input_high_end_h, input_high_end_ + ) + else: + wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) + print("%s vocals done" % name) + if format in ["wav", "flac"]: + sf.write( + os.path.join( + vocal_root, + "vocal_{}_{}.{}".format(name, self.data["agg"], format), + ), + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + else: + path = os.path.join( + vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) + ) + sf.write( + path, + (np.array(wav_vocals) * 32768).astype("int16"), + self.mp.param["sr"], + ) + if os.path.exists(path): + os.system( + "ffmpeg -i %s -vn %s -q:a 2 -y" + % (path, path[:-4] + ".%s" % format) + ) \ No newline at end of file From cb42c6990b6ef37a5c891470a98e82f8a6271cf3 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sat, 19 Aug 2023 10:58:39 +0000 Subject: [PATCH 10/65] Apply Code Formatter Change --- infer/modules/uvr5/mdxnet.py | 12 ++++++++---- infer/modules/uvr5/modules.py | 10 +++++++--- infer/modules/uvr5/preprocess.py | 6 +++--- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index 9ca22e2..bd84f82 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -13,7 +13,7 @@ cpu = torch.device("cpu") class ConvTDFNetTrim: def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 + self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 ): super(ConvTDFNetTrim, self).__init__() @@ -83,7 +83,7 @@ def get_models(device, dim_f, dim_t, n_fft): dim_f=dim_f, dim_t=dim_t, n_fft=n_fft, - ) + ) class Predictor: @@ -95,7 +95,11 @@ class Predictor: ) self.model = ort.InferenceSession( os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=["CUDAExecutionProvider", "DmlExecutionProvider", "CPUExecutionProvider"], + providers=[ + "CUDAExecutionProvider", + "DmlExecutionProvider", + "CPUExecutionProvider", + ], ) print("onnx load done") @@ -236,4 +240,4 @@ class MDXNetDereverb: self.device = device def path_audio(self, input, vocal_root, others_root, format): - self.pred.prediction(input, vocal_root, others_root, format) \ No newline at end of file + self.pred.prediction(input, vocal_root, others_root, format) diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py index 385bd13..4f5269a 100644 --- a/infer/modules/uvr5/modules.py +++ b/infer/modules/uvr5/modules.py @@ -27,7 +27,9 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format func = AudioPre if "DeEcho" not in model_name else AudioPreDeEcho pre_fun = func( agg=int(agg), - model_path=os.path.join(os.getenv("weight_uvr5_root"), model_name + ".pth"), + model_path=os.path.join( + os.getenv("weight_uvr5_root"), model_name + ".pth" + ), 
device=config.device, is_half=config.is_half, ) @@ -54,7 +56,10 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format need_reformat = 1 traceback.print_exc() if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % (os.path.join("tmp"), os.path.basename(inp_path)) + tmp_path = "%s/%s.reformatted.wav" % ( + os.path.join("tmp"), + os.path.basename(inp_path), + ) os.system( "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" % (inp_path, tmp_path) @@ -89,4 +94,3 @@ def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format if torch.cuda.is_available(): torch.cuda.empty_cache() yield "\n".join(infos) - \ No newline at end of file diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/preprocess.py index b8dafdc..86c3ab0 100644 --- a/infer/modules/uvr5/preprocess.py +++ b/infer/modules/uvr5/preprocess.py @@ -205,7 +205,7 @@ class AudioPreDeEcho: self.model = model def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac" + self, music_file, vocal_root=None, ins_root=None, format="flac" ): # 3个VR模型vocal和ins是反的 if ins_root is None and vocal_root is None: return "No save root." @@ -222,7 +222,7 @@ class AudioPreDeEcho: if d == bands_n: # high-end band ( X_wave[d], - _, + _, ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 music_file, bp["sr"], @@ -341,4 +341,4 @@ class AudioPreDeEcho: os.system( "ffmpeg -i %s -vn %s -q:a 2 -y" % (path, path[:-4] + ".%s" % format) - ) \ No newline at end of file + ) From 6721b81dcffbaefe23a179f26ea222ecf6019f46 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 20:00:56 +0900 Subject: [PATCH 11/65] replace lib --- infer/lib/audio.py | 21 + infer/lib/rmvpe.py | 654 ++++++++++++++++++++++++++++++ infer/lib/slicer2.py | 260 ++++++++++++ infer/lib/train/data_utils.py | 512 +++++++++++++++++++++++ infer/lib/train/losses.py | 58 +++ infer/lib/train/mel_processing.py | 130 ++++++ infer/lib/train/process_ckpt.py | 259 ++++++++++++ infer/lib/train/utils.py | 487 ++++++++++++++++++++++ 8 files changed, 2381 insertions(+) create mode 100644 infer/lib/audio.py create mode 100644 infer/lib/rmvpe.py create mode 100644 infer/lib/slicer2.py create mode 100644 infer/lib/train/data_utils.py create mode 100644 infer/lib/train/losses.py create mode 100644 infer/lib/train/mel_processing.py create mode 100644 infer/lib/train/process_ckpt.py create mode 100644 infer/lib/train/utils.py diff --git a/infer/lib/audio.py b/infer/lib/audio.py new file mode 100644 index 0000000..776939d --- /dev/null +++ b/infer/lib/audio.py @@ -0,0 +1,21 @@ +import ffmpeg +import numpy as np + + +def load_audio(file, sr): + try: + # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 + # This launches a subprocess to decode audio while down-mixing and resampling as necessary. + # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
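+ # The stream is decoded to mono 32-bit float PCM at the requested sample rate `sr`
+ # and returned as a flat numpy float32 array.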
+ file = ( + file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") + ) # 防止小白拷路径头尾带了空格和"和回车 + out, _ = ( + ffmpeg.input(file, threads=0) + .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) + .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) + ) + except Exception as e: + raise RuntimeError(f"Failed to load audio: {e}") + + return np.frombuffer(out, np.float32).flatten() diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py new file mode 100644 index 0000000..25dcb8c --- /dev/null +++ b/infer/lib/rmvpe.py @@ -0,0 +1,654 @@ +import torch, numpy as np,pdb +import torch.nn as nn +import torch.nn.functional as F +import torch,pdb +import numpy as np +import torch.nn.functional as F +from scipy.signal import get_window +from librosa.util import pad_center, tiny,normalize +###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py +def window_sumsquare(window, n_frames, hop_length=200, win_length=800, + n_fft=800, dtype=np.float32, norm=None): + """ + # from librosa 0.6 + Compute the sum-square envelope of a window function at a given hop length. + This is used to estimate modulation effects induced by windowing + observations in short-time fourier transforms. + Parameters + ---------- + window : string, tuple, number, callable, or list-like + Window specification, as in `get_window` + n_frames : int > 0 + The number of analysis frames + hop_length : int > 0 + The number of samples to advance between frames + win_length : [optional] + The length of the window function. By default, this matches `n_fft`. + n_fft : int > 0 + The length of each analysis frame. + dtype : np.dtype + The data type of the output + Returns + ------- + wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` + The sum-squared envelope of the window function + """ + if win_length is None: + win_length = n_fft + + n = n_fft + hop_length * (n_frames - 1) + x = np.zeros(n, dtype=dtype) + + # Compute the squared window at the desired length + win_sq = get_window(window, win_length, fftbins=True) + win_sq = normalize(win_sq, norm=norm)**2 + win_sq = pad_center(win_sq, n_fft) + + # Fill the envelope + for i in range(n_frames): + sample = i * hop_length + x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + return x + +class STFT(torch.nn.Module): + def __init__(self, filter_length=1024, hop_length=512, win_length=None, + window='hann'): + """ + This module implements an STFT using 1D convolution and 1D transpose convolutions. + This is a bit tricky so there are some cases that probably won't work as working + out the same sizes before and after in all overlap add setups is tough. Right now, + this code should work with hop lengths that are half the filter length (50% overlap + between frames). + + Keyword Arguments: + filter_length {int} -- Length of filters used (default: {1024}) + hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) + win_length {[type]} -- Length of the window function applied to each frame (if not specified, it + equals the filter length). 
(default: {None}) + window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) + (default: {'hann'}) + """ + super(STFT, self).__init__() + self.filter_length = filter_length + self.hop_length = hop_length + self.win_length = win_length if win_length else filter_length + self.window = window + self.forward_transform = None + self.pad_amount = int(self.filter_length / 2) + scale = self.filter_length / self.hop_length + fourier_basis = np.fft.fft(np.eye(self.filter_length)) + + cutoff = int((self.filter_length / 2 + 1)) + fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),np.imag(fourier_basis[:cutoff, :])]) + forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) + inverse_basis = torch.FloatTensor( + np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + + assert (filter_length >= self.win_length) + # get window and zero center pad it to filter_length + fft_window = get_window(window, self.win_length, fftbins=True) + fft_window = pad_center(fft_window, size=filter_length) + fft_window = torch.from_numpy(fft_window).float() + + # window the bases + forward_basis *= fft_window + inverse_basis *= fft_window + + self.register_buffer('forward_basis', forward_basis.float()) + self.register_buffer('inverse_basis', inverse_basis.float()) + + def transform(self, input_data): + """Take input data (audio) to STFT domain. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + """ + num_batches = input_data.shape[0] + num_samples = input_data.shape[-1] + + self.num_samples = num_samples + + # similar to librosa, reflect-pad the input + input_data = input_data.view(num_batches, 1, num_samples) + # print(1234,input_data.shape) + input_data = F.pad(input_data.unsqueeze(1),(self.pad_amount, self.pad_amount, 0, 0,0,0),mode='reflect').squeeze(1) + # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length) + # pdb.set_trace() + forward_transform = F.conv1d( + input_data, + self.forward_basis, + stride=self.hop_length, + padding=0) + + cutoff = int((self.filter_length / 2) + 1) + real_part = forward_transform[:, :cutoff, :] + imag_part = forward_transform[:, cutoff:, :] + + magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + # phase = torch.atan2(imag_part.data, real_part.data) + + return magnitude#, phase + + def inverse(self, magnitude, phase): + """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced + by the ```transform``` function. + + Arguments: + magnitude {tensor} -- Magnitude of STFT with shape (num_batch, + num_frequencies, num_frames) + phase {tensor} -- Phase of STFT with shape (num_batch, + num_frequencies, num_frames) + + Returns: + inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. 
Of + shape (num_batch, num_samples) + """ + recombine_magnitude_phase = torch.cat( + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1) + + inverse_transform = F.conv_transpose1d( + recombine_magnitude_phase, + self.inverse_basis, + stride=self.hop_length, + padding=0) + + if self.window is not None: + window_sum = window_sumsquare( + self.window, magnitude.size(-1), hop_length=self.hop_length, + win_length=self.win_length, n_fft=self.filter_length, + dtype=np.float32) + # remove modulation effects + approx_nonzero_indices = torch.from_numpy( + np.where(window_sum > tiny(window_sum))[0]) + window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices] + + # scale by hop ratio + inverse_transform *= float(self.filter_length) / self.hop_length + + inverse_transform = inverse_transform[..., self.pad_amount:] + inverse_transform = inverse_transform[..., :self.num_samples] + inverse_transform = inverse_transform.squeeze(1) + + return inverse_transform + + def forward(self, input_data): + """Take input data (audio) to STFT domain and then back to audio. + + Arguments: + input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) + + Returns: + reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of + shape (num_batch, num_samples) + """ + self.magnitude, self.phase = self.transform(input_data) + reconstruction = self.inverse(self.magnitude, self.phase) + return reconstruction +from time import time as ttime +class BiGRU(nn.Module): + def __init__(self, input_features, hidden_features, num_layers): + super(BiGRU, self).__init__() + self.gru = nn.GRU( + input_features, + hidden_features, + num_layers=num_layers, + batch_first=True, + bidirectional=True, + ) + + def forward(self, x): + return self.gru(x)[0] + + +class ConvBlockRes(nn.Module): + def __init__(self, in_channels, out_channels, momentum=0.01): + super(ConvBlockRes, self).__init__() + self.conv = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=(1, 1), + padding=(1, 1), + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + if in_channels != out_channels: + self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) + self.is_shortcut = True + else: + self.is_shortcut = False + + def forward(self, x): + if self.is_shortcut: + return self.conv(x) + self.shortcut(x) + else: + return self.conv(x) + x + + +class Encoder(nn.Module): + def __init__( + self, + in_channels, + in_size, + n_encoders, + kernel_size, + n_blocks, + out_channels=16, + momentum=0.01, + ): + super(Encoder, self).__init__() + self.n_encoders = n_encoders + self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) + self.layers = nn.ModuleList() + self.latent_channels = [] + for i in range(self.n_encoders): + self.layers.append( + ResEncoderBlock( + in_channels, out_channels, kernel_size, n_blocks, momentum=momentum + ) + ) + self.latent_channels.append([out_channels, in_size]) + in_channels = out_channels + out_channels *= 2 + in_size //= 2 + self.out_size = in_size + self.out_channel = out_channels + + def forward(self, x): + concat_tensors = [] + x = self.bn(x) + for i in range(self.n_encoders): + _, x = 
self.layers[i](x) + concat_tensors.append(_) + return x, concat_tensors + + +class ResEncoderBlock(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 + ): + super(ResEncoderBlock, self).__init__() + self.n_blocks = n_blocks + self.conv = nn.ModuleList() + self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) + self.kernel_size = kernel_size + if self.kernel_size is not None: + self.pool = nn.AvgPool2d(kernel_size=kernel_size) + + def forward(self, x): + for i in range(self.n_blocks): + x = self.conv[i](x) + if self.kernel_size is not None: + return x, self.pool(x) + else: + return x + + +class Intermediate(nn.Module): # + def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): + super(Intermediate, self).__init__() + self.n_inters = n_inters + self.layers = nn.ModuleList() + self.layers.append( + ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) + ) + for i in range(self.n_inters - 1): + self.layers.append( + ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) + ) + + def forward(self, x): + for i in range(self.n_inters): + x = self.layers[i](x) + return x + + +class ResDecoderBlock(nn.Module): + def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): + super(ResDecoderBlock, self).__init__() + out_padding = (0, 1) if stride == (1, 2) else (1, 1) + self.n_blocks = n_blocks + self.conv1 = nn.Sequential( + nn.ConvTranspose2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=(3, 3), + stride=stride, + padding=(1, 1), + output_padding=out_padding, + bias=False, + ), + nn.BatchNorm2d(out_channels, momentum=momentum), + nn.ReLU(), + ) + self.conv2 = nn.ModuleList() + self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) + for i in range(n_blocks - 1): + self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) + + def forward(self, x, concat_tensor): + x = self.conv1(x) + x = torch.cat((x, concat_tensor), dim=1) + for i in range(self.n_blocks): + x = self.conv2[i](x) + return x + + +class Decoder(nn.Module): + def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): + super(Decoder, self).__init__() + self.layers = nn.ModuleList() + self.n_decoders = n_decoders + for i in range(self.n_decoders): + out_channels = in_channels // 2 + self.layers.append( + ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) + ) + in_channels = out_channels + + def forward(self, x, concat_tensors): + for i in range(self.n_decoders): + x = self.layers[i](x, concat_tensors[-1 - i]) + return x + + +class DeepUnet(nn.Module): + def __init__( + self, + kernel_size, + n_blocks, + en_de_layers=5, + inter_layers=4, + in_channels=1, + en_out_channels=16, + ): + super(DeepUnet, self).__init__() + self.encoder = Encoder( + in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels + ) + self.intermediate = Intermediate( + self.encoder.out_channel // 2, + self.encoder.out_channel, + inter_layers, + n_blocks, + ) + self.decoder = Decoder( + self.encoder.out_channel, en_de_layers, kernel_size, n_blocks + ) + + def forward(self, x): + x, concat_tensors = self.encoder(x) + x = self.intermediate(x) + x = self.decoder(x, concat_tensors) + return x + + +class E2E(nn.Module): + def __init__( + self, + n_blocks, + n_gru, + kernel_size, + en_de_layers=5, + inter_layers=4, + 
in_channels=1, + en_out_channels=16, + ): + super(E2E, self).__init__() + self.unet = DeepUnet( + kernel_size, + n_blocks, + en_de_layers, + inter_layers, + in_channels, + en_out_channels, + ) + self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) + if n_gru: + self.fc = nn.Sequential( + BiGRU(3 * 128, 256, n_gru), + nn.Linear(512, 360), + nn.Dropout(0.25), + nn.Sigmoid(), + ) + else: + self.fc = nn.Sequential( + nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() + ) + + def forward(self, mel): + # print(mel.shape) + mel = mel.transpose(-1, -2).unsqueeze(1) + x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) + x = self.fc(x) + # print(x.shape) + return x + + +from librosa.filters import mel + + +class MelSpectrogram(torch.nn.Module): + def __init__( + self, + is_half, + n_mel_channels, + sampling_rate, + win_length, + hop_length, + n_fft=None, + mel_fmin=0, + mel_fmax=None, + clamp=1e-5, + ): + super().__init__() + n_fft = win_length if n_fft is None else n_fft + self.hann_window = {} + mel_basis = mel( + sr=sampling_rate, + n_fft=n_fft, + n_mels=n_mel_channels, + fmin=mel_fmin, + fmax=mel_fmax, + htk=True, + ) + mel_basis = torch.from_numpy(mel_basis).float() + self.register_buffer("mel_basis", mel_basis) + self.n_fft = win_length if n_fft is None else n_fft + self.hop_length = hop_length + self.win_length = win_length + self.sampling_rate = sampling_rate + self.n_mel_channels = n_mel_channels + self.clamp = clamp + self.is_half = is_half + + def forward(self, audio, keyshift=0, speed=1, center=True): + factor = 2 ** (keyshift / 12) + n_fft_new = int(np.round(self.n_fft * factor)) + win_length_new = int(np.round(self.win_length * factor)) + hop_length_new = int(np.round(self.hop_length * speed)) + keyshift_key = str(keyshift) + "_" + str(audio.device) + if keyshift_key not in self.hann_window: + self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( + # "cpu"if(audio.device.type=="privateuseone") else audio.device + audio.device + ) + # fft = torch.stft(#doesn't support pytorch_dml + # # audio.cpu() if(audio.device.type=="privateuseone")else audio, + # audio, + # n_fft=n_fft_new, + # hop_length=hop_length_new, + # win_length=win_length_new, + # window=self.hann_window[keyshift_key], + # center=center, + # return_complex=True, + # ) + # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) + # print(1111111111) + # print(222222222222222,audio.device,self.is_half) + if hasattr(self, "stft") == False: + # print(n_fft_new,hop_length_new,win_length_new,audio.shape) + self.stft=STFT( + filter_length=n_fft_new, + hop_length=hop_length_new, + win_length=win_length_new, + window='hann' + ).to(audio.device) + magnitude = self.stft.transform(audio)#phase + # if (audio.device.type == "privateuseone"): + # magnitude=magnitude.to(audio.device) + if keyshift != 0: + size = self.n_fft // 2 + 1 + resize = magnitude.size(1) + if resize < size: + magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) + magnitude = magnitude[:, :size, :] * self.win_length / win_length_new + mel_output = torch.matmul(self.mel_basis, magnitude) + if self.is_half == True: + mel_output = mel_output.half() + log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) + # print(log_mel_spec.device.type) + return log_mel_spec + + +class RMVPE: + def __init__(self, model_path, is_half, device=None): + self.resample_kernel = {} + self.resample_kernel = {} + self.is_half = is_half + if device is None: + device = "cuda" if torch.cuda.is_available() else "cpu" + self.device 
= device + self.mel_extractor = MelSpectrogram( + is_half, 128, 16000, 1024, 160, None, 30, 8000 + ).to(device) + if ("privateuseone" in str(device)): + import onnxruntime as ort + ort_session = ort.InferenceSession("rmvpe.onnx", providers=["DmlExecutionProvider"]) + self.model=ort_session + else: + model = E2E(4, 1, (2, 2)) + ckpt = torch.load(model_path, map_location="cpu") + model.load_state_dict(ckpt) + model.eval() + if is_half == True: + model = model.half() + self.model = model + self.model = self.model.to(device) + cents_mapping = 20 * np.arange(360) + 1997.3794084376191 + self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 + + def mel2hidden(self, mel): + with torch.no_grad(): + n_frames = mel.shape[-1] + mel = F.pad( + mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" + ) + if("privateuseone" in str(self.device) ): + onnx_input_name = self.model.get_inputs()[0].name + onnx_outputs_names = self.model.get_outputs()[0].name + hidden = self.model.run([onnx_outputs_names], input_feed={onnx_input_name: mel.cpu().numpy()})[0] + else: + hidden = self.model(mel) + return hidden[:, :n_frames] + + def decode(self, hidden, thred=0.03): + cents_pred = self.to_local_average_cents(hidden, thred=thred) + f0 = 10 * (2 ** (cents_pred / 1200)) + f0[f0 == 10] = 0 + # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) + return f0 + + def infer_from_audio(self, audio, thred=0.03): + # torch.cuda.synchronize() + t0=ttime() + mel = self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True) + # print(123123123,mel.device.type) + # torch.cuda.synchronize() + t1=ttime() + hidden = self.mel2hidden(mel) + # torch.cuda.synchronize() + t2=ttime() + # print(234234,hidden.device.type) + if("privateuseone" not in str(self.device)): + hidden = hidden.squeeze(0).cpu().numpy() + else: + hidden=hidden[0] + if self.is_half == True: + hidden = hidden.astype("float32") + + f0 = self.decode(hidden, thred=thred) + # torch.cuda.synchronize() + t3=ttime() + # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) + return f0 + + def to_local_average_cents(self, salience, thred=0.05): + # t0 = ttime() + center = np.argmax(salience, axis=1) # 帧长#index + salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 + # t1 = ttime() + center += 4 + todo_salience = [] + todo_cents_mapping = [] + starts = center - 4 + ends = center + 5 + for idx in range(salience.shape[0]): + todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) + todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) + # t2 = ttime() + todo_salience = np.array(todo_salience) # 帧长,9 + todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 + product_sum = np.sum(todo_salience * todo_cents_mapping, 1) + weight_sum = np.sum(todo_salience, 1) # 帧长 + devided = product_sum / weight_sum # 帧长 + # t3 = ttime() + maxx = np.max(salience, axis=1) # 帧长 + devided[maxx <= thred] = 0 + # t4 = ttime() + # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) + return devided + + +if __name__ == '__main__': + import soundfile as sf, librosa + audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") + if len(audio.shape) > 1: + audio = librosa.to_mono(audio.transpose(1, 0)) + audio_bak = audio.copy() + if sampling_rate != 16000: + audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) + model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" + thred = 0.03 # 0.01 + device = 'cuda' if 
torch.cuda.is_available() else 'cpu' + rmvpe = RMVPE(model_path,is_half=False, device=device) + t0=ttime() + f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + # f0 = rmvpe.infer_from_audio(audio, thred=thred) + t1=ttime() + print(f0.shape,t1-t0) diff --git a/infer/lib/slicer2.py b/infer/lib/slicer2.py new file mode 100644 index 0000000..7d9d16d --- /dev/null +++ b/infer/lib/slicer2.py @@ -0,0 +1,260 @@ +import numpy as np + + +# This function is obtained from librosa. +def get_rms( + y, + frame_length=2048, + hop_length=512, + pad_mode="constant", +): + padding = (int(frame_length // 2), int(frame_length // 2)) + y = np.pad(y, padding, mode=pad_mode) + + axis = -1 + # put our new within-frame axis at the end for now + out_strides = y.strides + tuple([y.strides[axis]]) + # Reduce the shape on the framing axis + x_shape_trimmed = list(y.shape) + x_shape_trimmed[axis] -= frame_length - 1 + out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) + xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) + if axis < 0: + target_axis = axis - 1 + else: + target_axis = axis + 1 + xw = np.moveaxis(xw, -1, target_axis) + # Downsample along the target axis + slices = [slice(None)] * xw.ndim + slices[axis] = slice(0, None, hop_length) + x = xw[tuple(slices)] + + # Calculate power + power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) + + return np.sqrt(power) + + +class Slicer: + def __init__( + self, + sr: int, + threshold: float = -40.0, + min_length: int = 5000, + min_interval: int = 300, + hop_size: int = 20, + max_sil_kept: int = 5000, + ): + if not min_length >= min_interval >= hop_size: + raise ValueError( + "The following condition must be satisfied: min_length >= min_interval >= hop_size" + ) + if not max_sil_kept >= hop_size: + raise ValueError( + "The following condition must be satisfied: max_sil_kept >= hop_size" + ) + min_interval = sr * min_interval / 1000 + self.threshold = 10 ** (threshold / 20.0) + self.hop_size = round(sr * hop_size / 1000) + self.win_size = min(round(min_interval), 4 * self.hop_size) + self.min_length = round(sr * min_length / 1000 / self.hop_size) + self.min_interval = round(min_interval / self.hop_size) + self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) + + def _apply_slice(self, waveform, begin, end): + if len(waveform.shape) > 1: + return waveform[ + :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) + ] + else: + return waveform[ + begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) + ] + + # @timeit + def slice(self, waveform): + if len(waveform.shape) > 1: + samples = waveform.mean(axis=0) + else: + samples = waveform + if samples.shape[0] <= self.min_length: + return [waveform] + rms_list = get_rms( + y=samples, frame_length=self.win_size, hop_length=self.hop_size + ).squeeze(0) + sil_tags = [] + silence_start = None + clip_start = 0 + for i, rms in enumerate(rms_list): + # Keep looping while frame is silent. + if rms < self.threshold: + # Record start of silent frames. + if silence_start is None: + silence_start = i + continue + # Keep looping while frame is not silent and silence start has not been recorded. 
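+            # Note: the rest of this iteration decides what to do with the silent run that
+            # just ended, if any. silence_start, clip_start and the pos_* values are
+            # RMS-frame indices into rms_list; _apply_slice() converts them back to
+            # samples by multiplying with hop_size.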
+ if silence_start is None: + continue + # Clear recorded silence start if interval is not enough or clip is too short + is_leading_silence = silence_start == 0 and i > self.max_sil_kept + need_slice_middle = ( + i - silence_start >= self.min_interval + and i - clip_start >= self.min_length + ) + if not is_leading_silence and not need_slice_middle: + silence_start = None + continue + # Need slicing. Record the range of silent frames to be removed. + if i - silence_start <= self.max_sil_kept: + pos = rms_list[silence_start : i + 1].argmin() + silence_start + if silence_start == 0: + sil_tags.append((0, pos)) + else: + sil_tags.append((pos, pos)) + clip_start = pos + elif i - silence_start <= self.max_sil_kept * 2: + pos = rms_list[ + i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 + ].argmin() + pos += i - self.max_sil_kept + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + clip_start = pos_r + else: + sil_tags.append((min(pos_l, pos), max(pos_r, pos))) + clip_start = max(pos_r, pos) + else: + pos_l = ( + rms_list[ + silence_start : silence_start + self.max_sil_kept + 1 + ].argmin() + + silence_start + ) + pos_r = ( + rms_list[i - self.max_sil_kept : i + 1].argmin() + + i + - self.max_sil_kept + ) + if silence_start == 0: + sil_tags.append((0, pos_r)) + else: + sil_tags.append((pos_l, pos_r)) + clip_start = pos_r + silence_start = None + # Deal with trailing silence. + total_frames = rms_list.shape[0] + if ( + silence_start is not None + and total_frames - silence_start >= self.min_interval + ): + silence_end = min(total_frames, silence_start + self.max_sil_kept) + pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start + sil_tags.append((pos, total_frames + 1)) + # Apply and return slices. 
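+        # sil_tags holds (start, end) frame pairs of silence to drop; each kept chunk is
+        # the audio between one tag's end and the next tag's start, so audio with no
+        # detected silence is returned as a single untouched waveform.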
+ if len(sil_tags) == 0: + return [waveform] + else: + chunks = [] + if sil_tags[0][0] > 0: + chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) + for i in range(len(sil_tags) - 1): + chunks.append( + self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) + ) + if sil_tags[-1][1] < total_frames: + chunks.append( + self._apply_slice(waveform, sil_tags[-1][1], total_frames) + ) + return chunks + + +def main(): + import os.path + from argparse import ArgumentParser + + import librosa + import soundfile + + parser = ArgumentParser() + parser.add_argument("audio", type=str, help="The audio to be sliced") + parser.add_argument( + "--out", type=str, help="Output directory of the sliced audio clips" + ) + parser.add_argument( + "--db_thresh", + type=float, + required=False, + default=-40, + help="The dB threshold for silence detection", + ) + parser.add_argument( + "--min_length", + type=int, + required=False, + default=5000, + help="The minimum milliseconds required for each sliced audio clip", + ) + parser.add_argument( + "--min_interval", + type=int, + required=False, + default=300, + help="The minimum milliseconds for a silence part to be sliced", + ) + parser.add_argument( + "--hop_size", + type=int, + required=False, + default=10, + help="Frame length in milliseconds", + ) + parser.add_argument( + "--max_sil_kept", + type=int, + required=False, + default=500, + help="The maximum silence length kept around the sliced clip, presented in milliseconds", + ) + args = parser.parse_args() + out = args.out + if out is None: + out = os.path.dirname(os.path.abspath(args.audio)) + audio, sr = librosa.load(args.audio, sr=None, mono=False) + slicer = Slicer( + sr=sr, + threshold=args.db_thresh, + min_length=args.min_length, + min_interval=args.min_interval, + hop_size=args.hop_size, + max_sil_kept=args.max_sil_kept, + ) + chunks = slicer.slice(audio) + if not os.path.exists(out): + os.makedirs(out) + for i, chunk in enumerate(chunks): + if len(chunk.shape) > 1: + chunk = chunk.T + soundfile.write( + os.path.join( + out, + f"%s_%d.wav" + % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), + ), + chunk, + sr, + ) + + +if __name__ == "__main__": + main() diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py new file mode 100644 index 0000000..7793f15 --- /dev/null +++ b/infer/lib/train/data_utils.py @@ -0,0 +1,512 @@ +import os, traceback +import numpy as np +import torch +import torch.utils.data + +from infer.lib.train.mel_processing import spectrogram_torch +from infer.lib.train.utils import load_wav_to_torch, load_filepaths_and_text + + +class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + pitch = audiopath_and_text[2] + pitchf = audiopath_and_text[3] + dv = audiopath_and_text[4] + + phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + # print(123,phone.shape,pitch.shape,spec.shape) + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + # amor + len_wav = len_min * self.hop_length + + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + + phone = phone[:len_min, :] + pitch = pitch[:len_min] + pitchf = pitchf[:len_min] + + return (spec, wav, phone, pitch, pitchf, dv) + + def get_labels(self, phone, pitch, pitchf): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + pitch = np.load(pitch) + pitchf = np.load(pitchf) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + # print(234,phone.shape,pitch.shape) + phone = phone[:n_num, :] + pitch = pitch[:n_num] + pitchf = pitchf[:n_num] + phone = torch.FloatTensor(phone) + pitch = torch.LongTensor(pitch) + pitchf = torch.FloatTensor(pitchf) + return phone, pitch, pitchf + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, 
_use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollateMultiNSFsid: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: [text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) # (spec, wav, phone, pitch) + pitch_padded = torch.LongTensor(len(batch), max_phone_len) + pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) + phone_padded.zero_() + pitch_padded.zero_() + pitchf_padded.zero_() + # dv = torch.FloatTensor(len(batch), 256)#gin=256 + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + pitch = row[3] + pitch_padded[i, : pitch.size(0)] = pitch + pitchf = row[4] + pitchf_padded[i, : pitchf.size(0)] = pitchf + + # dv[i] = row[5] + sid[i] = row[5] + + return ( + phone_padded, + phone_lengths, + pitch_padded, + pitchf_padded, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + # dv + sid, + ) + + +class TextAudioLoader(torch.utils.data.Dataset): + """ + 1) loads audio, text pairs + 2) normalizes text and converts them to sequences of integers + 3) computes spectrograms from audio files. 
+ """ + + def __init__(self, audiopaths_and_text, hparams): + self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) + self.max_wav_value = hparams.max_wav_value + self.sampling_rate = hparams.sampling_rate + self.filter_length = hparams.filter_length + self.hop_length = hparams.hop_length + self.win_length = hparams.win_length + self.sampling_rate = hparams.sampling_rate + self.min_text_len = getattr(hparams, "min_text_len", 1) + self.max_text_len = getattr(hparams, "max_text_len", 5000) + self._filter() + + def _filter(self): + """ + Filter text & store spec lengths + """ + # Store spectrogram lengths for Bucketing + # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) + # spec_length = wav_length // hop_length + audiopaths_and_text_new = [] + lengths = [] + for audiopath, text, dv in self.audiopaths_and_text: + if self.min_text_len <= len(text) and len(text) <= self.max_text_len: + audiopaths_and_text_new.append([audiopath, text, dv]) + lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) + self.audiopaths_and_text = audiopaths_and_text_new + self.lengths = lengths + + def get_sid(self, sid): + sid = torch.LongTensor([int(sid)]) + return sid + + def get_audio_text_pair(self, audiopath_and_text): + # separate filename and text + file = audiopath_and_text[0] + phone = audiopath_and_text[1] + dv = audiopath_and_text[2] + + phone = self.get_labels(phone) + spec, wav = self.get_audio(file) + dv = self.get_sid(dv) + + len_phone = phone.size()[0] + len_spec = spec.size()[-1] + if len_phone != len_spec: + len_min = min(len_phone, len_spec) + len_wav = len_min * self.hop_length + spec = spec[:, :len_min] + wav = wav[:, :len_wav] + phone = phone[:len_min, :] + return (spec, wav, phone, dv) + + def get_labels(self, phone): + phone = np.load(phone) + phone = np.repeat(phone, 2, axis=0) + n_num = min(phone.shape[0], 900) # DistributedBucketSampler + phone = phone[:n_num, :] + phone = torch.FloatTensor(phone) + return phone + + def get_audio(self, filename): + audio, sampling_rate = load_wav_to_torch(filename) + if sampling_rate != self.sampling_rate: + raise ValueError( + "{} SR doesn't match target {} SR".format( + sampling_rate, self.sampling_rate + ) + ) + audio_norm = audio + # audio_norm = audio / self.max_wav_value + # audio_norm = audio / np.abs(audio).max() + + audio_norm = audio_norm.unsqueeze(0) + spec_filename = filename.replace(".wav", ".spec.pt") + if os.path.exists(spec_filename): + try: + spec = torch.load(spec_filename) + except: + print(spec_filename, traceback.format_exc()) + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + else: + spec = spectrogram_torch( + audio_norm, + self.filter_length, + self.sampling_rate, + self.hop_length, + self.win_length, + center=False, + ) + spec = torch.squeeze(spec, 0) + torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) + return spec, audio_norm + + def __getitem__(self, index): + return self.get_audio_text_pair(self.audiopaths_and_text[index]) + + def __len__(self): + return len(self.audiopaths_and_text) + + +class TextAudioCollate: + """Zero-pads model inputs and targets""" + + def __init__(self, return_ids=False): + self.return_ids = return_ids + + def __call__(self, batch): + """Collate's training batch from normalized text and aduio + PARAMS + ------ + batch: 
[text_normalized, spec_normalized, wav_normalized] + """ + # Right zero-pad all one-hot text sequences to max input length + _, ids_sorted_decreasing = torch.sort( + torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True + ) + + max_spec_len = max([x[0].size(1) for x in batch]) + max_wave_len = max([x[1].size(1) for x in batch]) + spec_lengths = torch.LongTensor(len(batch)) + wave_lengths = torch.LongTensor(len(batch)) + spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) + wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) + spec_padded.zero_() + wave_padded.zero_() + + max_phone_len = max([x[2].size(0) for x in batch]) + phone_lengths = torch.LongTensor(len(batch)) + phone_padded = torch.FloatTensor( + len(batch), max_phone_len, batch[0][2].shape[1] + ) + phone_padded.zero_() + sid = torch.LongTensor(len(batch)) + + for i in range(len(ids_sorted_decreasing)): + row = batch[ids_sorted_decreasing[i]] + + spec = row[0] + spec_padded[i, :, : spec.size(1)] = spec + spec_lengths[i] = spec.size(1) + + wave = row[1] + wave_padded[i, :, : wave.size(1)] = wave + wave_lengths[i] = wave.size(1) + + phone = row[2] + phone_padded[i, : phone.size(0), :] = phone + phone_lengths[i] = phone.size(0) + + sid[i] = row[3] + + return ( + phone_padded, + phone_lengths, + spec_padded, + spec_lengths, + wave_padded, + wave_lengths, + sid, + ) + + +class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): + """ + Maintain similar input lengths in a batch. + Length groups are specified by boundaries. + Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. + + It removes samples which are not included in the boundaries. + Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
+ """ + + def __init__( + self, + dataset, + batch_size, + boundaries, + num_replicas=None, + rank=None, + shuffle=True, + ): + super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) + self.lengths = dataset.lengths + self.batch_size = batch_size + self.boundaries = boundaries + + self.buckets, self.num_samples_per_bucket = self._create_buckets() + self.total_size = sum(self.num_samples_per_bucket) + self.num_samples = self.total_size // self.num_replicas + + def _create_buckets(self): + buckets = [[] for _ in range(len(self.boundaries) - 1)] + for i in range(len(self.lengths)): + length = self.lengths[i] + idx_bucket = self._bisect(length) + if idx_bucket != -1: + buckets[idx_bucket].append(i) + + for i in range(len(buckets) - 1, -1, -1): # + if len(buckets[i]) == 0: + buckets.pop(i) + self.boundaries.pop(i + 1) + + num_samples_per_bucket = [] + for i in range(len(buckets)): + len_bucket = len(buckets[i]) + total_batch_size = self.num_replicas * self.batch_size + rem = ( + total_batch_size - (len_bucket % total_batch_size) + ) % total_batch_size + num_samples_per_bucket.append(len_bucket + rem) + return buckets, num_samples_per_bucket + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + + indices = [] + if self.shuffle: + for bucket in self.buckets: + indices.append(torch.randperm(len(bucket), generator=g).tolist()) + else: + for bucket in self.buckets: + indices.append(list(range(len(bucket)))) + + batches = [] + for i in range(len(self.buckets)): + bucket = self.buckets[i] + len_bucket = len(bucket) + ids_bucket = indices[i] + num_samples_bucket = self.num_samples_per_bucket[i] + + # add extra samples to make it evenly divisible + rem = num_samples_bucket - len_bucket + ids_bucket = ( + ids_bucket + + ids_bucket * (rem // len_bucket) + + ids_bucket[: (rem % len_bucket)] + ) + + # subsample + ids_bucket = ids_bucket[self.rank :: self.num_replicas] + + # batching + for j in range(len(ids_bucket) // self.batch_size): + batch = [ + bucket[idx] + for idx in ids_bucket[ + j * self.batch_size : (j + 1) * self.batch_size + ] + ] + batches.append(batch) + + if self.shuffle: + batch_ids = torch.randperm(len(batches), generator=g).tolist() + batches = [batches[i] for i in batch_ids] + self.batches = batches + + assert len(self.batches) * self.batch_size == self.num_samples + return iter(self.batches) + + def _bisect(self, x, lo=0, hi=None): + if hi is None: + hi = len(self.boundaries) - 1 + + if hi > lo: + mid = (hi + lo) // 2 + if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: + return mid + elif x <= self.boundaries[mid]: + return self._bisect(x, lo, mid) + else: + return self._bisect(x, mid + 1, hi) + else: + return -1 + + def __len__(self): + return self.num_samples // self.batch_size diff --git a/infer/lib/train/losses.py b/infer/lib/train/losses.py new file mode 100644 index 0000000..aa7bd81 --- /dev/null +++ b/infer/lib/train/losses.py @@ -0,0 +1,58 @@ +import torch + + +def feature_loss(fmap_r, fmap_g): + loss = 0 + for dr, dg in zip(fmap_r, fmap_g): + for rl, gl in zip(dr, dg): + rl = rl.float().detach() + gl = gl.float() + loss += torch.mean(torch.abs(rl - gl)) + + return loss * 2 + + +def discriminator_loss(disc_real_outputs, disc_generated_outputs): + loss = 0 + r_losses = [] + g_losses = [] + for dr, dg in zip(disc_real_outputs, disc_generated_outputs): + dr = dr.float() + dg = dg.float() + r_loss = torch.mean((1 - dr) ** 2) + g_loss = torch.mean(dg**2) + loss += r_loss + 
g_loss + r_losses.append(r_loss.item()) + g_losses.append(g_loss.item()) + + return loss, r_losses, g_losses + + +def generator_loss(disc_outputs): + loss = 0 + gen_losses = [] + for dg in disc_outputs: + dg = dg.float() + l = torch.mean((1 - dg) ** 2) + gen_losses.append(l) + loss += l + + return loss, gen_losses + + +def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): + """ + z_p, logs_q: [b, h, t_t] + m_p, logs_p: [b, h, t_t] + """ + z_p = z_p.float() + logs_q = logs_q.float() + m_p = m_p.float() + logs_p = logs_p.float() + z_mask = z_mask.float() + + kl = logs_p - logs_q - 0.5 + kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) + kl = torch.sum(kl * z_mask) + l = kl / torch.sum(z_mask) + return l diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py new file mode 100644 index 0000000..3cc3687 --- /dev/null +++ b/infer/lib/train/mel_processing.py @@ -0,0 +1,130 @@ +import torch +import torch.utils.data +from librosa.filters import mel as librosa_mel_fn + + +MAX_WAV_VALUE = 32768.0 + + +def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): + """ + PARAMS + ------ + C: compression factor + """ + return torch.log(torch.clamp(x, min=clip_val) * C) + + +def dynamic_range_decompression_torch(x, C=1): + """ + PARAMS + ------ + C: compression factor used to compress + """ + return torch.exp(x) / C + + +def spectral_normalize_torch(magnitudes): + return dynamic_range_compression_torch(magnitudes) + + +def spectral_de_normalize_torch(magnitudes): + return dynamic_range_decompression_torch(magnitudes) + + +# Reusable banks +mel_basis = {} +hann_window = {} + + +def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): + """Convert waveform into Linear-frequency Linear-amplitude spectrogram. + + Args: + y :: (B, T) - Audio waveforms + n_fft + sampling_rate + hop_size + win_size + center + Returns: + :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram + """ + # Validation + if torch.min(y) < -1.07: + print("min value is ", torch.min(y)) + if torch.max(y) > 1.07: + print("max value is ", torch.max(y)) + + # Window - Cache if needed + global hann_window + dtype_device = str(y.dtype) + "_" + str(y.device) + wnsize_dtype_device = str(win_size) + "_" + dtype_device + if wnsize_dtype_device not in hann_window: + hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( + dtype=y.dtype, device=y.device + ) + + # Padding + y = torch.nn.functional.pad( + y.unsqueeze(1), + (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), + mode="reflect", + ) + y = y.squeeze(1) + + # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) + spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window[wnsize_dtype_device], + center=center, + pad_mode="reflect", + normalized=False, + onesided=True, + return_complex=False, + ) + + # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) + spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) + return spec + + +def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): + # MelBasis - Cache if needed + global mel_basis + dtype_device = str(spec.dtype) + "_" + str(spec.device) + fmax_dtype_device = str(fmax) + "_" + dtype_device + if fmax_dtype_device not in mel_basis: + mel = librosa_mel_fn( + sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax + ) + mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( + dtype=spec.dtype, device=spec.device + ) + + # Mel-frequency 
Log-amplitude spectrogram :: (B, Freq=num_mels, Frame) + melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) + melspec = spectral_normalize_torch(melspec) + return melspec + + +def mel_spectrogram_torch( + y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False +): + """Convert waveform into Mel-frequency Log-amplitude spectrogram. + + Args: + y :: (B, T) - Waveforms + Returns: + melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram + """ + # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) + spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) + + # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) + melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) + + return melspec diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py new file mode 100644 index 0000000..a48ca61 --- /dev/null +++ b/infer/lib/train/process_ckpt.py @@ -0,0 +1,259 @@ +import torch, traceback, os, sys + +now_dir = os.getcwd() +sys.path.append(now_dir) +from collections import OrderedDict +from i18n.i18n import I18nAuto + +i18n = I18nAuto() + + +def savee(ckpt, sr, if_f0, name, epoch, version, hps, i18n): + try: + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + opt["config"] = [ + hps.data.filter_length // 2 + 1, + 32, + hps.model.inter_channels, + hps.model.hidden_channels, + hps.model.filter_channels, + hps.model.n_heads, + hps.model.n_layers, + hps.model.kernel_size, + hps.model.p_dropout, + hps.model.resblock, + hps.model.resblock_kernel_sizes, + hps.model.resblock_dilation_sizes, + hps.model.upsample_rates, + hps.model.upsample_initial_channel, + hps.model.upsample_kernel_sizes, + hps.model.spk_embed_dim, + hps.model.gin_channels, + hps.data.sampling_rate, + ] + opt["info"] = "%sepoch" % epoch + opt["sr"] = sr + opt["f0"] = if_f0 + opt["version"] = version + torch.save(opt, "weights/%s.pth" % name) + return "Success." 
+ except: + return traceback.format_exc() + + +def show_info(path): + try: + a = torch.load(path, map_location="cpu") + return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( + a.get("info", "None"), + a.get("sr", "None"), + a.get("f0", "None"), + a.get("version", "None"), + ) + except: + return traceback.format_exc() + + +def extract_small_model(path, name, sr, if_f0, info, version): + try: + ckpt = torch.load(path, map_location="cpu") + if "model" in ckpt: + ckpt = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in ckpt.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = ckpt[key].half() + if sr == "40k": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 10, 2, 2], + 512, + [16, 16, 4, 4], + 109, + 256, + 40000, + ] + elif sr == "48k": + if version == "v1": + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 6, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 48000, + ] + else: + opt["config"] = [ + 1025, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [12, 10, 2, 2], + 512, + [24, 20, 4, 4], + 109, + 256, + 48000, + ] + elif sr == "32k": + if version == "v1": + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 4, 2, 2, 2], + 512, + [16, 16, 4, 4, 4], + 109, + 256, + 32000, + ] + else: + opt["config"] = [ + 513, + 32, + 192, + 192, + 768, + 2, + 6, + 3, + 0, + "1", + [3, 7, 11], + [[1, 3, 5], [1, 3, 5], [1, 3, 5]], + [10, 8, 2, 2], + 512, + [20, 16, 4, 4], + 109, + 256, + 32000, + ] + if info == "": + info = "Extracted model." + opt["info"] = info + opt["version"] = version + opt["sr"] = sr + opt["f0"] = int(if_f0) + torch.save(opt, "weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() + + +def change_info(path, info, name): + try: + ckpt = torch.load(path, map_location="cpu") + ckpt["info"] = info + if name == "": + name = os.path.basename(path) + torch.save(ckpt, "weights/%s" % name) + return "Success." + except: + return traceback.format_exc() + + +def merge(path1, path2, alpha1, sr, f0, info, name, version): + try: + + def extract(ckpt): + a = ckpt["model"] + opt = OrderedDict() + opt["weight"] = {} + for key in a.keys(): + if "enc_q" in key: + continue + opt["weight"][key] = a[key] + return opt + + ckpt1 = torch.load(path1, map_location="cpu") + ckpt2 = torch.load(path2, map_location="cpu") + cfg = ckpt1["config"] + if "model" in ckpt1: + ckpt1 = extract(ckpt1) + else: + ckpt1 = ckpt1["weight"] + if "model" in ckpt2: + ckpt2 = extract(ckpt2) + else: + ckpt2 = ckpt2["weight"] + if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): + return "Fail to merge the models. The model architectures are not the same." 
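+        # The loop below linearly interpolates the two checkpoints,
+        # alpha1 * ckpt1 + (1 - alpha1) * ckpt2, and stores the result in float16;
+        # emb_g.weight is truncated to the smaller speaker count when the two models
+        # disagree on spk_embed_dim.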
+ opt = OrderedDict() + opt["weight"] = {} + for key in ckpt1.keys(): + # try: + if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: + min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) + opt["weight"][key] = ( + alpha1 * (ckpt1[key][:min_shape0].float()) + + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) + ).half() + else: + opt["weight"][key] = ( + alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) + ).half() + # except: + # pdb.set_trace() + opt["config"] = cfg + """ + if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] + elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] + elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] + """ + opt["sr"] = sr + opt["f0"] = 1 if f0 == i18n("是") else 0 + opt["version"] = version + opt["info"] = info + torch.save(opt, "weights/%s.pth" % name) + return "Success." + except: + return traceback.format_exc() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py new file mode 100644 index 0000000..9c0fb5c --- /dev/null +++ b/infer/lib/train/utils.py @@ -0,0 +1,487 @@ +import os, traceback +import glob +import sys +import argparse +import logging +import json +import subprocess +import numpy as np +from scipy.io.wavfile import read +import torch + +MATPLOTLIB_FLAG = False + +logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) +logger = logging + + +def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + ################## + def go(model, bkey): + saved_state_dict = checkpoint_dict[bkey] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + "shape-%s-mismatch|need-%s|get-%s" + % (k, state_dict[k].shape, saved_state_dict[k].shape) + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + return model + + go(combd, "combd") + model = go(sbd, "sbd") + ############# + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +# def load_checkpoint(checkpoint_path, model, optimizer=None): +# assert os.path.isfile(checkpoint_path) +# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') +# iteration = checkpoint_dict['iteration'] +# learning_rate = 
checkpoint_dict['learning_rate'] +# if optimizer is not None: +# optimizer.load_state_dict(checkpoint_dict['optimizer']) +# # print(1111) +# saved_state_dict = checkpoint_dict['model'] +# # print(1111) +# +# if hasattr(model, 'module'): +# state_dict = model.module.state_dict() +# else: +# state_dict = model.state_dict() +# new_state_dict= {} +# for k, v in state_dict.items(): +# try: +# new_state_dict[k] = saved_state_dict[k] +# except: +# logger.info("%s is not in the checkpoint" % k) +# new_state_dict[k] = v +# if hasattr(model, 'module'): +# model.module.load_state_dict(new_state_dict) +# else: +# model.load_state_dict(new_state_dict) +# logger.info("Loaded checkpoint '{}' (epoch {})" .format( +# checkpoint_path, iteration)) +# return model, optimizer, learning_rate, iteration +def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): + assert os.path.isfile(checkpoint_path) + checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") + + saved_state_dict = checkpoint_dict["model"] + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + new_state_dict = {} + for k, v in state_dict.items(): # 模型需要的shape + try: + new_state_dict[k] = saved_state_dict[k] + if saved_state_dict[k].shape != state_dict[k].shape: + print( + "shape-%s-mismatch|need-%s|get-%s" + % (k, state_dict[k].shape, saved_state_dict[k].shape) + ) # + raise KeyError + except: + # logger.info(traceback.format_exc()) + logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 + new_state_dict[k] = v # 模型自带的随机值 + if hasattr(model, "module"): + model.module.load_state_dict(new_state_dict, strict=False) + else: + model.load_state_dict(new_state_dict, strict=False) + logger.info("Loaded model weights") + + iteration = checkpoint_dict["iteration"] + learning_rate = checkpoint_dict["learning_rate"] + if ( + optimizer is not None and load_opt == 1 + ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch + # try: + optimizer.load_state_dict(checkpoint_dict["optimizer"]) + # except: + # traceback.print_exc() + logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) + return model, optimizer, learning_rate, iteration + + +def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(model, "module"): + state_dict = model.module.state_dict() + else: + state_dict = model.state_dict() + torch.save( + { + "model": state_dict, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): + logger.info( + "Saving model and optimizer state at epoch {} to {}".format( + iteration, checkpoint_path + ) + ) + if hasattr(combd, "module"): + state_dict_combd = combd.module.state_dict() + else: + state_dict_combd = combd.state_dict() + if hasattr(sbd, "module"): + state_dict_sbd = sbd.module.state_dict() + else: + state_dict_sbd = sbd.state_dict() + torch.save( + { + "combd": state_dict_combd, + "sbd": state_dict_sbd, + "iteration": iteration, + "optimizer": optimizer.state_dict(), + "learning_rate": learning_rate, + }, + checkpoint_path, + ) + + +def summarize( + writer, + global_step, + scalars={}, + histograms={}, + images={}, + audios={}, + audio_sampling_rate=22050, +): + for k, v in scalars.items(): + writer.add_scalar(k, v, 
global_step) + for k, v in histograms.items(): + writer.add_histogram(k, v, global_step) + for k, v in images.items(): + writer.add_image(k, v, global_step, dataformats="HWC") + for k, v in audios.items(): + writer.add_audio(k, v, global_step, audio_sampling_rate) + + +def latest_checkpoint_path(dir_path, regex="G_*.pth"): + f_list = glob.glob(os.path.join(dir_path, regex)) + f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) + x = f_list[-1] + print(x) + return x + + +def plot_spectrogram_to_numpy(spectrogram): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(10, 2)) + im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") + plt.colorbar(im, ax=ax) + plt.xlabel("Frames") + plt.ylabel("Channels") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def plot_alignment_to_numpy(alignment, info=None): + global MATPLOTLIB_FLAG + if not MATPLOTLIB_FLAG: + import matplotlib + + matplotlib.use("Agg") + MATPLOTLIB_FLAG = True + mpl_logger = logging.getLogger("matplotlib") + mpl_logger.setLevel(logging.WARNING) + import matplotlib.pylab as plt + import numpy as np + + fig, ax = plt.subplots(figsize=(6, 4)) + im = ax.imshow( + alignment.transpose(), aspect="auto", origin="lower", interpolation="none" + ) + fig.colorbar(im, ax=ax) + xlabel = "Decoder timestep" + if info is not None: + xlabel += "\n\n" + info + plt.xlabel(xlabel) + plt.ylabel("Encoder timestep") + plt.tight_layout() + + fig.canvas.draw() + data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") + data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) + plt.close() + return data + + +def load_wav_to_torch(full_path): + sampling_rate, data = read(full_path) + return torch.FloatTensor(data.astype(np.float32)), sampling_rate + + +def load_filepaths_and_text(filename, split="|"): + with open(filename, encoding="utf-8") as f: + filepaths_and_text = [line.strip().split(split) for line in f] + return filepaths_and_text + + +def get_hparams(init=True): + """ + todo: + 结尾七人组: + 保存频率、总epoch done + bs done + pretrainG、pretrainD done + 卡号:os.en["CUDA_VISIBLE_DEVICES"] done + if_latest done + 模型:if_f0 done + 采样率:自动选择config done + 是否缓存数据集进GPU:if_cache_data_in_gpu done + + -m: + 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done + -c不要了 + """ + parser = argparse.ArgumentParser() + # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration') + parser.add_argument( + "-se", + "--save_every_epoch", + type=int, + required=True, + help="checkpoint save frequency (epoch)", + ) + parser.add_argument( + "-te", "--total_epoch", type=int, required=True, help="total_epoch" + ) + parser.add_argument( + "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path" + ) + parser.add_argument( + "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path" + ) + parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") + parser.add_argument( + "-bs", "--batch_size", type=int, required=True, help="batch size" + ) + parser.add_argument( + "-e", "--experiment_dir", 
type=str, required=True, help="experiment dir" + ) # -m + parser.add_argument( + "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" + ) + parser.add_argument( + "-sw", + "--save_every_weights", + type=str, + default="0", + help="save the extracted model in weights directory when saving checkpoints", + ) + parser.add_argument( + "-v", "--version", type=str, required=True, help="model version" + ) + parser.add_argument( + "-f0", + "--if_f0", + type=int, + required=True, + help="use f0 as one of the inputs of the model, 1 or 0", + ) + parser.add_argument( + "-l", + "--if_latest", + type=int, + required=True, + help="if only save the latest G/D pth file, 1 or 0", + ) + parser.add_argument( + "-c", + "--if_cache_data_in_gpu", + type=int, + required=True, + help="if caching the dataset in GPU memory, 1 or 0", + ) + + args = parser.parse_args() + name = args.experiment_dir + experiment_dir = os.path.join("./logs", args.experiment_dir) + + if not os.path.exists(experiment_dir): + os.makedirs(experiment_dir) + + if args.version == "v1" or args.sample_rate == "40k": + config_path = "configs/%s.json" % args.sample_rate + else: + config_path = "configs/%s_v2.json" % args.sample_rate + config_save_path = os.path.join(experiment_dir, "config.json") + if init: + with open(config_path, "r") as f: + data = f.read() + with open(config_save_path, "w") as f: + f.write(data) + else: + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = hparams.experiment_dir = experiment_dir + hparams.save_every_epoch = args.save_every_epoch + hparams.name = name + hparams.total_epoch = args.total_epoch + hparams.pretrainG = args.pretrainG + hparams.pretrainD = args.pretrainD + hparams.version = args.version + hparams.gpus = args.gpus + hparams.train.batch_size = args.batch_size + hparams.sample_rate = args.sample_rate + hparams.if_f0 = args.if_f0 + hparams.if_latest = args.if_latest + hparams.save_every_weights = args.save_every_weights + hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu + hparams.data.training_files = "%s/filelist.txt" % experiment_dir + return hparams + + +def get_hparams_from_dir(model_dir): + config_save_path = os.path.join(model_dir, "config.json") + with open(config_save_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + hparams.model_dir = model_dir + return hparams + + +def get_hparams_from_file(config_path): + with open(config_path, "r") as f: + data = f.read() + config = json.loads(data) + + hparams = HParams(**config) + return hparams + + +def check_git_hash(model_dir): + source_dir = os.path.dirname(os.path.realpath(__file__)) + if not os.path.exists(os.path.join(source_dir, ".git")): + logger.warn( + "{} is not a git repository, therefore hash value comparison will be ignored.".format( + source_dir + ) + ) + return + + cur_hash = subprocess.getoutput("git rev-parse HEAD") + + path = os.path.join(model_dir, "githash") + if os.path.exists(path): + saved_hash = open(path).read() + if saved_hash != cur_hash: + logger.warn( + "git hash values are different. 
{}(saved) != {}(current)".format( + saved_hash[:8], cur_hash[:8] + ) + ) + else: + open(path, "w").write(cur_hash) + + +def get_logger(model_dir, filename="train.log"): + global logger + logger = logging.getLogger(os.path.basename(model_dir)) + logger.setLevel(logging.DEBUG) + + formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") + if not os.path.exists(model_dir): + os.makedirs(model_dir) + h = logging.FileHandler(os.path.join(model_dir, filename)) + h.setLevel(logging.DEBUG) + h.setFormatter(formatter) + logger.addHandler(h) + return logger + + +class HParams: + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if type(v) == dict: + v = HParams(**v) + self[k] = v + + def keys(self): + return self.__dict__.keys() + + def items(self): + return self.__dict__.items() + + def values(self): + return self.__dict__.values() + + def __len__(self): + return len(self.__dict__) + + def __getitem__(self, key): + return getattr(self, key) + + def __setitem__(self, key, value): + return setattr(self, key, value) + + def __contains__(self, key): + return key in self.__dict__ + + def __repr__(self): + return self.__dict__.__repr__() From 92f18e2f81d1f2b6554b183dff6d335f7a6a7f35 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 20:01:09 +0900 Subject: [PATCH 12/65] somefix vc --- infer/modules/vc/modules.py | 2 ++ infer/modules/vc/utils.py | 20 -------------------- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index f1a96eb..5c2bc60 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -1,5 +1,6 @@ import traceback +import numpy as np import torch import soundfile as sf @@ -11,6 +12,7 @@ from infer.lib.infer_pack.models import ( ) from infer.modules.vc.pipeline import Pipeline from infer.modules.vc.utils import * +from infer.lib.audio import load_audio class VC: diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py index 933775f..bc98989 100644 --- a/infer/modules/vc/utils.py +++ b/infer/modules/vc/utils.py @@ -1,7 +1,5 @@ import os -import numpy as np -import ffmpeg from fairseq import checkpoint_utils @@ -34,21 +32,3 @@ def load_hubert(config): hubert_model = hubert_model.float() return hubert_model.eval() - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
- file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() From de0c1399c8d37f0e54ce233f0e0814f023d28797 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sat, 19 Aug 2023 11:01:49 +0000 Subject: [PATCH 13/65] Apply Code Formatter Change --- infer/lib/rmvpe.py | 142 ++++++++++++++++++++++++-------------- infer/modules/vc/utils.py | 1 - 2 files changed, 90 insertions(+), 53 deletions(-) diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index 25dcb8c..e5fa613 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,14 +1,23 @@ -import torch, numpy as np,pdb +import torch, numpy as np, pdb import torch.nn as nn import torch.nn.functional as F -import torch,pdb +import torch, pdb import numpy as np import torch.nn.functional as F from scipy.signal import get_window -from librosa.util import pad_center, tiny,normalize +from librosa.util import pad_center, tiny, normalize + + ###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py -def window_sumsquare(window, n_frames, hop_length=200, win_length=800, - n_fft=800, dtype=np.float32, norm=None): +def window_sumsquare( + window, + n_frames, + hop_length=200, + win_length=800, + n_fft=800, + dtype=np.float32, + norm=None, +): """ # from librosa 0.6 Compute the sum-square envelope of a window function at a given hop length. @@ -41,18 +50,20 @@ def window_sumsquare(window, n_frames, hop_length=200, win_length=800, # Compute the squared window at the desired length win_sq = get_window(window, win_length, fftbins=True) - win_sq = normalize(win_sq, norm=norm)**2 + win_sq = normalize(win_sq, norm=norm) ** 2 win_sq = pad_center(win_sq, n_fft) # Fill the envelope for i in range(n_frames): sample = i * hop_length - x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))] + x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] return x + class STFT(torch.nn.Module): - def __init__(self, filter_length=1024, hop_length=512, win_length=None, - window='hann'): + def __init__( + self, filter_length=1024, hop_length=512, win_length=None, window="hann" + ): """ This module implements an STFT using 1D convolution and 1D transpose convolutions. 
This is a bit tricky so there are some cases that probably won't work as working @@ -79,12 +90,15 @@ class STFT(torch.nn.Module): fourier_basis = np.fft.fft(np.eye(self.filter_length)) cutoff = int((self.filter_length / 2 + 1)) - fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),np.imag(fourier_basis[:cutoff, :])]) + fourier_basis = np.vstack( + [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] + ) forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) inverse_basis = torch.FloatTensor( - np.linalg.pinv(scale * fourier_basis).T[:, None, :]) + np.linalg.pinv(scale * fourier_basis).T[:, None, :] + ) - assert (filter_length >= self.win_length) + assert filter_length >= self.win_length # get window and zero center pad it to filter_length fft_window = get_window(window, self.win_length, fftbins=True) fft_window = pad_center(fft_window, size=filter_length) @@ -94,8 +108,8 @@ class STFT(torch.nn.Module): forward_basis *= fft_window inverse_basis *= fft_window - self.register_buffer('forward_basis', forward_basis.float()) - self.register_buffer('inverse_basis', inverse_basis.float()) + self.register_buffer("forward_basis", forward_basis.float()) + self.register_buffer("inverse_basis", inverse_basis.float()) def transform(self, input_data): """Take input data (audio) to STFT domain. @@ -117,23 +131,25 @@ class STFT(torch.nn.Module): # similar to librosa, reflect-pad the input input_data = input_data.view(num_batches, 1, num_samples) # print(1234,input_data.shape) - input_data = F.pad(input_data.unsqueeze(1),(self.pad_amount, self.pad_amount, 0, 0,0,0),mode='reflect').squeeze(1) + input_data = F.pad( + input_data.unsqueeze(1), + (self.pad_amount, self.pad_amount, 0, 0, 0, 0), + mode="reflect", + ).squeeze(1) # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length) # pdb.set_trace() forward_transform = F.conv1d( - input_data, - self.forward_basis, - stride=self.hop_length, - padding=0) + input_data, self.forward_basis, stride=self.hop_length, padding=0 + ) cutoff = int((self.filter_length / 2) + 1) real_part = forward_transform[:, :cutoff, :] imag_part = forward_transform[:, cutoff:, :] - magnitude = torch.sqrt(real_part ** 2 + imag_part ** 2) + magnitude = torch.sqrt(real_part**2 + imag_part**2) # phase = torch.atan2(imag_part.data, real_part.data) - return magnitude#, phase + return magnitude # , phase def inverse(self, magnitude, phase): """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced @@ -150,30 +166,39 @@ class STFT(torch.nn.Module): shape (num_batch, num_samples) """ recombine_magnitude_phase = torch.cat( - [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1) + [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 + ) inverse_transform = F.conv_transpose1d( recombine_magnitude_phase, self.inverse_basis, stride=self.hop_length, - padding=0) + padding=0, + ) if self.window is not None: window_sum = window_sumsquare( - self.window, magnitude.size(-1), hop_length=self.hop_length, - win_length=self.win_length, n_fft=self.filter_length, - dtype=np.float32) + self.window, + magnitude.size(-1), + hop_length=self.hop_length, + win_length=self.win_length, + n_fft=self.filter_length, + dtype=np.float32, + ) # remove modulation effects approx_nonzero_indices = torch.from_numpy( - np.where(window_sum > tiny(window_sum))[0]) + np.where(window_sum > tiny(window_sum))[0] + ) window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) - inverse_transform[:, :, approx_nonzero_indices] /= 
window_sum[approx_nonzero_indices] + inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ + approx_nonzero_indices + ] # scale by hop ratio inverse_transform *= float(self.filter_length) / self.hop_length - inverse_transform = inverse_transform[..., self.pad_amount:] - inverse_transform = inverse_transform[..., :self.num_samples] + inverse_transform = inverse_transform[..., self.pad_amount :] + inverse_transform = inverse_transform[..., : self.num_samples] inverse_transform = inverse_transform.squeeze(1) return inverse_transform @@ -191,7 +216,11 @@ class STFT(torch.nn.Module): self.magnitude, self.phase = self.transform(input_data) reconstruction = self.inverse(self.magnitude, self.phase) return reconstruction + + from time import time as ttime + + class BiGRU(nn.Module): def __init__(self, input_features, hidden_features, num_layers): super(BiGRU, self).__init__() @@ -509,14 +538,14 @@ class MelSpectrogram(torch.nn.Module): # print(1111111111) # print(222222222222222,audio.device,self.is_half) if hasattr(self, "stft") == False: - # print(n_fft_new,hop_length_new,win_length_new,audio.shape) - self.stft=STFT( + # print(n_fft_new,hop_length_new,win_length_new,audio.shape) + self.stft = STFT( filter_length=n_fft_new, hop_length=hop_length_new, win_length=win_length_new, - window='hann' + window="hann", ).to(audio.device) - magnitude = self.stft.transform(audio)#phase + magnitude = self.stft.transform(audio) # phase # if (audio.device.type == "privateuseone"): # magnitude=magnitude.to(audio.device) if keyshift != 0: @@ -544,10 +573,13 @@ class RMVPE: self.mel_extractor = MelSpectrogram( is_half, 128, 16000, 1024, 160, None, 30, 8000 ).to(device) - if ("privateuseone" in str(device)): + if "privateuseone" in str(device): import onnxruntime as ort - ort_session = ort.InferenceSession("rmvpe.onnx", providers=["DmlExecutionProvider"]) - self.model=ort_session + + ort_session = ort.InferenceSession( + "rmvpe.onnx", providers=["DmlExecutionProvider"] + ) + self.model = ort_session else: model = E2E(4, 1, (2, 2)) ckpt = torch.load(model_path, map_location="cpu") @@ -566,10 +598,13 @@ class RMVPE: mel = F.pad( mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" ) - if("privateuseone" in str(self.device) ): + if "privateuseone" in str(self.device): onnx_input_name = self.model.get_inputs()[0].name onnx_outputs_names = self.model.get_outputs()[0].name - hidden = self.model.run([onnx_outputs_names], input_feed={onnx_input_name: mel.cpu().numpy()})[0] + hidden = self.model.run( + [onnx_outputs_names], + input_feed={onnx_input_name: mel.cpu().numpy()}, + )[0] else: hidden = self.model(mel) return hidden[:, :n_frames] @@ -583,25 +618,27 @@ class RMVPE: def infer_from_audio(self, audio, thred=0.03): # torch.cuda.synchronize() - t0=ttime() - mel = self.mel_extractor(torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True) + t0 = ttime() + mel = self.mel_extractor( + torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True + ) # print(123123123,mel.device.type) # torch.cuda.synchronize() - t1=ttime() + t1 = ttime() hidden = self.mel2hidden(mel) # torch.cuda.synchronize() - t2=ttime() + t2 = ttime() # print(234234,hidden.device.type) - if("privateuseone" not in str(self.device)): + if "privateuseone" not in str(self.device): hidden = hidden.squeeze(0).cpu().numpy() else: - hidden=hidden[0] + hidden = hidden[0] if self.is_half == True: hidden = hidden.astype("float32") f0 = self.decode(hidden, thred=thred) # torch.cuda.synchronize() - t3=ttime() + t3 
= ttime() # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) return f0 @@ -632,8 +669,9 @@ class RMVPE: return devided -if __name__ == '__main__': +if __name__ == "__main__": import soundfile as sf, librosa + audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") if len(audio.shape) > 1: audio = librosa.to_mono(audio.transpose(1, 0)) @@ -642,13 +680,13 @@ if __name__ == '__main__': audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) model_path = r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" thred = 0.03 # 0.01 - device = 'cuda' if torch.cuda.is_available() else 'cpu' - rmvpe = RMVPE(model_path,is_half=False, device=device) - t0=ttime() + device = "cuda" if torch.cuda.is_available() else "cpu" + rmvpe = RMVPE(model_path, is_half=False, device=device) + t0 = ttime() f0 = rmvpe.infer_from_audio(audio, thred=thred) # f0 = rmvpe.infer_from_audio(audio, thred=thred) # f0 = rmvpe.infer_from_audio(audio, thred=thred) # f0 = rmvpe.infer_from_audio(audio, thred=thred) # f0 = rmvpe.infer_from_audio(audio, thred=thred) - t1=ttime() - print(f0.shape,t1-t0) + t1 = ttime() + print(f0.shape, t1 - t0) diff --git a/infer/modules/vc/utils.py b/infer/modules/vc/utils.py index bc98989..98497e2 100644 --- a/infer/modules/vc/utils.py +++ b/infer/modules/vc/utils.py @@ -31,4 +31,3 @@ def load_hubert(config): else: hubert_model = hubert_model.float() return hubert_model.eval() - From a5e6dfb41774ca58a6388fbc0b56f0576b0c7f59 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 20:02:58 +0900 Subject: [PATCH 14/65] onnx module --- infer/modules/onnx/export.py | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 infer/modules/onnx/export.py diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py new file mode 100644 index 0000000..e80384c --- /dev/null +++ b/infer/modules/onnx/export.py @@ -0,0 +1,53 @@ +import torch + +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM + + +def export_onnx(ModelPath, ExportedPath): + global cpt + cpt = torch.load(ModelPath, map_location="cpu") + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 + + test_phone = torch.rand(1, 200, vec_channels) # hidden unit + test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) + test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) + test_pitchf = torch.rand(1, 200) # nsf基频 + test_ds = torch.LongTensor([0]) # 说话人ID + test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) + + device = "cpu" # 导出时设备(不影响使用模型) + + net_g = SynthesizerTrnMsNSFsidM( + *cpt["config"], is_half=False, version=cpt.get("version", "v1") + ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) + net_g.load_state_dict(cpt["weight"], strict=False) + input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] + output_names = [ + "audio", + ] + # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 + torch.onnx.export( + net_g, + ( + test_phone.to(device), + test_phone_lengths.to(device), + test_pitch.to(device), + test_pitchf.to(device), + test_ds.to(device), + test_rnd.to(device), + ), + ExportedPath, + dynamic_axes={ + "phone": [1], + "pitch": [1], + "pitchf": [1], + "rnd": [2], + }, + do_constant_folding=False, + opset_version=13, + verbose=False, + input_names=input_names, + output_names=output_names, + ) + return "Finished" From b3d7075ba414823045b0b9b9801dfaa1a80638a0 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 
20:13:39 +0900 Subject: [PATCH 15/65] train modules --- .../modules/train/preprocess.py | 0 .../modules/train/train.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename trainset_preprocess_pipeline_print.py => infer/modules/train/preprocess.py (100%) rename train_nsf_sim_cache_sid_load_pretrain.py => infer/modules/train/train.py (100%) diff --git a/trainset_preprocess_pipeline_print.py b/infer/modules/train/preprocess.py similarity index 100% rename from trainset_preprocess_pipeline_print.py rename to infer/modules/train/preprocess.py diff --git a/train_nsf_sim_cache_sid_load_pretrain.py b/infer/modules/train/train.py similarity index 100% rename from train_nsf_sim_cache_sid_load_pretrain.py rename to infer/modules/train/train.py From d2ea3f193050beeafae9c701f58fd80a65b79f98 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 22:13:46 +0900 Subject: [PATCH 16/65] fix config --- configs/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/config.py b/configs/config.py index de2460f..1511fdb 100644 --- a/configs/config.py +++ b/configs/config.py @@ -17,9 +17,9 @@ def use_fp32_config(): strr = f.read().replace("true", "false") with open(f"configs/{config_file}", "w") as f: f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: + with open("infer/modules/train/preprocess.py", "r") as f: strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: + with open("infer/modules/train/preprocess.py", "w") as f: f.write(strr) @@ -110,9 +110,9 @@ class Config: + 0.4 ) if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: + with open("infer/modules/train/preprocess.py", "r") as f: strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: + with open("infer/modules/train/preprocess.py", "w") as f: f.write(strr) elif self.has_mps(): print("No supported Nvidia GPU found") From e65275cbc1234b9a630725d381fb3bf84f2da019 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 22:14:41 +0900 Subject: [PATCH 17/65] fix onnx --- infer/modules/onnx/export.py | 1 - 1 file changed, 1 deletion(-) diff --git a/infer/modules/onnx/export.py b/infer/modules/onnx/export.py index e80384c..ed4a416 100644 --- a/infer/modules/onnx/export.py +++ b/infer/modules/onnx/export.py @@ -4,7 +4,6 @@ from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM def export_onnx(ModelPath, ExportedPath): - global cpt cpt = torch.load(ModelPath, map_location="cpu") cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 From a8854a71c4779ae3b883d285cdd4fe535ba32126 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 22:15:46 +0900 Subject: [PATCH 18/65] fix Model Inference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit とりあえず動く --- infer/modules/vc/modules.py | 2 +- infer/modules/vc/pipeline.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index 5c2bc60..2511214 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -123,7 +123,7 @@ class VC: else file_index2 ) # 防止小白写错,自动帮他替换掉 - audio_opt = Pipeline.pipeline( + audio_opt = self.pipeline.pipeline( self.hubert_model, self.net_g, sid, diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index 3ac47cd..eed97e0 100644 --- a/infer/modules/vc/pipeline.py 
+++ b/infer/modules/vc/pipeline.py @@ -285,7 +285,6 @@ class Pipeline(object): f0_up_key, f0_method, file_index, - # file_big_npy, index_rate, if_f0, filter_radius, @@ -296,7 +295,6 @@ class Pipeline(object): protect, f0_file=None, ): - print(file_index) if ( file_index != "" # and file_big_npy != "" From c054138f0faec6d71cb330b1298bf8e691232ef6 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 22:47:10 +0900 Subject: [PATCH 19/65] remove moved func --- infer-web.py | 465 ++------------------------------------------------- 1 file changed, 18 insertions(+), 447 deletions(-) diff --git a/infer-web.py b/infer-web.py index c5fd117..1959ea0 100644 --- a/infer-web.py +++ b/infer-web.py @@ -4,7 +4,7 @@ import sys now_dir = os.getcwd() sys.path.append(now_dir) -import traceback, pdb +import traceback import warnings import numpy as np @@ -19,25 +19,19 @@ from subprocess import Popen from time import sleep import faiss -import ffmpeg import gradio as gr -import soundfile as sf -from config import Config +from configs.config import Config import fairseq from i18n import I18nAuto -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM -from infer_uvr5 import _audio_pre_, _audio_pre_new -from lib.audio import load_audio from lib.train.process_ckpt import change_info, extract_small_model, merge, show_info -from vc_infer_pipeline import VC from sklearn.cluster import MiniBatchKMeans +from dotenv import load_dotenv + +from infer.modules.vc.modules import VC +from infer.modules.uvr5.modules import uvr +from infer.modules.onnx.export import export_onnx + logging.getLogger("numba").setLevel(logging.WARNING) now_dir = os.getcwd() @@ -48,12 +42,13 @@ shutil.rmtree("%s/runtime/Lib/site-packages/uvr5_pack" % (now_dir), ignore_error os.makedirs(tmp, exist_ok=True) os.makedirs(os.path.join(now_dir, "logs"), exist_ok=True) os.makedirs(os.path.join(now_dir, "weights"), exist_ok=True) -os.environ["TEMP"] = tmp warnings.filterwarnings("ignore") torch.manual_seed(114514) - +load_dotenv() config = Config() +vc = VC(config) + if config.dml == True: def forward_dml(ctx, x, scale): @@ -127,27 +122,10 @@ class ToolButton(gr.Button, gr.components.FormComponent): return "button" -hubert_model = None - - -def load_hubert(): - global hubert_model - models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -weight_root = "weights" -weight_uvr5_root = "uvr5_weights" +weight_root = os.getenv("weight_root") +weight_uvr5_root = os.getenv("weight_uvr5_root") index_root = "logs" + names = [] for name in os.listdir(weight_root): if name.endswith(".pth"): @@ -162,365 +140,6 @@ for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or "onnx" in name: uvr5_names.append(name.replace(".pth", "")) -cpt = None - - -def vc_single( - sid, - input_audio_path, - f0_up_key, - f0_file, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, -): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 - global tgt_sr, net_g, vc, hubert_model, version, cpt - if input_audio_path is None: - return "You need to upload an audio", None 
- f0_up_key = int(f0_up_key) - try: - audio = load_audio(input_audio_path, 16000) - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - if not hubert_model: - load_hubert() - if_f0 = cpt.get("f0", 1) - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) # 防止小白写错,自动帮他替换掉 - # file_big_npy = ( - # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - # ) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - index_info = ( - "Using index:%s." % file_index - if os.path.exists(file_index) - else "Index not used." - ) - return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( - index_info, - times[0], - times[1], - times[2], - ), ( - resample_sr if resample_sr >= 16000 and tgt_sr != resample_sr else tgt_sr, - audio_opt, - ) - except: - info = traceback.format_exc() - print(info) - return info, (None, None) - - -def vc_multi( - sid, - dir_path, - opt_root, - paths, - f0_up_key, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - format1, -): - try: - dir_path = ( - dir_path.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - opt_root = opt_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - os.makedirs(opt_root, exist_ok=True) - try: - if dir_path != "": - paths = [os.path.join(dir_path, name) for name in os.listdir(dir_path)] - else: - paths = [path.name for path in paths] - except: - traceback.print_exc() - paths = [path.name for path in paths] - infos = [] - for path in paths: - info, opt = vc_single( - sid, - path, - f0_up_key, - None, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, - ) - if "Success" in info: - try: - tgt_sr, audio_opt = opt - if format1 in ["wav", "flac"]: - sf.write( - "%s/%s.%s" % (opt_root, os.path.basename(path), format1), - audio_opt, - tgt_sr, - ) - else: - path = "%s/%s.wav" % (opt_root, os.path.basename(path)) - sf.write( - path, - audio_opt, - tgt_sr, - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format1) - ) - except: - info += traceback.format_exc() - infos.append("%s->%s" % (os.path.basename(path), info)) - yield "\n".join(infos) - yield "\n".join(infos) - except: - yield traceback.format_exc() - - -def uvr(model_name, inp_root, save_root_vocal, paths, save_root_ins, agg, format0): - infos = [] - try: - inp_root = inp_root.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - save_root_vocal = ( - save_root_vocal.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - save_root_ins = ( - save_root_ins.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) - if model_name == "onnx_dereverb_By_FoxJoy": - from MDXNet import MDXNetDereverb - - pre_fun = MDXNetDereverb(15) - else: - func = _audio_pre_ if "DeEcho" not in model_name else _audio_pre_new - pre_fun = func( - agg=int(agg), - model_path=os.path.join(weight_uvr5_root, model_name + ".pth"), - device=config.device, - is_half=config.is_half, - ) - if inp_root != "": - paths = [os.path.join(inp_root, name) 
for name in os.listdir(inp_root)] - else: - paths = [path.name for path in paths] - for path in paths: - inp_path = os.path.join(inp_root, path) - need_reformat = 1 - done = 0 - try: - info = ffmpeg.probe(inp_path, cmd="ffprobe") - if ( - info["streams"][0]["channels"] == 2 - and info["streams"][0]["sample_rate"] == "44100" - ): - need_reformat = 0 - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - done = 1 - except: - need_reformat = 1 - traceback.print_exc() - if need_reformat == 1: - tmp_path = "%s/%s.reformatted.wav" % (tmp, os.path.basename(inp_path)) - os.system( - "ffmpeg -i %s -vn -acodec pcm_s16le -ac 2 -ar 44100 %s -y" - % (inp_path, tmp_path) - ) - inp_path = tmp_path - try: - if done == 0: - pre_fun._path_audio_( - inp_path, save_root_ins, save_root_vocal, format0 - ) - infos.append("%s->Success" % (os.path.basename(inp_path))) - yield "\n".join(infos) - except: - infos.append( - "%s->%s" % (os.path.basename(inp_path), traceback.format_exc()) - ) - yield "\n".join(infos) - except: - infos.append(traceback.format_exc()) - yield "\n".join(infos) - finally: - try: - if model_name == "onnx_dereverb_By_FoxJoy": - del pre_fun.pred.model - del pre_fun.pred.model_ - else: - del pre_fun.model - del pre_fun - except: - traceback.print_exc() - print("clean_empty_cache") - if torch.cuda.is_available(): - torch.cuda.empty_cache() - yield "\n".join(infos) - - -def get_index_path_from_model(sid): - sel_index_path = "" - name = os.path.join("logs", sid.split(".")[0], "") - # print(name) - for f in index_paths: - if name in f: - # print("selected index path:", f) - sel_index_path = f - break - return sel_index_path - - -# 一个选项卡全局只能有一个音色 -def get_vc(sid, to_return_protect0, to_return_protect1): - global n_spk, tgt_sr, net_g, vc, cpt, version - if sid == "" or sid == []: - global hubert_model - if hubert_model is not None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 - print("clean_empty_cache") - del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt - hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - ###楼下不这么折腾清理不干净 - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g, cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return ( - {"visible": False, "__type__": "update"}, - { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - }, - { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - }, - "", - "", - ) - person = "%s/%s" % (weight_root, sid) - print("loading %s" % person) - - cpt = torch.load(person, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - if if_f0 == 0: - to_return_protect0 = to_return_protect1 = { - "visible": False, - "value": 0.33, - "__type__": "update", - } - else: - to_return_protect0 = { - "visible": True, - "value": to_return_protect0, - "__type__": "update", - } - to_return_protect1 = { - "visible": True, - "value": to_return_protect1, - "__type__": "update", - } - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: 
- net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(config.device) - if config.is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - index = {"value": get_index_path_from_model(sid), "__type__": "update"} - return ( - {"visible": True, "maximum": n_spk, "__type__": "update"}, - to_return_protect0, - to_return_protect1, - index, - index, - ) - - def change_choices(): names = [] for name in os.listdir(weight_root): @@ -1385,54 +1004,6 @@ def change_f0_method(f0method8): return {"visible": visible, "__type__": "update"} -def export_onnx(ModelPath, ExportedPath): - global cpt - cpt = torch.load(ModelPath, map_location="cpu") - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] - vec_channels = 256 if cpt.get("version", "v1") == "v1" else 768 - - test_phone = torch.rand(1, 200, vec_channels) # hidden unit - test_phone_lengths = torch.tensor([200]).long() # hidden unit 长度(貌似没啥用) - test_pitch = torch.randint(size=(1, 200), low=5, high=255) # 基频(单位赫兹) - test_pitchf = torch.rand(1, 200) # nsf基频 - test_ds = torch.LongTensor([0]) # 说话人ID - test_rnd = torch.rand(1, 192, 200) # 噪声(加入随机因子) - - device = "cpu" # 导出时设备(不影响使用模型) - - net_g = SynthesizerTrnMsNSFsidM( - *cpt["config"], is_half=False, version=cpt.get("version", "v1") - ) # fp32导出(C++要支持fp16必须手动将内存重新排列所以暂时不用fp16) - net_g.load_state_dict(cpt["weight"], strict=False) - input_names = ["phone", "phone_lengths", "pitch", "pitchf", "ds", "rnd"] - output_names = [ - "audio", - ] - # net_g.construct_spkmixmap(n_speaker) 多角色混合轨道导出 - torch.onnx.export( - net_g, - ( - test_phone.to(device), - test_phone_lengths.to(device), - test_pitch.to(device), - test_pitchf.to(device), - test_ds.to(device), - test_rnd.to(device), - ), - ExportedPath, - dynamic_axes={ - "phone": [1], - "pitch": [1], - "pitchf": [1], - "rnd": [2], - }, - do_constant_folding=False, - opset_version=13, - verbose=False, - input_names=input_names, - output_names=output_names, - ) - return "Finished" with gr.Blocks(title="RVC WebUI") as app: @@ -1551,7 +1122,7 @@ with gr.Blocks(title="RVC WebUI") as app: vc_output1 = gr.Textbox(label=i18n("输出信息")) vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) but0.click( - vc_single, + vc.vc_single, [ spk_item, input_audio0, @@ -1671,7 +1242,7 @@ with gr.Blocks(title="RVC WebUI") as app: but1 = gr.Button(i18n("转换"), variant="primary") vc_output3 = gr.Textbox(label=i18n("输出信息")) but1.click( - vc_multi, + vc.vc_multi, [ spk_item, dir_input, @@ -1693,7 +1264,7 @@ with gr.Blocks(title="RVC WebUI") as app: api_name="infer_convert_batch", ) sid0.change( - fn=get_vc, + fn=vc.get_vc, inputs=[sid0, protect0, protect1], outputs=[spk_item, protect0, protect1, file_index2, file_index4], ) @@ -1967,7 +1538,7 @@ with gr.Blocks(title="RVC WebUI") as app: info3, api_name="train_start", ) - but4.click(train_index, [exp_dir1, version19], info3) + but4.click(train_index, [exp_dir1, version19, config.n_cpu], info3) but5.click( train1key, [ From 4c2fa81012f26593c9db5946d1385d752aa7d78c Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sat, 19 Aug 2023 13:47:41 +0000 Subject: [PATCH 20/65] Apply Code Formatter Change --- infer-web.py | 3 +-- 1 
file changed, 1 insertion(+), 2 deletions(-) diff --git a/infer-web.py b/infer-web.py index 1959ea0..0546e51 100644 --- a/infer-web.py +++ b/infer-web.py @@ -140,6 +140,7 @@ for name in os.listdir(weight_uvr5_root): if name.endswith(".pth") or "onnx" in name: uvr5_names.append(name.replace(".pth", "")) + def change_choices(): names = [] for name in os.listdir(weight_root): @@ -1004,8 +1005,6 @@ def change_f0_method(f0method8): return {"visible": visible, "__type__": "update"} - - with gr.Blocks(title="RVC WebUI") as app: gr.Markdown( value=i18n( From b0ba38c288d5b5c31e126e47c1badb093365963b Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 22:50:24 +0900 Subject: [PATCH 21/65] add env --- .env | 9 +++++++++ infer-web.py | 2 -- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 .env diff --git a/.env b/.env new file mode 100644 index 0000000..04bf864 --- /dev/null +++ b/.env @@ -0,0 +1,9 @@ +OPENBLAS_NUM_THREADS = 1 +no_proxy = localhost, 127.0.0.1, ::1 + +# You can change the location of the model, etc. by changing here +weight_root = assets/weights +weight_uvr5_root = assets/uvr5_weights +index_root = output +TEMP = tmp + diff --git a/infer-web.py b/infer-web.py index 1959ea0..53ab673 100644 --- a/infer-web.py +++ b/infer-web.py @@ -10,8 +10,6 @@ import warnings import numpy as np import torch -os.environ["OPENBLAS_NUM_THREADS"] = "1" -os.environ["no_proxy"] = "localhost, 127.0.0.1, ::1" import logging import threading from random import shuffle From ca7307a950822bb90ef6c3d0e0b8e661608d26b5 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sat, 19 Aug 2023 23:01:57 +0900 Subject: [PATCH 22/65] fix path --- .github/workflows/unitest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index 879df3f..eaa8dd1 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -30,7 +30,7 @@ jobs: run: | mkdir -p logs/mi-test touch logs/mi-test/preprocess.log - python trainset_preprocess_pipeline_print.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True + python infer/modules/train/preprocess.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True touch logs/mi-test/extract_f0_feature.log python extract_f0_print.py logs/mi-test $(nproc) pm python extract_feature_print.py cpu 1 0 0 logs/mi-test v1 From b3a29d0e497af847128f76871426ff9f26c73616 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 20 Aug 2023 12:37:59 +0900 Subject: [PATCH 23/65] fix infer-web --- infer-web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infer-web.py b/infer-web.py index 2b9487c..cb0b592 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1535,7 +1535,7 @@ with gr.Blocks(title="RVC WebUI") as app: info3, api_name="train_start", ) - but4.click(train_index, [exp_dir1, version19, config.n_cpu], info3) + but4.click(train_index, [exp_dir1, version19], info3) but5.click( train1key, [ From cd924f9eec371e7bb1ade8e7b41fc9dfdf24f557 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 20 Aug 2023 13:43:39 +0900 Subject: [PATCH 24/65] fix uvr5 path --- infer/modules/uvr5/preprocess.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/preprocess.py index 86c3ab0..dae2739 100644 --- a/infer/modules/uvr5/preprocess.py +++ b/infer/modules/uvr5/preprocess.py @@ -26,7 +26,7 @@ class AudioPre: "agg": agg, "high_end_process": "mirroring", } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") + mp = 
ModelParameters("infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") model = Nets.CascadedASPPNet(mp.param["bins"] * 2) cpk = torch.load(model_path, map_location="cpu") model.load_state_dict(cpk) From ed7b11eb49cde35f18a44262f311f64bd8116bc9 Mon Sep 17 00:00:00 2001 From: Ftps Date: Mon, 21 Aug 2023 20:53:11 +0900 Subject: [PATCH 25/65] train 1-2b --- assets/hubert/.gitignore | 2 + assets/rmvpe/.gitignore | 2 + i18n.py | 28 ---- infer-web.py | 87 ++++++----- infer/lib/train/process_ckpt.py | 3 +- infer/lib/train/utils.py | 4 +- .../modules/train/extract/extract_f0_print.py | 4 +- .../modules/train/extract/extract_f0_rmvpe.py | 4 +- .../train/extract/extract_f0_rmvpe_dml.py | 4 +- infer/modules/train/extract_feature_print.py | 135 ++++++++++++++++++ infer/modules/train/train.py | 21 +-- 11 files changed, 214 insertions(+), 80 deletions(-) create mode 100644 assets/hubert/.gitignore create mode 100644 assets/rmvpe/.gitignore delete mode 100644 i18n.py rename extract_f0_print.py => infer/modules/train/extract/extract_f0_print.py (94%) rename extract_f0_rmvpe.py => infer/modules/train/extract/extract_f0_rmvpe.py (93%) rename extract_f0_rmvpe_dml.py => infer/modules/train/extract/extract_f0_rmvpe_dml.py (93%) create mode 100644 infer/modules/train/extract_feature_print.py diff --git a/assets/hubert/.gitignore b/assets/hubert/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/hubert/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/assets/rmvpe/.gitignore b/assets/rmvpe/.gitignore new file mode 100644 index 0000000..d6b7ef3 --- /dev/null +++ b/assets/rmvpe/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore diff --git a/i18n.py b/i18n.py deleted file mode 100644 index 28b17c7..0000000 --- a/i18n.py +++ /dev/null @@ -1,28 +0,0 @@ -import locale -import json -import os - - -def load_language_list(language): - with open(f"./i18n/locale/{language}.json", "r", encoding="utf-8") as f: - language_list = json.load(f) - return language_list - - -class I18nAuto: - def __init__(self, language=None): - if language in ["Auto", None]: - language = locale.getdefaultlocale()[ - 0 - ] # getlocale can't identify the system's language ((None, None)) - if not os.path.exists(f"./lib/i18n/{language}.json"): - language = "en_US" - self.language = language - # print("Use Language:", language) - self.language_map = load_language_list(language) - - def __call__(self, key): - return self.language_map.get(key, key) - - def print(self): - print("Use Language:", self.language) diff --git a/infer-web.py b/infer-web.py index cb0b592..5b12cea 100644 --- a/infer-web.py +++ b/infer-web.py @@ -20,8 +20,13 @@ import faiss import gradio as gr from configs.config import Config import fairseq -from i18n import I18nAuto -from lib.train.process_ckpt import change_info, extract_small_model, merge, show_info +from i18n.i18n import I18nAuto +from infer.lib.train.process_ckpt import ( + change_info, + extract_small_model, + merge, + show_info, +) from sklearn.cluster import MiniBatchKMeans from dotenv import load_dotenv @@ -197,7 +202,7 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): f.close() cmd = ( config.python_cmd - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s/logs/%s" ' + + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -232,11 +237,15 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp f.close() if if_f0: if f0method != "rmvpe_gpu": - cmd = config.python_cmd + ' 
extract_f0_print.py "%s/logs/%s" %s %s' % ( - now_dir, - exp_dir, - n_p, - f0method, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' + % ( + now_dir, + exp_dir, + n_p, + f0method, + ) ) print(cmd) p = Popen( @@ -259,7 +268,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus_rmvpe): cmd = ( config.python_cmd - + ' extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' + + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % (leng, idx, n_g, now_dir, exp_dir, config.is_half) ) print(cmd) @@ -277,9 +286,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp ), ).start() else: - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s/logs/%s" ' % ( - now_dir, - exp_dir, + cmd = ( + config.python_cmd + + ' infer/modules/train/extract/extract_f0_rmvpe_dml.py "%s/logs/%s" ' + % ( + now_dir, + exp_dir, + ) ) print(cmd) p = Popen( @@ -312,7 +325,7 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp for idx, n_g in enumerate(gpus): cmd = ( config.python_cmd - + ' extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' + + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( config.device, leng, @@ -353,26 +366,26 @@ def change_sr2(sr2, if_f0_3, version19): path_str = "" if version19 == "v1" else "_v2" f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", ) @@ -389,26 +402,26 @@ def change_version19(sr2, if_f0_3, version19): ) f0_str = "f0" if if_f0_3 else "" if_pretrained_generator_exist = os.access( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2), "not exist, will not use pretrained model", ) return ( - "pretrained%s/%sG%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sG%s.pth" % (path_str, 
f0_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) + "assets/pretrained%s/%sD%s.pth" % (path_str, f0_str, sr2) if if_pretrained_discriminator_exist else "", to_return_sr2, @@ -418,37 +431,37 @@ def change_version19(sr2, if_f0_3, version19): def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 path_str = "" if version19 == "v1" else "_v2" if_pretrained_generator_exist = os.access( - "pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), os.F_OK ) if_pretrained_discriminator_exist = os.access( - "pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), os.F_OK ) if not if_pretrained_generator_exist: print( - "pretrained%s/f0G%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if not if_pretrained_discriminator_exist: print( - "pretrained%s/f0D%s.pth" % (path_str, sr2), + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2), "not exist, will not use pretrained model", ) if if_f0_3: return ( {"visible": True, "__type__": "update"}, - "pretrained%s/f0G%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0G%s.pth" % (path_str, sr2) if if_pretrained_generator_exist else "", - "pretrained%s/f0D%s.pth" % (path_str, sr2) + "assets/pretrained%s/f0D%s.pth" % (path_str, sr2) if if_pretrained_discriminator_exist else "", ) return ( {"visible": False, "__type__": "update"}, - ("pretrained%s/G%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/G%s.pth" % (path_str, sr2)) if if_pretrained_generator_exist else "", - ("pretrained%s/D%s.pth" % (path_str, sr2)) + ("assets/pretrained%s/D%s.pth" % (path_str, sr2)) if if_pretrained_discriminator_exist else "", ) @@ -548,7 +561,7 @@ def click_train( if gpus16: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -568,7 +581,7 @@ def click_train( else: cmd = ( config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' + + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( exp_dir1, sr2, @@ -1482,12 +1495,12 @@ with gr.Blocks(title="RVC WebUI") as app: with gr.Row(): pretrained_G14 = gr.Textbox( label=i18n("加载预训练底模G路径"), - value="pretrained_v2/f0G40k.pth", + value="assets/pretrained_v2/f0G40k.pth", interactive=True, ) pretrained_D15 = gr.Textbox( label=i18n("加载预训练底模D路径"), - value="pretrained_v2/f0D40k.pth", + value="assets/pretrained_v2/f0D40k.pth", interactive=True, ) sr2.change( diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index a48ca61..ad32b44 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,7 +1,6 @@ import torch, traceback, os, sys -now_dir = os.getcwd() -sys.path.append(now_dir) + from collections import OrderedDict from i18n.i18n import I18nAuto diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 9c0fb5c..337422b 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -362,9 +362,9 @@ def get_hparams(init=True): os.makedirs(experiment_dir) if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % 
args.sample_rate + config_path = "configs/v1/%s.json" % args.sample_rate else: - config_path = "configs/%s_v2.json" % args.sample_rate + config_path = "configs/v2/%s.json" % args.sample_rate config_save_path = os.path.join(experiment_dir, "config.json") if init: with open(config_path, "r") as f: diff --git a/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py similarity index 94% rename from extract_f0_print.py rename to infer/modules/train/extract/extract_f0_print.py index 4f6c806..d95548e 100644 --- a/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -79,7 +79,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device="cpu") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device="cpu" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py similarity index 93% rename from extract_f0_rmvpe.py rename to infer/modules/train/extract/extract_f0_rmvpe.py index 00ca16c..33517e0 100644 --- a/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -42,7 +42,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=is_half, device="cuda") + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=is_half, device="cuda" + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py similarity index 93% rename from extract_f0_rmvpe_dml.py rename to infer/modules/train/extract/extract_f0_rmvpe_dml.py index 0de50c5..744c69f 100644 --- a/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -40,7 +40,9 @@ class FeatureInput(object): from lib.rmvpe import RMVPE print("loading rmvpe model") - self.model_rmvpe = RMVPE("rmvpe.pt", is_half=False, device=device) + self.model_rmvpe = RMVPE( + "assets/rmvpe/rmvpe.pt", is_half=False, device=device + ) f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) return f0 diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py new file mode 100644 index 0000000..32e0492 --- /dev/null +++ b/infer/modules/train/extract_feature_print.py @@ -0,0 +1,135 @@ +import os, sys, traceback + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" + +device = sys.argv[1] +n_part = int(sys.argv[2]) +i_part = int(sys.argv[3]) +if len(sys.argv) == 6: + exp_dir = sys.argv[4] + version = sys.argv[5] +else: + i_gpu = sys.argv[4] + exp_dir = sys.argv[5] + os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) + version = sys.argv[6] +import torch +import torch.nn.functional as F +import soundfile as sf +import numpy as np +import fairseq + +if "privateuseone" not in device: + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif torch.backends.mps.is_available(): + device = "mps" +else: + import torch_directml + + device = torch_directml.device(torch_directml.default_device()) + + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + +f = open("%s/extract_f0_feature.log" % exp_dir, "a+") + + +def printt(strr): + print(strr) + f.write("%s\n" % strr) + f.flush() + + +printt(sys.argv) +model_path = 
"assets/hubert/hubert_base.pt" + +printt(exp_dir) +wavPath = "%s/1_16k_wavs" % exp_dir +outPath = ( + "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir +) +os.makedirs(outPath, exist_ok=True) + + +# wave must be 16k, hop_size=320 +def readwave(wav_path, normalize=False): + wav, sr = sf.read(wav_path) + assert sr == 16000 + feats = torch.from_numpy(wav).float() + if feats.dim() == 2: # double channels + feats = feats.mean(-1) + assert feats.dim() == 1, feats.dim() + if normalize: + with torch.no_grad(): + feats = F.layer_norm(feats, feats.shape) + feats = feats.view(1, -1) + return feats + + +# HuBERT model +printt("load model(s) from {}".format(model_path)) +# if hubert model is exist +if os.access(model_path, os.F_OK) == False: + printt( + "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" + % model_path + ) + exit(0) +models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( + [model_path], + suffix="", +) +model = models[0] +model = model.to(device) +printt("move model to %s" % device) +if device not in ["mps", "cpu"]: + model = model.half() +model.eval() + +todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] +n = max(1, len(todo) // 10) # 最多打印十条 +if len(todo) == 0: + printt("no-feature-todo") +else: + printt("all-feature-%s" % len(todo)) + for idx, file in enumerate(todo): + try: + if file.endswith(".wav"): + wav_path = "%s/%s" % (wavPath, file) + out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) + + if os.path.exists(out_path): + continue + + feats = readwave(wav_path, normalize=saved_cfg.task.normalize) + padding_mask = torch.BoolTensor(feats.shape).fill_(False) + inputs = { + "source": feats.half().to(device) + if device not in ["mps", "cpu"] + else feats.to(device), + "padding_mask": padding_mask.to(device), + "output_layer": 9 if version == "v1" else 12, # layer 9 + } + with torch.no_grad(): + logits = model.extract_features(**inputs) + feats = ( + model.final_proj(logits[0]) if version == "v1" else logits[0] + ) + + feats = feats.squeeze(0).float().cpu().numpy() + if np.isnan(feats).sum() == 0: + np.save(out_path, feats, allow_pickle=False) + else: + printt("%s-contains nan" % file) + if idx % n == 0: + printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) + except: + printt(traceback.format_exc()) + printt("all-feature-done") diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index c1bdf11..3dca6c7 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -3,7 +3,7 @@ import os, sys now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) -from lib.train import utils +from infer.lib.train import utils import datetime hps = utils.get_hparams() @@ -22,10 +22,10 @@ import torch.multiprocessing as mp import torch.distributed as dist from torch.nn.parallel import DistributedDataParallel as DDP from torch.cuda.amp import autocast, GradScaler -from lib.infer_pack import commons +from infer.lib.infer_pack import commons from time import sleep from time import time as ttime -from lib.train.data_utils import ( +from infer.lib.train.data_utils import ( TextAudioLoaderMultiNSFsid, TextAudioLoader, TextAudioCollateMultiNSFsid, @@ -34,20 +34,25 @@ from lib.train.data_utils import ( ) if hps.version == "v1": - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as RVC_Model_f0, SynthesizerTrnMs256NSFsid_nono as 
RVC_Model_nof0, MultiPeriodDiscriminator, ) else: - from lib.infer_pack.models import ( + from infer.lib.infer_pack.models import ( SynthesizerTrnMs768NSFsid as RVC_Model_f0, SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, ) -from lib.train.losses import generator_loss, discriminator_loss, feature_loss, kl_loss -from lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch -from lib.train.process_ckpt import savee +from infer.lib.train.losses import ( + generator_loss, + discriminator_loss, + feature_loss, + kl_loss, +) +from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch +from infer.lib.train.process_ckpt import savee global_step = 0 From f6fcda7f97e09c9991577a74a2edc73cca331bfe Mon Sep 17 00:00:00 2001 From: Ftps Date: Mon, 21 Aug 2023 21:19:05 +0900 Subject: [PATCH 26/65] fix unitest --- .github/workflows/unitest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index eaa8dd1..af677d6 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -32,5 +32,5 @@ jobs: touch logs/mi-test/preprocess.log python infer/modules/train/preprocess.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True touch logs/mi-test/extract_f0_feature.log - python extract_f0_print.py logs/mi-test $(nproc) pm + python infer/modules/train/extract/extract_f0_print.py logs/mi-test $(nproc) pm python extract_feature_print.py cpu 1 0 0 logs/mi-test v1 From 0ad6c2b72013321783db4c930c469c6bb38aa6a1 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sun, 27 Aug 2023 03:44:13 +0000 Subject: [PATCH 27/65] Apply Code Formatter Change --- infer-web.py | 67 +++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/infer-web.py b/infer-web.py index ba5d0b7..4f0fa3f 100644 --- a/infer-web.py +++ b/infer-web.py @@ -213,7 +213,6 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): cmd = ( get_quoted_python_cmd() + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' - % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -251,7 +250,6 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp cmd = ( get_quoted_python_cmd() + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' - % ( now_dir, exp_dir, @@ -278,11 +276,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus_rmvpe) ps = [] for idx, n_g in enumerate(gpus_rmvpe): - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' - - % (leng, idx, n_g, now_dir, exp_dir, config.is_half) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % ( + leng, + idx, + n_g, + now_dir, + exp_dir, + config.is_half, ) print(cmd) p = Popen( @@ -336,19 +336,14 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus) ps = [] for idx, n_g in enumerate(gpus): - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' - - % ( - config.device, - leng, - idx, - n_g, - now_dir, - exp_dir, - version19, - ) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( + config.device, + leng, + idx, + n_g, + now_dir, + exp_dir, + version19, ) print(cmd) p = Popen( @@ 
-573,24 +568,20 @@ def click_train( if pretrained_D15 == "": print("no pretrained Discriminator") if gpus16: - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - gpus16, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) + cmd = get_quoted_python_cmd() + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( + exp_dir1, + sr2, + 1 if if_f0_3 else 0, + batch_size12, + gpus16, + total_epoch11, + save_epoch10, + "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", + "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", + 1 if if_save_latest13 == i18n("是") else 0, + 1 if if_cache_gpu17 == i18n("是") else 0, + 1 if if_save_every_weights18 == i18n("是") else 0, + version19, ) else: cmd = ( From c25ad34a96a53ebee719f677a0f048a2d62034bd Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 12:44:35 +0900 Subject: [PATCH 28/65] format --- infer-web.py | 67 +++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 38 deletions(-) diff --git a/infer-web.py b/infer-web.py index ba5d0b7..4f0fa3f 100644 --- a/infer-web.py +++ b/infer-web.py @@ -213,7 +213,6 @@ def preprocess_dataset(trainset_dir, exp_dir, sr, n_p): cmd = ( get_quoted_python_cmd() + ' infer/modules/train/preprocess.py "%s" %s %s "%s/logs/%s" ' - % (trainset_dir, sr, n_p, now_dir, exp_dir) + str(config.noparallel) ) @@ -251,7 +250,6 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp cmd = ( get_quoted_python_cmd() + ' infer/modules/train/extract/extract_f0_print.py "%s/logs/%s" %s %s' - % ( now_dir, exp_dir, @@ -278,11 +276,13 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus_rmvpe) ps = [] for idx, n_g in enumerate(gpus_rmvpe): - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' - - % (leng, idx, n_g, now_dir, exp_dir, config.is_half) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract/extract_f0_rmvpe.py %s %s %s "%s/logs/%s" %s ' % ( + leng, + idx, + n_g, + now_dir, + exp_dir, + config.is_half, ) print(cmd) p = Popen( @@ -336,19 +336,14 @@ def extract_f0_feature(gpus, n_p, f0method, if_f0, exp_dir, version19, gpus_rmvp leng = len(gpus) ps = [] for idx, n_g in enumerate(gpus): - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' - - % ( - config.device, - leng, - idx, - n_g, - now_dir, - exp_dir, - version19, - ) + cmd = get_quoted_python_cmd() + ' infer/modules/train/extract_feature_print.py %s %s %s %s "%s/logs/%s" %s' % ( + config.device, + leng, + idx, + n_g, + now_dir, + exp_dir, + version19, ) print(cmd) p = Popen( @@ -573,24 +568,20 @@ def click_train( if pretrained_D15 == "": print("no pretrained Discriminator") if gpus16: - cmd = ( - get_quoted_python_cmd() - + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - gpus16, - total_epoch11, - save_epoch10, - "-pg %s" % 
pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) + cmd = get_quoted_python_cmd() + ' infer/modules/train/train.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( + exp_dir1, + sr2, + 1 if if_f0_3 else 0, + batch_size12, + gpus16, + total_epoch11, + save_epoch10, + "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", + "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", + 1 if if_save_latest13 == i18n("是") else 0, + 1 if if_cache_gpu17 == i18n("是") else 0, + 1 if if_save_every_weights18 == i18n("是") else 0, + version19, ) else: cmd = ( From 180e7acd5741d80308bd3fdb4e370c1f91e75350 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 13:25:42 +0900 Subject: [PATCH 29/65] fix save arg --- infer/lib/train/process_ckpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index ad32b44..37a8795 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -7,7 +7,7 @@ from i18n.i18n import I18nAuto i18n = I18nAuto() -def savee(ckpt, sr, if_f0, name, epoch, version, hps, i18n): +def savee(ckpt, sr, if_f0, name, epoch, version, hps): try: opt = OrderedDict() opt["weight"] = {} From 5251e75ab1d2fae2765a0d6c717cf9fcb7eec67b Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 19:12:30 +0900 Subject: [PATCH 30/65] fix path --- infer/lib/train/process_ckpt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index 37a8795..f2d73af 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -39,7 +39,7 @@ def savee(ckpt, sr, if_f0, name, epoch, version, hps): opt["sr"] = sr opt["f0"] = if_f0 opt["version"] = version - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." except: return traceback.format_exc() @@ -182,7 +182,7 @@ def extract_small_model(path, name, sr, if_f0, info, version): opt["version"] = version opt["sr"] = sr opt["f0"] = int(if_f0) - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." except: return traceback.format_exc() @@ -252,7 +252,7 @@ def merge(path1, path2, alpha1, sr, f0, info, name, version): opt["f0"] = 1 if f0 == i18n("是") else 0 opt["version"] = version opt["info"] = info - torch.save(opt, "weights/%s.pth" % name) + torch.save(opt, "assets/weights/%s.pth" % name) return "Success." 
except: return traceback.format_exc() From 3f3177b5ce07593c70f7e0e35f084adaf161fc67 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 19:14:01 +0900 Subject: [PATCH 31/65] load audio with gradio-file --- infer/lib/audio.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/infer/lib/audio.py b/infer/lib/audio.py index 776939d..61db726 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,3 +1,4 @@ +import librosa import ffmpeg import numpy as np @@ -15,7 +16,13 @@ def load_audio(file, sr): .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) ) + return np.frombuffer(out, np.float32).flatten() + + except AttributeError: + audio = file[1] / 32768.0 + if len(audio.shape) == 2: + audio = np.mean(audio, -1) + return librosa.resample(audio, orig_sr=file[0], target_sr=16000) + except Exception as e: raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() From 9a10795908c6fceef4186c186bd97c781b87d5fc Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 19:49:18 +0900 Subject: [PATCH 32/65] repair app.py --- app.py | 199 +++--------------------------------- infer/modules/vc/modules.py | 26 +++-- 2 files changed, 33 insertions(+), 192 deletions(-) diff --git a/app.py b/app.py index 69bb617..e4a6415 100644 --- a/app.py +++ b/app.py @@ -1,22 +1,15 @@ import os -import torch # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") import gradio as gr -import librosa -import numpy as np import logging -from fairseq import checkpoint_utils -from lib.train.vc_infer_pipeline import VC -import traceback -from config import defaultconfig as config -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from i18n import I18nAuto + +from configs.config import Config + +from i18n.i18n import I18nAuto +from dotenv import load_dotenv + +from infer.modules.vc.modules import VC logging.getLogger("numba").setLevel(logging.WARNING) logging.getLogger("markdown_it").setLevel(logging.WARNING) @@ -26,8 +19,12 @@ logging.getLogger("matplotlib").setLevel(logging.WARNING) i18n = I18nAuto() i18n.print() -weight_root = "weights" -weight_uvr5_root = "uvr5_weights" +load_dotenv() +config = Config() +vc = VC(config) + +weight_root = os.getenv("weight_root") +weight_uvr5_root = os.getenv("weight_uvr5_root") index_root = "logs" names = [] hubert_model = None @@ -41,168 +38,6 @@ for root, dirs, files in os.walk(index_root, topdown=False): index_paths.append("%s/%s" % (root, name)) -def get_vc(sid): - global n_spk, tgt_sr, net_g, vc, cpt, version - if sid == "" or sid == []: - global hubert_model - if hubert_model != None: # 考虑到轮询, 需要加个判断看是否 sid 是由有模型切换到无模型的 - print("clean_empty_cache") - del net_g, n_spk, vc, hubert_model, tgt_sr # ,cpt - hubert_model = net_g = n_spk = vc = hubert_model = tgt_sr = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - ###楼下不这么折腾清理不干净 - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid( - *cpt["config"], is_half=config.is_half - ) - else: - net_g = 
SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g, cpt - if torch.cuda.is_available(): - torch.cuda.empty_cache() - cpt = None - return {"visible": False, "__type__": "update"} - person = "%s/%s" % (weight_root, sid) - print("loading %s" % person) - cpt = torch.load(person, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=config.is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(config.device) - if config.is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - return {"visible": True, "maximum": n_spk, "__type__": "update"} - - -def load_hubert(): - global hubert_model - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single( - sid, - input_audio_path, - f0_up_key, - f0_file, - f0_method, - file_index, - file_index2, - # file_big_npy, - index_rate, - filter_radius, - resample_sr, - rms_mix_rate, - protect, -): # spk_item, input_audio0, vc_transform0,f0_file,f0method0 - global tgt_sr, net_g, vc, hubert_model, version - if input_audio_path is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - try: - audio = input_audio_path[1] / 32768.0 - if len(audio.shape) == 2: - audio = np.mean(audio, -1) - audio = librosa.resample(audio, orig_sr=input_audio_path[0], target_sr=16000) - audio_max = np.abs(audio).max() / 0.95 - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - if hubert_model == None: - load_hubert() - if_f0 = cpt.get("f0", 1) - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) # 防止小白写错,自动帮他替换掉 - # file_big_npy = ( - # file_big_npy.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - # ) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - if resample_sr >= 16000 and tgt_sr != resample_sr: - tgt_sr = resample_sr - index_info = ( - "Using index:%s." % file_index - if os.path.exists(file_index) - else "Index not used." 
- ) - return "Success.\n %s\nTime:\n npy:%ss, f0:%ss, infer:%ss" % ( - index_info, - times[0], - times[1], - times[2], - ), (tgt_sr, audio_opt) - except: - info = traceback.format_exc() - print(info) - return info, (None, None) - - app = gr.Blocks() with app: with gr.Tabs(): @@ -223,11 +58,7 @@ with app: visible=False, interactive=True, ) - sid.change( - fn=get_vc, - inputs=[sid], - outputs=[spk_item], - ) + sid.change(fn=vc.get_vc, inputs=[sid], outputs=[spk_item]) gr.Markdown( value=i18n("男转女推荐+12key, 女转男推荐-12key, 如果音域爆炸导致音色失真也可以自己调整到合适音域. ") ) @@ -294,7 +125,7 @@ with app: vc_output1 = gr.Textbox(label=i18n("输出信息")) vc_output2 = gr.Audio(label=i18n("输出音频(右下角三个点,点了可以下载)")) but0.click( - vc_single, + vc.vc_single, [ spk_item, vc_input3, diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index 2511214..0fb7c0a 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -29,7 +29,9 @@ class VC: self.config = config - def get_vc(self, sid, to_return_protect0, to_return_protect1): + def get_vc(self, sid, *to_return_protect): + print("aosdijfaofjoaij") + print(to_return_protect) person = f'{os.getenv("weight_root")}/{sid}' print(f"loading {person}") @@ -41,12 +43,16 @@ class VC: to_return_protect0 = { "visible": self.if_f0 != 0, - "value": to_return_protect0 if self.if_f0 != 0 else 0.5, + "value": to_return_protect[0] + if self.if_f0 != 0 and to_return_protect + else 0.5, "__type__": "update", } to_return_protect1 = { "visible": self.if_f0 != 0, - "value": to_return_protect1 if self.if_f0 != 0 else 0.33, + "value": to_return_protect[1] + if self.if_f0 != 0 and to_return_protect + else 0.33, "__type__": "update", } @@ -75,11 +81,15 @@ class VC: index = {"value": get_index_path_from_model(sid), "__type__": "update"} return ( - {"visible": True, "maximum": n_spk, "__type__": "update"}, - to_return_protect0, - to_return_protect1, - index, - index, + ( + {"visible": True, "maximum": n_spk, "__type__": "update"}, + to_return_protect0, + to_return_protect1, + index, + index, + ) + if to_return_protect + else {"visible": True, "maximum": n_spk, "__type__": "update"} ) def vc_single( From ba2924b8d12003fe3761ff07e8e5e6d5a68893d3 Mon Sep 17 00:00:00 2001 From: Ftps Date: Sun, 27 Aug 2023 20:03:27 +0900 Subject: [PATCH 33/65] remove testprint --- infer/modules/vc/modules.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index 0fb7c0a..f3626a9 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -30,8 +30,6 @@ class VC: self.config = config def get_vc(self, sid, *to_return_protect): - print("aosdijfaofjoaij") - print(to_return_protect) person = f'{os.getenv("weight_root")}/{sid}' print(f"loading {person}") From 927fbeebe259921f5a35a4b7fa17780b9b4fbdaa Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:04:13 +0900 Subject: [PATCH 34/65] Place does by language --- docs/{ => cn}/Changelog_CN.md | 0 docs/{ => cn}/faq.md | 0 docs/{ => en}/Changelog_EN.md | 0 docs/{ => en}/README.en.md | 0 docs/{ => en}/faiss_tips_en.md | 0 docs/{ => en}/faq_en.md | 0 docs/{ => en}/training_tips_en.md | 0 docs/{ => jp}/README.ja.md | 0 docs/{ => jp}/faiss_tips_ja.md | 0 docs/{ => jp}/training_tips_ja.md | 0 docs/{ => kr}/Changelog_KO.md | 0 docs/{ => kr}/README.ko.han.md | 0 docs/{ => kr}/README.ko.md | 0 docs/{ => kr}/faiss_tips_ko.md | 0 docs/{ => kr}/training_tips_ko.md | 0 docs/{ => tr}/Changelog_TR.md | 0 docs/{ => tr}/README.tr.md | 0 docs/{ => 
tr}/faiss_tips_tr.md | 0 docs/{ => tr}/faq_tr.md | 0 docs/{ => tr}/training_tips_tr.md | 0 20 files changed, 0 insertions(+), 0 deletions(-) rename docs/{ => cn}/Changelog_CN.md (100%) rename docs/{ => cn}/faq.md (100%) rename docs/{ => en}/Changelog_EN.md (100%) rename docs/{ => en}/README.en.md (100%) rename docs/{ => en}/faiss_tips_en.md (100%) rename docs/{ => en}/faq_en.md (100%) rename docs/{ => en}/training_tips_en.md (100%) rename docs/{ => jp}/README.ja.md (100%) rename docs/{ => jp}/faiss_tips_ja.md (100%) rename docs/{ => jp}/training_tips_ja.md (100%) rename docs/{ => kr}/Changelog_KO.md (100%) rename docs/{ => kr}/README.ko.han.md (100%) rename docs/{ => kr}/README.ko.md (100%) rename docs/{ => kr}/faiss_tips_ko.md (100%) rename docs/{ => kr}/training_tips_ko.md (100%) rename docs/{ => tr}/Changelog_TR.md (100%) rename docs/{ => tr}/README.tr.md (100%) rename docs/{ => tr}/faiss_tips_tr.md (100%) rename docs/{ => tr}/faq_tr.md (100%) rename docs/{ => tr}/training_tips_tr.md (100%) diff --git a/docs/Changelog_CN.md b/docs/cn/Changelog_CN.md similarity index 100% rename from docs/Changelog_CN.md rename to docs/cn/Changelog_CN.md diff --git a/docs/faq.md b/docs/cn/faq.md similarity index 100% rename from docs/faq.md rename to docs/cn/faq.md diff --git a/docs/Changelog_EN.md b/docs/en/Changelog_EN.md similarity index 100% rename from docs/Changelog_EN.md rename to docs/en/Changelog_EN.md diff --git a/docs/README.en.md b/docs/en/README.en.md similarity index 100% rename from docs/README.en.md rename to docs/en/README.en.md diff --git a/docs/faiss_tips_en.md b/docs/en/faiss_tips_en.md similarity index 100% rename from docs/faiss_tips_en.md rename to docs/en/faiss_tips_en.md diff --git a/docs/faq_en.md b/docs/en/faq_en.md similarity index 100% rename from docs/faq_en.md rename to docs/en/faq_en.md diff --git a/docs/training_tips_en.md b/docs/en/training_tips_en.md similarity index 100% rename from docs/training_tips_en.md rename to docs/en/training_tips_en.md diff --git a/docs/README.ja.md b/docs/jp/README.ja.md similarity index 100% rename from docs/README.ja.md rename to docs/jp/README.ja.md diff --git a/docs/faiss_tips_ja.md b/docs/jp/faiss_tips_ja.md similarity index 100% rename from docs/faiss_tips_ja.md rename to docs/jp/faiss_tips_ja.md diff --git a/docs/training_tips_ja.md b/docs/jp/training_tips_ja.md similarity index 100% rename from docs/training_tips_ja.md rename to docs/jp/training_tips_ja.md diff --git a/docs/Changelog_KO.md b/docs/kr/Changelog_KO.md similarity index 100% rename from docs/Changelog_KO.md rename to docs/kr/Changelog_KO.md diff --git a/docs/README.ko.han.md b/docs/kr/README.ko.han.md similarity index 100% rename from docs/README.ko.han.md rename to docs/kr/README.ko.han.md diff --git a/docs/README.ko.md b/docs/kr/README.ko.md similarity index 100% rename from docs/README.ko.md rename to docs/kr/README.ko.md diff --git a/docs/faiss_tips_ko.md b/docs/kr/faiss_tips_ko.md similarity index 100% rename from docs/faiss_tips_ko.md rename to docs/kr/faiss_tips_ko.md diff --git a/docs/training_tips_ko.md b/docs/kr/training_tips_ko.md similarity index 100% rename from docs/training_tips_ko.md rename to docs/kr/training_tips_ko.md diff --git a/docs/Changelog_TR.md b/docs/tr/Changelog_TR.md similarity index 100% rename from docs/Changelog_TR.md rename to docs/tr/Changelog_TR.md diff --git a/docs/README.tr.md b/docs/tr/README.tr.md similarity index 100% rename from docs/README.tr.md rename to docs/tr/README.tr.md diff --git a/docs/faiss_tips_tr.md 
b/docs/tr/faiss_tips_tr.md similarity index 100% rename from docs/faiss_tips_tr.md rename to docs/tr/faiss_tips_tr.md diff --git a/docs/faq_tr.md b/docs/tr/faq_tr.md similarity index 100% rename from docs/faq_tr.md rename to docs/tr/faq_tr.md diff --git a/docs/training_tips_tr.md b/docs/tr/training_tips_tr.md similarity index 100% rename from docs/training_tips_tr.md rename to docs/tr/training_tips_tr.md From 82900cdc90c57e913aa79895b0b401e2f0bb980a Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:08:37 +0900 Subject: [PATCH 35/65] fix docs path --- README.md | 2 +- docs/en/README.en.md | 2 +- docs/jp/README.ja.md | 2 +- docs/kr/README.ko.han.md | 2 +- docs/kr/README.ko.md | 2 +- docs/tr/README.tr.md | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8e39559..74680cb 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ ------ -[**English**](./docs/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/README.ja.md) | [**한국어**](./docs/README.ko.md) ([**韓國語**](./docs/README.ko.han.md)) | [**Türkçe**](./docs/README.tr.md) +[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) 点此查看我们的[演示视频](https://www.bilibili.com/video/BV1pm4y1z7Gm/) ! diff --git a/docs/en/README.en.md b/docs/en/README.en.md index 09cc102..7a1faac 100644 --- a/docs/en/README.en.md +++ b/docs/en/README.en.md @@ -18,7 +18,7 @@ An easy-to-use Voice Conversion framework based on VITS.

------ [**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) Check our [Demo Video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here! diff --git a/docs/jp/README.ja.md b/docs/jp/README.ja.md index 5bb2ba2..622b079 100644 --- a/docs/jp/README.ja.md +++ b/docs/jp/README.ja.md @@ -19,7 +19,7 @@ VITSに基づく使いやすい音声変換(voice changer)framework

[**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_CN.md) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) > デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください。 diff --git a/docs/kr/README.ko.han.md b/docs/kr/README.ko.han.md index 78ceaac..18f636a 100644 --- a/docs/kr/README.ko.han.md +++ b/docs/kr/README.ko.han.md @@ -18,7 +18,7 @@ VITS基盤의 簡單하고使用하기 쉬운音聲變換틀

------ [**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) > [示範映像](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 確認해 보세요! diff --git a/docs/kr/README.ko.md b/docs/kr/README.ko.md index 0689688..b469ede 100644 --- a/docs/kr/README.ko.md +++ b/docs/kr/README.ko.md @@ -19,7 +19,7 @@ VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크. [데모 영상](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 확인해 보세요! diff --git a/docs/tr/README.tr.md b/docs/tr/README.tr.md index 8c04cd2..0dc6b24 100644 --- a/docs/tr/README.tr.md +++ b/docs/tr/README.tr.md @@ -20,7 +20,7 @@ Kolay kullanılabilen VITS tabanlı bir Ses Dönüşümü çerçevesi.

------ [**Değişiklik Kaydı**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_TR.md) | [**SSS (Sıkça Sorulan Sorular)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./README.en.md) | [**中文简体**](../README.md) | [**日本語**](./README.ja.md) | [**한국어**](./README.ko.md) ([**韓國語**](./README.ko.han.md)) | [**Türkçe**](./README.tr.md) +[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) Demo Videosu için [buraya](https://www.bilibili.com/video/BV1pm4y1z7Gm/) bakın! From a93e4f1cc3932debdc9eeccf9af773af0d801147 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:12:03 +0900 Subject: [PATCH 36/65] fix docs path(test) --- docs/en/README.en.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/README.en.md b/docs/en/README.en.md index 7a1faac..3723340 100644 --- a/docs/en/README.en.md +++ b/docs/en/README.en.md @@ -18,7 +18,7 @@ An easy-to-use Voice Conversion framework based on VITS.

------ [**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../docs/kr/README.ko.md) ([**韓國語**](../docs/kr/README.ko.han.md)) | [**Türkçe**](../docs/tr/README.tr.md) Check our [Demo Video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here! From 6e389cf915815d8380409b8fd191ff6514d47218 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:13:47 +0900 Subject: [PATCH 37/65] fix docs path --- docs/en/README.en.md | 2 +- docs/jp/README.ja.md | 2 +- docs/kr/README.ko.han.md | 2 +- docs/kr/README.ko.md | 2 +- docs/tr/README.tr.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/README.en.md b/docs/en/README.en.md index 3723340..9085eef 100644 --- a/docs/en/README.en.md +++ b/docs/en/README.en.md @@ -18,7 +18,7 @@ An easy-to-use Voice Conversion framework based on VITS.

------ [**Changelog**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_EN.md) | [**FAQ (Frequently Asked Questions)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../docs/kr/README.ko.md) ([**韓國語**](../docs/kr/README.ko.han.md)) | [**Türkçe**](../docs/tr/README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) Check our [Demo Video](https://www.bilibili.com/video/BV1pm4y1z7Gm/) here! diff --git a/docs/jp/README.ja.md b/docs/jp/README.ja.md index 622b079..151959e 100644 --- a/docs/jp/README.ja.md +++ b/docs/jp/README.ja.md @@ -19,7 +19,7 @@ VITSに基づく使いやすい音声変換(voice changer)framework

[**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_CN.md) -[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) > デモ動画は[こちら](https://www.bilibili.com/video/BV1pm4y1z7Gm/)でご覧ください。 diff --git a/docs/kr/README.ko.han.md b/docs/kr/README.ko.han.md index 18f636a..54ecf5d 100644 --- a/docs/kr/README.ko.han.md +++ b/docs/kr/README.ko.han.md @@ -18,7 +18,7 @@ VITS基盤의 簡單하고使用하기 쉬운音聲變換틀

------ [**更新日誌**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_KO.md) -[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) > [示範映像](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 確認해 보세요! diff --git a/docs/kr/README.ko.md b/docs/kr/README.ko.md index b469ede..748474c 100644 --- a/docs/kr/README.ko.md +++ b/docs/kr/README.ko.md @@ -19,7 +19,7 @@ VITS 기반의 간단하고 사용하기 쉬운 음성 변환 프레임워크. [데모 영상](https://www.bilibili.com/video/BV1pm4y1z7Gm/)을 확인해 보세요! diff --git a/docs/tr/README.tr.md b/docs/tr/README.tr.md index 0dc6b24..62cfa05 100644 --- a/docs/tr/README.tr.md +++ b/docs/tr/README.tr.md @@ -20,7 +20,7 @@ Kolay kullanılabilen VITS tabanlı bir Ses Dönüşümü çerçevesi.

------ [**Değişiklik Kaydı**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/blob/main/docs/Changelog_TR.md) | [**SSS (Sıkça Sorulan Sorular)**](https://github.com/RVC-Project/Retrieval-based-Voice-Conversion-WebUI/wiki/FAQ-(Frequently-Asked-Questions)) -[**English**](./docs/en/README.en.md) | [**中文简体**](./README.md) | [**日本語**](./docs/jp/README.ja.md) | [**한국어**](./docs/kr/README.ko.md) ([**韓國語**](./docs/kr/README.ko.han.md)) | [**Türkçe**](./docs/tr/README.tr.md) +[**English**](../en/README.en.md) | [**中文简体**](../../README.md) | [**日本語**](../jp/README.ja.md) | [**한국어**](../kr/README.ko.md) ([**韓國語**](../kr/README.ko.han.md)) | [**Türkçe**](../tr/README.tr.md) Demo Videosu için [buraya](https://www.bilibili.com/video/BV1pm4y1z7Gm/) bakın! From 0902f1711de830aacb7080134c53764b5c9c09de Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Sun, 27 Aug 2023 22:21:48 +0900 Subject: [PATCH 38/65] sort import PEP8 --- infer-web.py | 29 +++++++++-------------------- 1 file changed, 9 insertions(+), 20 deletions(-) diff --git a/infer-web.py b/infer-web.py index 4f0fa3f..d3a923b 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1,28 +1,22 @@ +import logging import os import shutil -import sys - -now_dir = os.getcwd() -sys.path.append(now_dir) +import threading import traceback import warnings - -import numpy as np -import torch - -import logging -import threading from random import shuffle from subprocess import Popen from time import sleep +import fairseq import faiss import gradio as gr +import numpy as np +import torch +from dotenv import load_dotenv +from sklearn.cluster import MiniBatchKMeans from configs.config import Config -import soundfile as sf - -import fairseq from i18n.i18n import I18nAuto from infer.lib.train.process_ckpt import ( change_info, @@ -30,14 +24,9 @@ from infer.lib.train.process_ckpt import ( merge, show_info, ) - -from sklearn.cluster import MiniBatchKMeans - -from dotenv import load_dotenv - -from infer.modules.vc.modules import VC -from infer.modules.uvr5.modules import uvr from infer.modules.onnx.export import export_onnx +from infer.modules.uvr5.modules import uvr +from infer.modules.vc.modules import VC logging.getLogger("numba").setLevel(logging.WARNING) From aa37b92bd3fe2accc1c15acd235c98976a76a9aa Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 00:36:56 +0900 Subject: [PATCH 39/65] rewrite infer_batch_rvc.py --- tools/infer_batch_rvc.py | 70 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 tools/infer_batch_rvc.py diff --git a/tools/infer_batch_rvc.py b/tools/infer_batch_rvc.py new file mode 100644 index 0000000..b8d73e7 --- /dev/null +++ b/tools/infer_batch_rvc.py @@ -0,0 +1,70 @@ +import argparse +import os +import sys + +print("Command-line arguments:", sys.argv) + +now_dir = os.getcwd() +sys.path.append(now_dir) +import sys + +import tqdm as tq +from dotenv import load_dotenv +from scipy.io import wavfile + +from configs.config import Config +from infer.modules.vc.modules import VC + + +def args() -> tuple: + parser = argparse.ArgumentParser() + parser.add_argument("--f0up_key", type=int, default=0) + parser.add_argument("--input_path", type=str, help="input path") + parser.add_argument("--index_path", type=str, help="index path") + parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") + parser.add_argument("--opt_path", type=str, help="opt path") + 
parser.add_argument("--model_name", type=str, help="store in assets/weight_root") + parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") + parser.add_argument("--device", type=str, help="device") + parser.add_argument("--is_half", type=bool, help="use half -> True") + parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") + parser.add_argument("--resample_sr", type=int, default=0, help="resample sr") + parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") + parser.add_argument("--protect", type=float, default=0.33, help="protect") + + args = parser.parse_args() + sys.argv = sys.argv[:1] + + return args + + +def main(): + vc.get_vc(args.model_name) + audios = os.listdir(args.input_path) + for file in tq.tqdm(audios): + if file.endswith(".wav"): + file_path = os.path.join(args.input_path, file) + _, wav_opt = vc.vc_single( + 0, + file_path, + args.f0up_key, + None, + args.f0method, + args.index_path, + None, + args.index_rate, + args.filter_radius, + args.resample_sr, + args.rms_mix_rate, + args.protect, + ) + out_path = os.path.join(args.opt_path, file) + wavfile.write(out_path, vc.tgt_sr, wav_opt[1]) + + +if __name__ == "__main__": + load_dotenv() + args = args() + config = Config() + vc = VC(config) + main() From 34de3492f369bbe9ec881bb6c1f01eb9ddabf844 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 01:14:36 +0900 Subject: [PATCH 40/65] rewrite infer_cli --- infer_cli.py | 272 --------------------------------------- tools/infer_batch_rvc.py | 14 +- tools/infer_cli.py | 65 ++++++++++ 3 files changed, 73 insertions(+), 278 deletions(-) delete mode 100644 infer_cli.py create mode 100644 tools/infer_cli.py diff --git a/infer_cli.py b/infer_cli.py deleted file mode 100644 index 59f246b..0000000 --- a/infer_cli.py +++ /dev/null @@ -1,272 +0,0 @@ -from scipy.io import wavfile -from fairseq import checkpoint_utils -from lib.audio import load_audio -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.train.vc_infer_pipeline import VC -from multiprocessing import cpu_count -import numpy as np -import torch -import sys -import glob -import argparse -import os -import sys -import pdb -import torch - -now_dir = os.getcwd() -sys.path.append(now_dir) - -#### -# USAGE -# -# In your Terminal or CMD or whatever -# python infer_cli.py [TRANSPOSE_VALUE] "[INPUT_PATH]" "[OUTPUT_PATH]" "[MODEL_PATH]" "[INDEX_FILE_PATH]" "[INFERENCE_DEVICE]" "[METHOD]" - -using_cli = False -device = "cuda:0" -is_half = False - -if len(sys.argv) > 0: - f0_up_key = int(sys.argv[1]) # transpose value - input_path = sys.argv[2] - output_path = sys.argv[3] - model_path = sys.argv[4] - file_index = sys.argv[5] # .index file - device = sys.argv[6] - f0_method = sys.argv[7] # pm or harvest or crepe - - using_cli = True - - # file_index2=sys.argv[8] - # index_rate=float(sys.argv[10]) #search feature ratio - # filter_radius=float(sys.argv[11]) #median filter - # resample_sr=float(sys.argv[12]) #resample audio in post processing - # rms_mix_rate=float(sys.argv[13]) #search feature - print(sys.argv) - - -class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if 
torch.cuda.is_available() and device != "cpu": - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - if ( - ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) - or "P40" in self.gpu_name.upper() - or "1060" in self.gpu_name - or "1070" in self.gpu_name - or "1080" in self.gpu_name - ): - print("16系/10系显卡和P40强制单精度") - self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - else: - self.gpu_name = None - self.gpu_mem = int( - torch.cuda.get_device_properties(i_device).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") - self.device = "mps" - else: - print("没有发现支持的N卡, 使用CPU进行推理") - self.device = "cpu" - self.is_half = False - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G显存配置 - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 - else: - # 5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return x_pad, x_query, x_center, x_max - - -config = Config(device, is_half) -now_dir = os.getcwd() -sys.path.append(now_dir) - -hubert_model = None - - -def load_hubert(): - global hubert_model - models, _, _ = checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(config.device) - if config.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single( - sid=0, - input_audio_path=None, - f0_up_key=0, - f0_file=None, - f0_method="pm", - file_index="", # .index file - file_index2="", - # file_big_npy, - index_rate=1.0, - filter_radius=3, - resample_sr=0, - rms_mix_rate=1.0, - model_path="", - output_path="", - protect=0.33, -): - global tgt_sr, net_g, vc, hubert_model, version - get_vc(model_path) - if input_audio_path is None: - return "You need to upload an audio file", None - - f0_up_key = int(f0_up_key) - audio = load_audio(input_audio_path, 16000) - audio_max = np.abs(audio).max() / 0.95 - - if audio_max > 1: - audio /= audio_max - times = [0, 0, 0] - - if hubert_model == None: - load_hubert() - - if_f0 = cpt.get("f0", 1) - - file_index = ( - ( - file_index.strip(" ") - .strip('"') - .strip("\n") - .strip('"') - .strip(" ") - .replace("trained", "added") - ) - if file_index != "" - else file_index2 - ) - - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - f0_file=f0_file, - protect=protect, - ) - wavfile.write(output_path, tgt_sr, audio_opt) - return "processed" - - -def get_vc(model_path): - global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version - print("loading pth %s" % model_path) - cpt = 
torch.load(model_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) - net_g.eval().to(device) - if is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - # return {"visible": True,"maximum": n_spk, "__type__": "update"} - - -if using_cli: - vc_single( - sid=0, - input_audio_path=input_path, - f0_up_key=f0_up_key, - f0_file=None, - f0_method=f0_method, - file_index=file_index, - file_index2="", - index_rate=1, - filter_radius=3, - resample_sr=0, - rms_mix_rate=0, - model_path=model_path, - output_path=output_path, - ) diff --git a/tools/infer_batch_rvc.py b/tools/infer_batch_rvc.py index b8d73e7..763d17f 100644 --- a/tools/infer_batch_rvc.py +++ b/tools/infer_batch_rvc.py @@ -16,7 +16,7 @@ from configs.config import Config from infer.modules.vc.modules import VC -def args() -> tuple: +def arg_parse() -> tuple: parser = argparse.ArgumentParser() parser.add_argument("--f0up_key", type=int, default=0) parser.add_argument("--input_path", type=str, help="input path") @@ -39,6 +39,12 @@ def args() -> tuple: def main(): + load_dotenv() + args = arg_parse() + config = Config() + config.device = args.device if args.device else config.device + config.is_half = args.is_half if args.is_half else config.is_half + vc = VC(config) vc.get_vc(args.model_name) audios = os.listdir(args.input_path) for file in tq.tqdm(audios): @@ -59,12 +65,8 @@ def main(): args.protect, ) out_path = os.path.join(args.opt_path, file) - wavfile.write(out_path, vc.tgt_sr, wav_opt[1]) + wavfile.write(out_path, wav_opt[0], wav_opt[1]) if __name__ == "__main__": - load_dotenv() - args = args() - config = Config() - vc = VC(config) main() diff --git a/tools/infer_cli.py b/tools/infer_cli.py new file mode 100644 index 0000000..4a7dca5 --- /dev/null +++ b/tools/infer_cli.py @@ -0,0 +1,65 @@ +import argparse +import os +import sys +now_dir = os.getcwd() +sys.path.append(now_dir) +from scipy.io import wavfile + +from configs.config import Config +from infer.modules.vc.modules import VC +from dotenv import load_dotenv +#### +# USAGE +# +# In your Terminal or CMD or whatever + + +def arg_parse() -> tuple: + parser = argparse.ArgumentParser() + parser.add_argument("--f0up_key", type=int, default=0) + parser.add_argument("--input_path", type=str, help="input path") + parser.add_argument("--index_path", type=str, help="index path") + parser.add_argument("--f0method", type=str, default="harvest", help="harvest or pm") + parser.add_argument("--opt_path", type=str, help="opt path") + parser.add_argument("--model_name", type=str, help="store in assets/weight_root") + parser.add_argument("--index_rate", type=float, default=0.66, help="index rate") + parser.add_argument("--device", type=str, help="device") + parser.add_argument("--is_half", type=bool, help="use half -> True") + parser.add_argument("--filter_radius", type=int, default=3, help="filter radius") + parser.add_argument("--resample_sr", type=int, default=0, help="resample 
sr") + parser.add_argument("--rms_mix_rate", type=float, default=1, help="rms mix rate") + parser.add_argument("--protect", type=float, default=0.33, help="protect") + + args = parser.parse_args() + sys.argv = sys.argv[:1] + + return args + + +def main(): + load_dotenv() + args = arg_parse() + config = Config() + config.device = args.device if args.device else config.device + config.is_half = args.is_half if args.is_half else config.is_half + vc = VC(config) + vc.get_vc(args.model_name) + _, wav_opt = vc.vc_single( + 0, + args.input_path, + args.f0up_key, + None, + args.f0method, + args.index_path, + None, + args.index_rate, + args.filter_radius, + args.resample_sr, + args.rms_mix_rate, + args.protect, + ) + wavfile.write(args.opt_path, wav_opt[0], wav_opt[1]) + + +if __name__ == "__main__": + main() From 72ff21a36ece702858ade909ee284b0defb9c3c7 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Sun, 27 Aug 2023 16:15:44 +0000 Subject: [PATCH 41/65] Apply Code Formatter Change --- tools/infer_cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/infer_cli.py b/tools/infer_cli.py index 4a7dca5..c885a07 100644 --- a/tools/infer_cli.py +++ b/tools/infer_cli.py @@ -1,6 +1,7 @@ import argparse import os import sys + now_dir = os.getcwd() sys.path.append(now_dir) from scipy.io import wavfile @@ -8,6 +9,7 @@ from scipy.io import wavfile from configs.config import Config from infer.modules.vc.modules import VC from dotenv import load_dotenv + #### # USAGE # @@ -57,7 +59,7 @@ def main(): args.resample_sr, args.rms_mix_rate, args.protect, - ) + ) wavfile.write(args.opt_path, wav_opt[0], wav_opt[1]) From 424932c46980969a14c539be14f9410fbc90f9e2 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 01:27:08 +0900 Subject: [PATCH 42/65] Delete duplicate files --- extract_feature_print.py | 135 ------- infer_batch_rvc.py | 216 ---------- infer_uvr5.py | 363 ----------------- lib/audio.py | 21 - lib/rmvpe.py | 692 --------------------------------- lib/slicer2.py | 260 ------------- lib/train/cmd.txt | 1 - lib/train/data_utils.py | 512 ------------------------ lib/train/losses.py | 58 --- lib/train/mel_processing.py | 130 ------- lib/train/process_ckpt.py | 259 ------------ lib/train/utils.py | 487 ----------------------- lib/train/vc_infer_pipeline.py | 449 --------------------- 13 files changed, 3583 deletions(-) delete mode 100644 extract_feature_print.py delete mode 100644 infer_batch_rvc.py delete mode 100644 infer_uvr5.py delete mode 100644 lib/audio.py delete mode 100644 lib/rmvpe.py delete mode 100644 lib/slicer2.py delete mode 100644 lib/train/cmd.txt delete mode 100644 lib/train/data_utils.py delete mode 100644 lib/train/losses.py delete mode 100644 lib/train/mel_processing.py delete mode 100644 lib/train/process_ckpt.py delete mode 100644 lib/train/utils.py delete mode 100644 lib/train/vc_infer_pipeline.py diff --git a/extract_feature_print.py b/extract_feature_print.py deleted file mode 100644 index e613de4..0000000 --- a/extract_feature_print.py +++ /dev/null @@ -1,135 +0,0 @@ -import os, sys, traceback - -os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" -os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" - -device = sys.argv[1] -n_part = int(sys.argv[2]) -i_part = int(sys.argv[3]) -if len(sys.argv) == 6: - exp_dir = sys.argv[4] - version = sys.argv[5] -else: - i_gpu = sys.argv[4] - exp_dir = sys.argv[5] - os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) - version = sys.argv[6] -import torch -import torch.nn.functional as F 
-import soundfile as sf -import numpy as np -import fairseq - -if "privateuseone" not in device: - device = "cpu" - if torch.cuda.is_available(): - device = "cuda" - elif torch.backends.mps.is_available(): - device = "mps" -else: - import torch_directml - - device = torch_directml.device(torch_directml.default_device()) - - def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - - fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml - -f = open("%s/extract_f0_feature.log" % exp_dir, "a+") - - -def printt(strr): - print(strr) - f.write("%s\n" % strr) - f.flush() - - -printt(sys.argv) -model_path = "hubert_base.pt" - -printt(exp_dir) -wavPath = "%s/1_16k_wavs" % exp_dir -outPath = ( - "%s/3_feature256" % exp_dir if version == "v1" else "%s/3_feature768" % exp_dir -) -os.makedirs(outPath, exist_ok=True) - - -# wave must be 16k, hop_size=320 -def readwave(wav_path, normalize=False): - wav, sr = sf.read(wav_path) - assert sr == 16000 - feats = torch.from_numpy(wav).float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - if normalize: - with torch.no_grad(): - feats = F.layer_norm(feats, feats.shape) - feats = feats.view(1, -1) - return feats - - -# HuBERT model -printt("load model(s) from {}".format(model_path)) -# if hubert model is exist -if os.access(model_path, os.F_OK) == False: - printt( - "Error: Extracting is shut down because %s does not exist, you may download it from https://huggingface.co/lj1995/VoiceConversionWebUI/tree/main" - % model_path - ) - exit(0) -models, saved_cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task( - [model_path], - suffix="", -) -model = models[0] -model = model.to(device) -printt("move model to %s" % device) -if device not in ["mps", "cpu"]: - model = model.half() -model.eval() - -todo = sorted(list(os.listdir(wavPath)))[i_part::n_part] -n = max(1, len(todo) // 10) # 最多打印十条 -if len(todo) == 0: - printt("no-feature-todo") -else: - printt("all-feature-%s" % len(todo)) - for idx, file in enumerate(todo): - try: - if file.endswith(".wav"): - wav_path = "%s/%s" % (wavPath, file) - out_path = "%s/%s" % (outPath, file.replace("wav", "npy")) - - if os.path.exists(out_path): - continue - - feats = readwave(wav_path, normalize=saved_cfg.task.normalize) - padding_mask = torch.BoolTensor(feats.shape).fill_(False) - inputs = { - "source": feats.half().to(device) - if device not in ["mps", "cpu"] - else feats.to(device), - "padding_mask": padding_mask.to(device), - "output_layer": 9 if version == "v1" else 12, # layer 9 - } - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = ( - model.final_proj(logits[0]) if version == "v1" else logits[0] - ) - - feats = feats.squeeze(0).float().cpu().numpy() - if np.isnan(feats).sum() == 0: - np.save(out_path, feats, allow_pickle=False) - else: - printt("%s-contains nan" % file) - if idx % n == 0: - printt("now-%s,all-%s,%s,%s" % (len(todo), idx, file, feats.shape)) - except: - printt(traceback.format_exc()) - printt("all-feature-done") diff --git a/infer_batch_rvc.py b/infer_batch_rvc.py deleted file mode 100644 index 3fc9a05..0000000 --- a/infer_batch_rvc.py +++ /dev/null @@ -1,216 +0,0 @@ -""" -v1 -runtime\python.exe myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\logs\mi-test\added_IVF677_Flat_nprobe_7.index" harvest "E:\codes\py39\RVC-beta\output" "E:\codes\py39\test-20230416b\weights\mi-test.pth" 0.66 cuda:0 True 3 0 1 0.33 -v2 -runtime\python.exe 
myinfer-v2-0528.py 0 "E:\codes\py39\RVC-beta\todo-songs" "E:\codes\py39\test-20230416b\logs\mi-test-v2\aadded_IVF677_Flat_nprobe_1_v2.index" harvest "E:\codes\py39\RVC-beta\output_v2" "E:\codes\py39\test-20230416b\weights\mi-test-v2.pth" 0.66 cuda:0 True 3 0 1 0.33 -""" -import os, sys, pdb, torch - -now_dir = os.getcwd() -sys.path.append(now_dir) -import sys -import torch -import tqdm as tq -from multiprocessing import cpu_count - - -class Config: - def __init__(self, device, is_half): - self.device = device - self.is_half = is_half - self.n_cpu = 0 - self.gpu_name = None - self.gpu_mem = None - self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config() - - def device_config(self) -> tuple: - if torch.cuda.is_available(): - i_device = int(self.device.split(":")[-1]) - self.gpu_name = torch.cuda.get_device_name(i_device) - if ( - ("16" in self.gpu_name and "V100" not in self.gpu_name.upper()) - or "P40" in self.gpu_name.upper() - or "1060" in self.gpu_name - or "1070" in self.gpu_name - or "1080" in self.gpu_name - ): - print("16系/10系显卡和P40强制单精度") - self.is_half = False - for config_file in ["32k.json", "40k.json", "48k.json"]: - with open(f"configs/{config_file}", "r") as f: - strr = f.read().replace("true", "false") - with open(f"configs/{config_file}", "w") as f: - f.write(strr) - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - else: - self.gpu_name = None - self.gpu_mem = int( - torch.cuda.get_device_properties(i_device).total_memory - / 1024 - / 1024 - / 1024 - + 0.4 - ) - if self.gpu_mem <= 4: - with open("trainset_preprocess_pipeline_print.py", "r") as f: - strr = f.read().replace("3.7", "3.0") - with open("trainset_preprocess_pipeline_print.py", "w") as f: - f.write(strr) - elif torch.backends.mps.is_available(): - print("没有发现支持的N卡, 使用MPS进行推理") - self.device = "mps" - else: - print("没有发现支持的N卡, 使用CPU进行推理") - self.device = "cpu" - self.is_half = True - - if self.n_cpu == 0: - self.n_cpu = cpu_count() - - if self.is_half: - # 6G显存配置 - x_pad = 3 - x_query = 10 - x_center = 60 - x_max = 65 - else: - # 5G显存配置 - x_pad = 1 - x_query = 6 - x_center = 38 - x_max = 41 - - if self.gpu_mem != None and self.gpu_mem <= 4: - x_pad = 1 - x_query = 5 - x_center = 30 - x_max = 32 - - return x_pad, x_query, x_center, x_max - - -f0up_key = sys.argv[1] -input_path = sys.argv[2] -index_path = sys.argv[3] -f0method = sys.argv[4] # harvest or pm -opt_path = sys.argv[5] -model_path = sys.argv[6] -index_rate = float(sys.argv[7]) -device = sys.argv[8] -is_half = sys.argv[9].lower() != "false" -filter_radius = int(sys.argv[10]) -resample_sr = int(sys.argv[11]) -rms_mix_rate = float(sys.argv[12]) -protect = float(sys.argv[13]) -print(sys.argv) -config = Config(device, is_half) -now_dir = os.getcwd() -sys.path.append(now_dir) -from lib.train.vc_infer_pipeline import VC -from lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) -from lib.audio import load_audio -from fairseq import checkpoint_utils -from scipy.io import wavfile - -hubert_model = None - - -def load_hubert(hubert_model_path="hubert_base.pt"): - global hubert_model - models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task( - [hubert_model_path], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(device) - if is_half: - hubert_model = hubert_model.half() 
- else: - hubert_model = hubert_model.float() - hubert_model.eval() - - -def vc_single(sid, input_audio, f0_up_key, f0_file, f0_method, file_index, index_rate): - global tgt_sr, net_g, vc, hubert_model, version - if input_audio is None: - return "You need to upload an audio", None - f0_up_key = int(f0_up_key) - audio = load_audio(input_audio, 16000) - times = [0, 0, 0] - if hubert_model == None: - load_hubert() - if_f0 = cpt.get("f0", 1) - # audio_opt=vc.pipeline(hubert_model,net_g,sid,audio,times,f0_up_key,f0_method,file_index,file_big_npy,index_rate,if_f0,f0_file=f0_file) - audio_opt = vc.pipeline( - hubert_model, - net_g, - sid, - audio, - input_audio, - times, - f0_up_key, - f0_method, - file_index, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=f0_file, - ) - print(times) - return audio_opt - - -def get_vc(model_path): - global n_spk, tgt_sr, net_g, vc, cpt, device, is_half, version - print("loading pth %s" % model_path) - cpt = torch.load(model_path, map_location="cpu") - tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] # n_spk - if_f0 = cpt.get("f0", 1) - version = cpt.get("version", "v1") - if version == "v1": - if if_f0 == 1: - net_g = SynthesizerTrnMs256NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs256NSFsid_nono(*cpt["config"]) - elif version == "v2": - if if_f0 == 1: # - net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=is_half) - else: - net_g = SynthesizerTrnMs768NSFsid_nono(*cpt["config"]) - del net_g.enc_q - print(net_g.load_state_dict(cpt["weight"], strict=False)) # 不加这一行清不干净,真奇葩 - net_g.eval().to(device) - if is_half: - net_g = net_g.half() - else: - net_g = net_g.float() - vc = VC(tgt_sr, config) - n_spk = cpt["config"][-3] - # return {"visible": True,"maximum": n_spk, "__type__": "update"} - - -if __name__ == "__main__": - get_vc(model_path) - audios = os.listdir(input_path) - for file in tq.tqdm(audios): - if file.endswith(".wav"): - file_path = os.path.join(input_path, file) - wav_opt = vc_single( - 0, file_path, f0up_key, None, f0method, index_path, index_rate - ) - out_path = os.path.join(opt_path, file) - wavfile.write(out_path, tgt_sr, wav_opt) diff --git a/infer_uvr5.py b/infer_uvr5.py deleted file mode 100644 index 0ffdb5d..0000000 --- a/infer_uvr5.py +++ /dev/null @@ -1,363 +0,0 @@ -import os, sys, torch, warnings, pdb - -now_dir = os.getcwd() -sys.path.append(now_dir) -from json import load as ll - -warnings.filterwarnings("ignore") -import librosa -import importlib -import numpy as np -import hashlib, math -from tqdm import tqdm -from lib.uvr5_pack.lib_v5 import spec_utils -from lib.uvr5_pack.utils import _get_name_params, inference -from lib.uvr5_pack.lib_v5.model_param_init import ModelParameters -import soundfile as sf -from lib.uvr5_pack.lib_v5.nets_new import CascadedNet -from lib.uvr5_pack.lib_v5 import nets_61968KB as nets - - -class _audio_pre_: - def __init__(self, agg, model_path, device, is_half): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": False, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v2.json") - model = nets.CascadedASPPNet(mp.param["bins"] * 2) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = 
model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_(self, music_file, ins_root=None, vocal_root=None, format="flac"): - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - print("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - print("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( 
- vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - - -class _audio_pre_new: - def __init__(self, agg, model_path, device, is_half): - self.model_path = model_path - self.device = device - self.data = { - # Processing Options - "postprocess": False, - "tta": False, - # Constants - "window_size": 512, - "agg": agg, - "high_end_process": "mirroring", - } - mp = ModelParameters("lib/uvr5_pack/lib_v5/modelparams/4band_v3.json") - nout = 64 if "DeReverb" in model_path else 48 - model = CascadedNet(mp.param["bins"] * 2, nout) - cpk = torch.load(model_path, map_location="cpu") - model.load_state_dict(cpk) - model.eval() - if is_half: - model = model.half().to(device) - else: - model = model.to(device) - - self.mp = mp - self.model = model - - def _path_audio_( - self, music_file, vocal_root=None, ins_root=None, format="flac" - ): # 3个VR模型vocal和ins是反的 - if ins_root is None and vocal_root is None: - return "No save root." - name = os.path.basename(music_file) - if ins_root is not None: - os.makedirs(ins_root, exist_ok=True) - if vocal_root is not None: - os.makedirs(vocal_root, exist_ok=True) - X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {} - bands_n = len(self.mp.param["band"]) - # print(bands_n) - for d in range(bands_n, 0, -1): - bp = self.mp.param["band"][d] - if d == bands_n: # high-end band - ( - X_wave[d], - _, - ) = librosa.core.load( # 理论上librosa读取可能对某些音频有bug,应该上ffmpeg读取,但是太麻烦了弃坑 - music_file, - bp["sr"], - False, - dtype=np.float32, - res_type=bp["res_type"], - ) - if X_wave[d].ndim == 1: - X_wave[d] = np.asfortranarray([X_wave[d], X_wave[d]]) - else: # lower bands - X_wave[d] = librosa.core.resample( - X_wave[d + 1], - self.mp.param["band"][d + 1]["sr"], - bp["sr"], - res_type=bp["res_type"], - ) - # Stft of wave source - X_spec_s[d] = spec_utils.wave_to_spectrogram_mt( - X_wave[d], - bp["hl"], - bp["n_fft"], - self.mp.param["mid_side"], - self.mp.param["mid_side_b2"], - self.mp.param["reverse"], - ) - # pdb.set_trace() - if d == bands_n and self.data["high_end_process"] != "none": - input_high_end_h = (bp["n_fft"] // 2 - bp["crop_stop"]) + ( - self.mp.param["pre_filter_stop"] - self.mp.param["pre_filter_start"] - ) - input_high_end = X_spec_s[d][ - :, bp["n_fft"] // 2 - input_high_end_h : bp["n_fft"] // 2, : - ] - - X_spec_m = spec_utils.combine_spectrograms(X_spec_s, self.mp) - aggresive_set = float(self.data["agg"] / 100) - aggressiveness = { - "value": aggresive_set, - "split_bin": self.mp.param["band"][1]["crop_stop"], - } - with torch.no_grad(): - pred, X_mag, X_phase = inference( - X_spec_m, self.device, self.model, aggressiveness, self.data - ) - # Postprocess - if self.data["postprocess"]: - pred_inv = np.clip(X_mag - pred, 0, np.inf) - pred = spec_utils.mask_silence(pred, pred_inv) - y_spec_m = pred * X_phase - v_spec_m = X_spec_m - y_spec_m - - if ins_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], y_spec_m, input_high_end, self.mp - ) - wav_instrument = spec_utils.cmb_spectrogram_to_wave( - y_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - 
wav_instrument = spec_utils.cmb_spectrogram_to_wave(y_spec_m, self.mp) - print("%s instruments done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - ins_root, - "instrument_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) # - else: - path = os.path.join( - ins_root, "instrument_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_instrument) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - if vocal_root is not None: - if self.data["high_end_process"].startswith("mirroring"): - input_high_end_ = spec_utils.mirroring( - self.data["high_end_process"], v_spec_m, input_high_end, self.mp - ) - wav_vocals = spec_utils.cmb_spectrogram_to_wave( - v_spec_m, self.mp, input_high_end_h, input_high_end_ - ) - else: - wav_vocals = spec_utils.cmb_spectrogram_to_wave(v_spec_m, self.mp) - print("%s vocals done" % name) - if format in ["wav", "flac"]: - sf.write( - os.path.join( - vocal_root, - "vocal_{}_{}.{}".format(name, self.data["agg"], format), - ), - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - else: - path = os.path.join( - vocal_root, "vocal_{}_{}.wav".format(name, self.data["agg"]) - ) - sf.write( - path, - (np.array(wav_vocals) * 32768).astype("int16"), - self.mp.param["sr"], - ) - if os.path.exists(path): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path, path[:-4] + ".%s" % format) - ) - - -if __name__ == "__main__": - device = "cuda" - is_half = True - # model_path = "uvr5_weights/2_HP-UVR.pth" - # model_path = "uvr5_weights/VR-DeEchoDeReverb.pth" - # model_path = "uvr5_weights/VR-DeEchoNormal.pth" - model_path = "uvr5_weights/DeEchoNormal.pth" - # pre_fun = _audio_pre_(model_path=model_path, device=device, is_half=True,agg=10) - pre_fun = _audio_pre_new(model_path=model_path, device=device, is_half=True, agg=10) - audio_path = "雪雪伴奏对消HP5.wav" - save_path = "opt" - pre_fun._path_audio_(audio_path, save_path, save_path) diff --git a/lib/audio.py b/lib/audio.py deleted file mode 100644 index 776939d..0000000 --- a/lib/audio.py +++ /dev/null @@ -1,21 +0,0 @@ -import ffmpeg -import numpy as np - - -def load_audio(file, sr): - try: - # https://github.com/openai/whisper/blob/main/whisper/audio.py#L26 - # This launches a subprocess to decode audio while down-mixing and resampling as necessary. - # Requires the ffmpeg CLI and `ffmpeg-python` package to be installed. 
- file = ( - file.strip(" ").strip('"').strip("\n").strip('"').strip(" ") - ) # 防止小白拷路径头尾带了空格和"和回车 - out, _ = ( - ffmpeg.input(file, threads=0) - .output("-", format="f32le", acodec="pcm_f32le", ac=1, ar=sr) - .run(cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True) - ) - except Exception as e: - raise RuntimeError(f"Failed to load audio: {e}") - - return np.frombuffer(out, np.float32).flatten() diff --git a/lib/rmvpe.py b/lib/rmvpe.py deleted file mode 100644 index e5fa613..0000000 --- a/lib/rmvpe.py +++ /dev/null @@ -1,692 +0,0 @@ -import torch, numpy as np, pdb -import torch.nn as nn -import torch.nn.functional as F -import torch, pdb -import numpy as np -import torch.nn.functional as F -from scipy.signal import get_window -from librosa.util import pad_center, tiny, normalize - - -###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py -def window_sumsquare( - window, - n_frames, - hop_length=200, - win_length=800, - n_fft=800, - dtype=np.float32, - norm=None, -): - """ - # from librosa 0.6 - Compute the sum-square envelope of a window function at a given hop length. - This is used to estimate modulation effects induced by windowing - observations in short-time fourier transforms. - Parameters - ---------- - window : string, tuple, number, callable, or list-like - Window specification, as in `get_window` - n_frames : int > 0 - The number of analysis frames - hop_length : int > 0 - The number of samples to advance between frames - win_length : [optional] - The length of the window function. By default, this matches `n_fft`. - n_fft : int > 0 - The length of each analysis frame. - dtype : np.dtype - The data type of the output - Returns - ------- - wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))` - The sum-squared envelope of the window function - """ - if win_length is None: - win_length = n_fft - - n = n_fft + hop_length * (n_frames - 1) - x = np.zeros(n, dtype=dtype) - - # Compute the squared window at the desired length - win_sq = get_window(window, win_length, fftbins=True) - win_sq = normalize(win_sq, norm=norm) ** 2 - win_sq = pad_center(win_sq, n_fft) - - # Fill the envelope - for i in range(n_frames): - sample = i * hop_length - x[sample : min(n, sample + n_fft)] += win_sq[: max(0, min(n_fft, n - sample))] - return x - - -class STFT(torch.nn.Module): - def __init__( - self, filter_length=1024, hop_length=512, win_length=None, window="hann" - ): - """ - This module implements an STFT using 1D convolution and 1D transpose convolutions. - This is a bit tricky so there are some cases that probably won't work as working - out the same sizes before and after in all overlap add setups is tough. Right now, - this code should work with hop lengths that are half the filter length (50% overlap - between frames). - - Keyword Arguments: - filter_length {int} -- Length of filters used (default: {1024}) - hop_length {int} -- Hop length of STFT (restrict to 50% overlap between frames) (default: {512}) - win_length {[type]} -- Length of the window function applied to each frame (if not specified, it - equals the filter length). 
(default: {None}) - window {str} -- Type of window to use (options are bartlett, hann, hamming, blackman, blackmanharris) - (default: {'hann'}) - """ - super(STFT, self).__init__() - self.filter_length = filter_length - self.hop_length = hop_length - self.win_length = win_length if win_length else filter_length - self.window = window - self.forward_transform = None - self.pad_amount = int(self.filter_length / 2) - scale = self.filter_length / self.hop_length - fourier_basis = np.fft.fft(np.eye(self.filter_length)) - - cutoff = int((self.filter_length / 2 + 1)) - fourier_basis = np.vstack( - [np.real(fourier_basis[:cutoff, :]), np.imag(fourier_basis[:cutoff, :])] - ) - forward_basis = torch.FloatTensor(fourier_basis[:, None, :]) - inverse_basis = torch.FloatTensor( - np.linalg.pinv(scale * fourier_basis).T[:, None, :] - ) - - assert filter_length >= self.win_length - # get window and zero center pad it to filter_length - fft_window = get_window(window, self.win_length, fftbins=True) - fft_window = pad_center(fft_window, size=filter_length) - fft_window = torch.from_numpy(fft_window).float() - - # window the bases - forward_basis *= fft_window - inverse_basis *= fft_window - - self.register_buffer("forward_basis", forward_basis.float()) - self.register_buffer("inverse_basis", inverse_basis.float()) - - def transform(self, input_data): - """Take input data (audio) to STFT domain. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - """ - num_batches = input_data.shape[0] - num_samples = input_data.shape[-1] - - self.num_samples = num_samples - - # similar to librosa, reflect-pad the input - input_data = input_data.view(num_batches, 1, num_samples) - # print(1234,input_data.shape) - input_data = F.pad( - input_data.unsqueeze(1), - (self.pad_amount, self.pad_amount, 0, 0, 0, 0), - mode="reflect", - ).squeeze(1) - # print(2333,input_data.shape,self.forward_basis.shape,self.hop_length) - # pdb.set_trace() - forward_transform = F.conv1d( - input_data, self.forward_basis, stride=self.hop_length, padding=0 - ) - - cutoff = int((self.filter_length / 2) + 1) - real_part = forward_transform[:, :cutoff, :] - imag_part = forward_transform[:, cutoff:, :] - - magnitude = torch.sqrt(real_part**2 + imag_part**2) - # phase = torch.atan2(imag_part.data, real_part.data) - - return magnitude # , phase - - def inverse(self, magnitude, phase): - """Call the inverse STFT (iSTFT), given magnitude and phase tensors produced - by the ```transform``` function. - - Arguments: - magnitude {tensor} -- Magnitude of STFT with shape (num_batch, - num_frequencies, num_frames) - phase {tensor} -- Phase of STFT with shape (num_batch, - num_frequencies, num_frames) - - Returns: - inverse_transform {tensor} -- Reconstructed audio given magnitude and phase. 
Of - shape (num_batch, num_samples) - """ - recombine_magnitude_phase = torch.cat( - [magnitude * torch.cos(phase), magnitude * torch.sin(phase)], dim=1 - ) - - inverse_transform = F.conv_transpose1d( - recombine_magnitude_phase, - self.inverse_basis, - stride=self.hop_length, - padding=0, - ) - - if self.window is not None: - window_sum = window_sumsquare( - self.window, - magnitude.size(-1), - hop_length=self.hop_length, - win_length=self.win_length, - n_fft=self.filter_length, - dtype=np.float32, - ) - # remove modulation effects - approx_nonzero_indices = torch.from_numpy( - np.where(window_sum > tiny(window_sum))[0] - ) - window_sum = torch.from_numpy(window_sum).to(inverse_transform.device) - inverse_transform[:, :, approx_nonzero_indices] /= window_sum[ - approx_nonzero_indices - ] - - # scale by hop ratio - inverse_transform *= float(self.filter_length) / self.hop_length - - inverse_transform = inverse_transform[..., self.pad_amount :] - inverse_transform = inverse_transform[..., : self.num_samples] - inverse_transform = inverse_transform.squeeze(1) - - return inverse_transform - - def forward(self, input_data): - """Take input data (audio) to STFT domain and then back to audio. - - Arguments: - input_data {tensor} -- Tensor of floats, with shape (num_batch, num_samples) - - Returns: - reconstruction {tensor} -- Reconstructed audio given magnitude and phase. Of - shape (num_batch, num_samples) - """ - self.magnitude, self.phase = self.transform(input_data) - reconstruction = self.inverse(self.magnitude, self.phase) - return reconstruction - - -from time import time as ttime - - -class BiGRU(nn.Module): - def __init__(self, input_features, hidden_features, num_layers): - super(BiGRU, self).__init__() - self.gru = nn.GRU( - input_features, - hidden_features, - num_layers=num_layers, - batch_first=True, - bidirectional=True, - ) - - def forward(self, x): - return self.gru(x)[0] - - -class ConvBlockRes(nn.Module): - def __init__(self, in_channels, out_channels, momentum=0.01): - super(ConvBlockRes, self).__init__() - self.conv = nn.Sequential( - nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - nn.Conv2d( - in_channels=out_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=(1, 1), - padding=(1, 1), - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - if in_channels != out_channels: - self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1)) - self.is_shortcut = True - else: - self.is_shortcut = False - - def forward(self, x): - if self.is_shortcut: - return self.conv(x) + self.shortcut(x) - else: - return self.conv(x) + x - - -class Encoder(nn.Module): - def __init__( - self, - in_channels, - in_size, - n_encoders, - kernel_size, - n_blocks, - out_channels=16, - momentum=0.01, - ): - super(Encoder, self).__init__() - self.n_encoders = n_encoders - self.bn = nn.BatchNorm2d(in_channels, momentum=momentum) - self.layers = nn.ModuleList() - self.latent_channels = [] - for i in range(self.n_encoders): - self.layers.append( - ResEncoderBlock( - in_channels, out_channels, kernel_size, n_blocks, momentum=momentum - ) - ) - self.latent_channels.append([out_channels, in_size]) - in_channels = out_channels - out_channels *= 2 - in_size //= 2 - self.out_size = in_size - self.out_channel = out_channels - - def forward(self, x): - concat_tensors = [] - x = self.bn(x) - for i in 
range(self.n_encoders): - _, x = self.layers[i](x) - concat_tensors.append(_) - return x, concat_tensors - - -class ResEncoderBlock(nn.Module): - def __init__( - self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01 - ): - super(ResEncoderBlock, self).__init__() - self.n_blocks = n_blocks - self.conv = nn.ModuleList() - self.conv.append(ConvBlockRes(in_channels, out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv.append(ConvBlockRes(out_channels, out_channels, momentum)) - self.kernel_size = kernel_size - if self.kernel_size is not None: - self.pool = nn.AvgPool2d(kernel_size=kernel_size) - - def forward(self, x): - for i in range(self.n_blocks): - x = self.conv[i](x) - if self.kernel_size is not None: - return x, self.pool(x) - else: - return x - - -class Intermediate(nn.Module): # - def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01): - super(Intermediate, self).__init__() - self.n_inters = n_inters - self.layers = nn.ModuleList() - self.layers.append( - ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum) - ) - for i in range(self.n_inters - 1): - self.layers.append( - ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum) - ) - - def forward(self, x): - for i in range(self.n_inters): - x = self.layers[i](x) - return x - - -class ResDecoderBlock(nn.Module): - def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01): - super(ResDecoderBlock, self).__init__() - out_padding = (0, 1) if stride == (1, 2) else (1, 1) - self.n_blocks = n_blocks - self.conv1 = nn.Sequential( - nn.ConvTranspose2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=(3, 3), - stride=stride, - padding=(1, 1), - output_padding=out_padding, - bias=False, - ), - nn.BatchNorm2d(out_channels, momentum=momentum), - nn.ReLU(), - ) - self.conv2 = nn.ModuleList() - self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum)) - for i in range(n_blocks - 1): - self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum)) - - def forward(self, x, concat_tensor): - x = self.conv1(x) - x = torch.cat((x, concat_tensor), dim=1) - for i in range(self.n_blocks): - x = self.conv2[i](x) - return x - - -class Decoder(nn.Module): - def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01): - super(Decoder, self).__init__() - self.layers = nn.ModuleList() - self.n_decoders = n_decoders - for i in range(self.n_decoders): - out_channels = in_channels // 2 - self.layers.append( - ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum) - ) - in_channels = out_channels - - def forward(self, x, concat_tensors): - for i in range(self.n_decoders): - x = self.layers[i](x, concat_tensors[-1 - i]) - return x - - -class DeepUnet(nn.Module): - def __init__( - self, - kernel_size, - n_blocks, - en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(DeepUnet, self).__init__() - self.encoder = Encoder( - in_channels, 128, en_de_layers, kernel_size, n_blocks, en_out_channels - ) - self.intermediate = Intermediate( - self.encoder.out_channel // 2, - self.encoder.out_channel, - inter_layers, - n_blocks, - ) - self.decoder = Decoder( - self.encoder.out_channel, en_de_layers, kernel_size, n_blocks - ) - - def forward(self, x): - x, concat_tensors = self.encoder(x) - x = self.intermediate(x) - x = self.decoder(x, concat_tensors) - return x - - -class E2E(nn.Module): - def __init__( - self, - n_blocks, - n_gru, - kernel_size, - 
en_de_layers=5, - inter_layers=4, - in_channels=1, - en_out_channels=16, - ): - super(E2E, self).__init__() - self.unet = DeepUnet( - kernel_size, - n_blocks, - en_de_layers, - inter_layers, - in_channels, - en_out_channels, - ) - self.cnn = nn.Conv2d(en_out_channels, 3, (3, 3), padding=(1, 1)) - if n_gru: - self.fc = nn.Sequential( - BiGRU(3 * 128, 256, n_gru), - nn.Linear(512, 360), - nn.Dropout(0.25), - nn.Sigmoid(), - ) - else: - self.fc = nn.Sequential( - nn.Linear(3 * nn.N_MELS, nn.N_CLASS), nn.Dropout(0.25), nn.Sigmoid() - ) - - def forward(self, mel): - # print(mel.shape) - mel = mel.transpose(-1, -2).unsqueeze(1) - x = self.cnn(self.unet(mel)).transpose(1, 2).flatten(-2) - x = self.fc(x) - # print(x.shape) - return x - - -from librosa.filters import mel - - -class MelSpectrogram(torch.nn.Module): - def __init__( - self, - is_half, - n_mel_channels, - sampling_rate, - win_length, - hop_length, - n_fft=None, - mel_fmin=0, - mel_fmax=None, - clamp=1e-5, - ): - super().__init__() - n_fft = win_length if n_fft is None else n_fft - self.hann_window = {} - mel_basis = mel( - sr=sampling_rate, - n_fft=n_fft, - n_mels=n_mel_channels, - fmin=mel_fmin, - fmax=mel_fmax, - htk=True, - ) - mel_basis = torch.from_numpy(mel_basis).float() - self.register_buffer("mel_basis", mel_basis) - self.n_fft = win_length if n_fft is None else n_fft - self.hop_length = hop_length - self.win_length = win_length - self.sampling_rate = sampling_rate - self.n_mel_channels = n_mel_channels - self.clamp = clamp - self.is_half = is_half - - def forward(self, audio, keyshift=0, speed=1, center=True): - factor = 2 ** (keyshift / 12) - n_fft_new = int(np.round(self.n_fft * factor)) - win_length_new = int(np.round(self.win_length * factor)) - hop_length_new = int(np.round(self.hop_length * speed)) - keyshift_key = str(keyshift) + "_" + str(audio.device) - if keyshift_key not in self.hann_window: - self.hann_window[keyshift_key] = torch.hann_window(win_length_new).to( - # "cpu"if(audio.device.type=="privateuseone") else audio.device - audio.device - ) - # fft = torch.stft(#doesn't support pytorch_dml - # # audio.cpu() if(audio.device.type=="privateuseone")else audio, - # audio, - # n_fft=n_fft_new, - # hop_length=hop_length_new, - # win_length=win_length_new, - # window=self.hann_window[keyshift_key], - # center=center, - # return_complex=True, - # ) - # magnitude = torch.sqrt(fft.real.pow(2) + fft.imag.pow(2)) - # print(1111111111) - # print(222222222222222,audio.device,self.is_half) - if hasattr(self, "stft") == False: - # print(n_fft_new,hop_length_new,win_length_new,audio.shape) - self.stft = STFT( - filter_length=n_fft_new, - hop_length=hop_length_new, - win_length=win_length_new, - window="hann", - ).to(audio.device) - magnitude = self.stft.transform(audio) # phase - # if (audio.device.type == "privateuseone"): - # magnitude=magnitude.to(audio.device) - if keyshift != 0: - size = self.n_fft // 2 + 1 - resize = magnitude.size(1) - if resize < size: - magnitude = F.pad(magnitude, (0, 0, 0, size - resize)) - magnitude = magnitude[:, :size, :] * self.win_length / win_length_new - mel_output = torch.matmul(self.mel_basis, magnitude) - if self.is_half == True: - mel_output = mel_output.half() - log_mel_spec = torch.log(torch.clamp(mel_output, min=self.clamp)) - # print(log_mel_spec.device.type) - return log_mel_spec - - -class RMVPE: - def __init__(self, model_path, is_half, device=None): - self.resample_kernel = {} - self.resample_kernel = {} - self.is_half = is_half - if device is None: - device = "cuda" if 
torch.cuda.is_available() else "cpu" - self.device = device - self.mel_extractor = MelSpectrogram( - is_half, 128, 16000, 1024, 160, None, 30, 8000 - ).to(device) - if "privateuseone" in str(device): - import onnxruntime as ort - - ort_session = ort.InferenceSession( - "rmvpe.onnx", providers=["DmlExecutionProvider"] - ) - self.model = ort_session - else: - model = E2E(4, 1, (2, 2)) - ckpt = torch.load(model_path, map_location="cpu") - model.load_state_dict(ckpt) - model.eval() - if is_half == True: - model = model.half() - self.model = model - self.model = self.model.to(device) - cents_mapping = 20 * np.arange(360) + 1997.3794084376191 - self.cents_mapping = np.pad(cents_mapping, (4, 4)) # 368 - - def mel2hidden(self, mel): - with torch.no_grad(): - n_frames = mel.shape[-1] - mel = F.pad( - mel, (0, 32 * ((n_frames - 1) // 32 + 1) - n_frames), mode="reflect" - ) - if "privateuseone" in str(self.device): - onnx_input_name = self.model.get_inputs()[0].name - onnx_outputs_names = self.model.get_outputs()[0].name - hidden = self.model.run( - [onnx_outputs_names], - input_feed={onnx_input_name: mel.cpu().numpy()}, - )[0] - else: - hidden = self.model(mel) - return hidden[:, :n_frames] - - def decode(self, hidden, thred=0.03): - cents_pred = self.to_local_average_cents(hidden, thred=thred) - f0 = 10 * (2 ** (cents_pred / 1200)) - f0[f0 == 10] = 0 - # f0 = np.array([10 * (2 ** (cent_pred / 1200)) if cent_pred else 0 for cent_pred in cents_pred]) - return f0 - - def infer_from_audio(self, audio, thred=0.03): - # torch.cuda.synchronize() - t0 = ttime() - mel = self.mel_extractor( - torch.from_numpy(audio).float().to(self.device).unsqueeze(0), center=True - ) - # print(123123123,mel.device.type) - # torch.cuda.synchronize() - t1 = ttime() - hidden = self.mel2hidden(mel) - # torch.cuda.synchronize() - t2 = ttime() - # print(234234,hidden.device.type) - if "privateuseone" not in str(self.device): - hidden = hidden.squeeze(0).cpu().numpy() - else: - hidden = hidden[0] - if self.is_half == True: - hidden = hidden.astype("float32") - - f0 = self.decode(hidden, thred=thred) - # torch.cuda.synchronize() - t3 = ttime() - # print("hmvpe:%s\t%s\t%s\t%s"%(t1-t0,t2-t1,t3-t2,t3-t0)) - return f0 - - def to_local_average_cents(self, salience, thred=0.05): - # t0 = ttime() - center = np.argmax(salience, axis=1) # 帧长#index - salience = np.pad(salience, ((0, 0), (4, 4))) # 帧长,368 - # t1 = ttime() - center += 4 - todo_salience = [] - todo_cents_mapping = [] - starts = center - 4 - ends = center + 5 - for idx in range(salience.shape[0]): - todo_salience.append(salience[:, starts[idx] : ends[idx]][idx]) - todo_cents_mapping.append(self.cents_mapping[starts[idx] : ends[idx]]) - # t2 = ttime() - todo_salience = np.array(todo_salience) # 帧长,9 - todo_cents_mapping = np.array(todo_cents_mapping) # 帧长,9 - product_sum = np.sum(todo_salience * todo_cents_mapping, 1) - weight_sum = np.sum(todo_salience, 1) # 帧长 - devided = product_sum / weight_sum # 帧长 - # t3 = ttime() - maxx = np.max(salience, axis=1) # 帧长 - devided[maxx <= thred] = 0 - # t4 = ttime() - # print("decode:%s\t%s\t%s\t%s" % (t1 - t0, t2 - t1, t3 - t2, t4 - t3)) - return devided - - -if __name__ == "__main__": - import soundfile as sf, librosa - - audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") - if len(audio.shape) > 1: - audio = librosa.to_mono(audio.transpose(1, 0)) - audio_bak = audio.copy() - if sampling_rate != 16000: - audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000) - model_path = 
r"D:\BaiduNetdiskDownload\RVC-beta-v2-0727AMD_realtime\rmvpe.pt" - thred = 0.03 # 0.01 - device = "cuda" if torch.cuda.is_available() else "cpu" - rmvpe = RMVPE(model_path, is_half=False, device=device) - t0 = ttime() - f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - # f0 = rmvpe.infer_from_audio(audio, thred=thred) - t1 = ttime() - print(f0.shape, t1 - t0) diff --git a/lib/slicer2.py b/lib/slicer2.py deleted file mode 100644 index 7d9d16d..0000000 --- a/lib/slicer2.py +++ /dev/null @@ -1,260 +0,0 @@ -import numpy as np - - -# This function is obtained from librosa. -def get_rms( - y, - frame_length=2048, - hop_length=512, - pad_mode="constant", -): - padding = (int(frame_length // 2), int(frame_length // 2)) - y = np.pad(y, padding, mode=pad_mode) - - axis = -1 - # put our new within-frame axis at the end for now - out_strides = y.strides + tuple([y.strides[axis]]) - # Reduce the shape on the framing axis - x_shape_trimmed = list(y.shape) - x_shape_trimmed[axis] -= frame_length - 1 - out_shape = tuple(x_shape_trimmed) + tuple([frame_length]) - xw = np.lib.stride_tricks.as_strided(y, shape=out_shape, strides=out_strides) - if axis < 0: - target_axis = axis - 1 - else: - target_axis = axis + 1 - xw = np.moveaxis(xw, -1, target_axis) - # Downsample along the target axis - slices = [slice(None)] * xw.ndim - slices[axis] = slice(0, None, hop_length) - x = xw[tuple(slices)] - - # Calculate power - power = np.mean(np.abs(x) ** 2, axis=-2, keepdims=True) - - return np.sqrt(power) - - -class Slicer: - def __init__( - self, - sr: int, - threshold: float = -40.0, - min_length: int = 5000, - min_interval: int = 300, - hop_size: int = 20, - max_sil_kept: int = 5000, - ): - if not min_length >= min_interval >= hop_size: - raise ValueError( - "The following condition must be satisfied: min_length >= min_interval >= hop_size" - ) - if not max_sil_kept >= hop_size: - raise ValueError( - "The following condition must be satisfied: max_sil_kept >= hop_size" - ) - min_interval = sr * min_interval / 1000 - self.threshold = 10 ** (threshold / 20.0) - self.hop_size = round(sr * hop_size / 1000) - self.win_size = min(round(min_interval), 4 * self.hop_size) - self.min_length = round(sr * min_length / 1000 / self.hop_size) - self.min_interval = round(min_interval / self.hop_size) - self.max_sil_kept = round(sr * max_sil_kept / 1000 / self.hop_size) - - def _apply_slice(self, waveform, begin, end): - if len(waveform.shape) > 1: - return waveform[ - :, begin * self.hop_size : min(waveform.shape[1], end * self.hop_size) - ] - else: - return waveform[ - begin * self.hop_size : min(waveform.shape[0], end * self.hop_size) - ] - - # @timeit - def slice(self, waveform): - if len(waveform.shape) > 1: - samples = waveform.mean(axis=0) - else: - samples = waveform - if samples.shape[0] <= self.min_length: - return [waveform] - rms_list = get_rms( - y=samples, frame_length=self.win_size, hop_length=self.hop_size - ).squeeze(0) - sil_tags = [] - silence_start = None - clip_start = 0 - for i, rms in enumerate(rms_list): - # Keep looping while frame is silent. - if rms < self.threshold: - # Record start of silent frames. - if silence_start is None: - silence_start = i - continue - # Keep looping while frame is not silent and silence start has not been recorded. 
- if silence_start is None: - continue - # Clear recorded silence start if interval is not enough or clip is too short - is_leading_silence = silence_start == 0 and i > self.max_sil_kept - need_slice_middle = ( - i - silence_start >= self.min_interval - and i - clip_start >= self.min_length - ) - if not is_leading_silence and not need_slice_middle: - silence_start = None - continue - # Need slicing. Record the range of silent frames to be removed. - if i - silence_start <= self.max_sil_kept: - pos = rms_list[silence_start : i + 1].argmin() + silence_start - if silence_start == 0: - sil_tags.append((0, pos)) - else: - sil_tags.append((pos, pos)) - clip_start = pos - elif i - silence_start <= self.max_sil_kept * 2: - pos = rms_list[ - i - self.max_sil_kept : silence_start + self.max_sil_kept + 1 - ].argmin() - pos += i - self.max_sil_kept - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - clip_start = pos_r - else: - sil_tags.append((min(pos_l, pos), max(pos_r, pos))) - clip_start = max(pos_r, pos) - else: - pos_l = ( - rms_list[ - silence_start : silence_start + self.max_sil_kept + 1 - ].argmin() - + silence_start - ) - pos_r = ( - rms_list[i - self.max_sil_kept : i + 1].argmin() - + i - - self.max_sil_kept - ) - if silence_start == 0: - sil_tags.append((0, pos_r)) - else: - sil_tags.append((pos_l, pos_r)) - clip_start = pos_r - silence_start = None - # Deal with trailing silence. - total_frames = rms_list.shape[0] - if ( - silence_start is not None - and total_frames - silence_start >= self.min_interval - ): - silence_end = min(total_frames, silence_start + self.max_sil_kept) - pos = rms_list[silence_start : silence_end + 1].argmin() + silence_start - sil_tags.append((pos, total_frames + 1)) - # Apply and return slices. 
- if len(sil_tags) == 0: - return [waveform] - else: - chunks = [] - if sil_tags[0][0] > 0: - chunks.append(self._apply_slice(waveform, 0, sil_tags[0][0])) - for i in range(len(sil_tags) - 1): - chunks.append( - self._apply_slice(waveform, sil_tags[i][1], sil_tags[i + 1][0]) - ) - if sil_tags[-1][1] < total_frames: - chunks.append( - self._apply_slice(waveform, sil_tags[-1][1], total_frames) - ) - return chunks - - -def main(): - import os.path - from argparse import ArgumentParser - - import librosa - import soundfile - - parser = ArgumentParser() - parser.add_argument("audio", type=str, help="The audio to be sliced") - parser.add_argument( - "--out", type=str, help="Output directory of the sliced audio clips" - ) - parser.add_argument( - "--db_thresh", - type=float, - required=False, - default=-40, - help="The dB threshold for silence detection", - ) - parser.add_argument( - "--min_length", - type=int, - required=False, - default=5000, - help="The minimum milliseconds required for each sliced audio clip", - ) - parser.add_argument( - "--min_interval", - type=int, - required=False, - default=300, - help="The minimum milliseconds for a silence part to be sliced", - ) - parser.add_argument( - "--hop_size", - type=int, - required=False, - default=10, - help="Frame length in milliseconds", - ) - parser.add_argument( - "--max_sil_kept", - type=int, - required=False, - default=500, - help="The maximum silence length kept around the sliced clip, presented in milliseconds", - ) - args = parser.parse_args() - out = args.out - if out is None: - out = os.path.dirname(os.path.abspath(args.audio)) - audio, sr = librosa.load(args.audio, sr=None, mono=False) - slicer = Slicer( - sr=sr, - threshold=args.db_thresh, - min_length=args.min_length, - min_interval=args.min_interval, - hop_size=args.hop_size, - max_sil_kept=args.max_sil_kept, - ) - chunks = slicer.slice(audio) - if not os.path.exists(out): - os.makedirs(out) - for i, chunk in enumerate(chunks): - if len(chunk.shape) > 1: - chunk = chunk.T - soundfile.write( - os.path.join( - out, - f"%s_%d.wav" - % (os.path.basename(args.audio).rsplit(".", maxsplit=1)[0], i), - ), - chunk, - sr, - ) - - -if __name__ == "__main__": - main() diff --git a/lib/train/cmd.txt b/lib/train/cmd.txt deleted file mode 100644 index e4b895e..0000000 --- a/lib/train/cmd.txt +++ /dev/null @@ -1 +0,0 @@ -python train_nsf_sim_cache_sid.py -c configs/mi_mix40k_nsf_co256_cs1sid_ms2048.json -m ft-mi \ No newline at end of file diff --git a/lib/train/data_utils.py b/lib/train/data_utils.py deleted file mode 100644 index 3437e24..0000000 --- a/lib/train/data_utils.py +++ /dev/null @@ -1,512 +0,0 @@ -import os, traceback -import numpy as np -import torch -import torch.utils.data - -from lib.train.mel_processing import spectrogram_torch -from lib.train.utils import load_wav_to_torch, load_filepaths_and_text - - -class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. 
- """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, pitch, pitchf, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, pitch, pitchf, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - pitch = audiopath_and_text[2] - pitchf = audiopath_and_text[3] - dv = audiopath_and_text[4] - - phone, pitch, pitchf = self.get_labels(phone, pitch, pitchf) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - # print(123,phone.shape,pitch.shape,spec.shape) - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - # amor - len_wav = len_min * self.hop_length - - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - - phone = phone[:len_min, :] - pitch = pitch[:len_min] - pitchf = pitchf[:len_min] - - return (spec, wav, phone, pitch, pitchf, dv) - - def get_labels(self, phone, pitch, pitchf): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - pitch = np.load(pitch) - pitchf = np.load(pitchf) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - # print(234,phone.shape,pitch.shape) - phone = phone[:n_num, :] - pitch = pitch[:n_num] - pitchf = pitchf[:n_num] - phone = torch.FloatTensor(phone) - pitch = torch.LongTensor(pitch) - pitchf = torch.FloatTensor(pitchf) - return phone, pitch, pitchf - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - print(spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, 
_use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollateMultiNSFsid: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: [text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) # (spec, wav, phone, pitch) - pitch_padded = torch.LongTensor(len(batch), max_phone_len) - pitchf_padded = torch.FloatTensor(len(batch), max_phone_len) - phone_padded.zero_() - pitch_padded.zero_() - pitchf_padded.zero_() - # dv = torch.FloatTensor(len(batch), 256)#gin=256 - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - pitch = row[3] - pitch_padded[i, : pitch.size(0)] = pitch - pitchf = row[4] - pitchf_padded[i, : pitchf.size(0)] = pitchf - - # dv[i] = row[5] - sid[i] = row[5] - - return ( - phone_padded, - phone_lengths, - pitch_padded, - pitchf_padded, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - # dv - sid, - ) - - -class TextAudioLoader(torch.utils.data.Dataset): - """ - 1) loads audio, text pairs - 2) normalizes text and converts them to sequences of integers - 3) computes spectrograms from audio files. 
- """ - - def __init__(self, audiopaths_and_text, hparams): - self.audiopaths_and_text = load_filepaths_and_text(audiopaths_and_text) - self.max_wav_value = hparams.max_wav_value - self.sampling_rate = hparams.sampling_rate - self.filter_length = hparams.filter_length - self.hop_length = hparams.hop_length - self.win_length = hparams.win_length - self.sampling_rate = hparams.sampling_rate - self.min_text_len = getattr(hparams, "min_text_len", 1) - self.max_text_len = getattr(hparams, "max_text_len", 5000) - self._filter() - - def _filter(self): - """ - Filter text & store spec lengths - """ - # Store spectrogram lengths for Bucketing - # wav_length ~= file_size / (wav_channels * Bytes per dim) = file_size / (1 * 2) - # spec_length = wav_length // hop_length - audiopaths_and_text_new = [] - lengths = [] - for audiopath, text, dv in self.audiopaths_and_text: - if self.min_text_len <= len(text) and len(text) <= self.max_text_len: - audiopaths_and_text_new.append([audiopath, text, dv]) - lengths.append(os.path.getsize(audiopath) // (3 * self.hop_length)) - self.audiopaths_and_text = audiopaths_and_text_new - self.lengths = lengths - - def get_sid(self, sid): - sid = torch.LongTensor([int(sid)]) - return sid - - def get_audio_text_pair(self, audiopath_and_text): - # separate filename and text - file = audiopath_and_text[0] - phone = audiopath_and_text[1] - dv = audiopath_and_text[2] - - phone = self.get_labels(phone) - spec, wav = self.get_audio(file) - dv = self.get_sid(dv) - - len_phone = phone.size()[0] - len_spec = spec.size()[-1] - if len_phone != len_spec: - len_min = min(len_phone, len_spec) - len_wav = len_min * self.hop_length - spec = spec[:, :len_min] - wav = wav[:, :len_wav] - phone = phone[:len_min, :] - return (spec, wav, phone, dv) - - def get_labels(self, phone): - phone = np.load(phone) - phone = np.repeat(phone, 2, axis=0) - n_num = min(phone.shape[0], 900) # DistributedBucketSampler - phone = phone[:n_num, :] - phone = torch.FloatTensor(phone) - return phone - - def get_audio(self, filename): - audio, sampling_rate = load_wav_to_torch(filename) - if sampling_rate != self.sampling_rate: - raise ValueError( - "{} SR doesn't match target {} SR".format( - sampling_rate, self.sampling_rate - ) - ) - audio_norm = audio - # audio_norm = audio / self.max_wav_value - # audio_norm = audio / np.abs(audio).max() - - audio_norm = audio_norm.unsqueeze(0) - spec_filename = filename.replace(".wav", ".spec.pt") - if os.path.exists(spec_filename): - try: - spec = torch.load(spec_filename) - except: - print(spec_filename, traceback.format_exc()) - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - else: - spec = spectrogram_torch( - audio_norm, - self.filter_length, - self.sampling_rate, - self.hop_length, - self.win_length, - center=False, - ) - spec = torch.squeeze(spec, 0) - torch.save(spec, spec_filename, _use_new_zipfile_serialization=False) - return spec, audio_norm - - def __getitem__(self, index): - return self.get_audio_text_pair(self.audiopaths_and_text[index]) - - def __len__(self): - return len(self.audiopaths_and_text) - - -class TextAudioCollate: - """Zero-pads model inputs and targets""" - - def __init__(self, return_ids=False): - self.return_ids = return_ids - - def __call__(self, batch): - """Collate's training batch from normalized text and aduio - PARAMS - ------ - batch: 
[text_normalized, spec_normalized, wav_normalized] - """ - # Right zero-pad all one-hot text sequences to max input length - _, ids_sorted_decreasing = torch.sort( - torch.LongTensor([x[0].size(1) for x in batch]), dim=0, descending=True - ) - - max_spec_len = max([x[0].size(1) for x in batch]) - max_wave_len = max([x[1].size(1) for x in batch]) - spec_lengths = torch.LongTensor(len(batch)) - wave_lengths = torch.LongTensor(len(batch)) - spec_padded = torch.FloatTensor(len(batch), batch[0][0].size(0), max_spec_len) - wave_padded = torch.FloatTensor(len(batch), 1, max_wave_len) - spec_padded.zero_() - wave_padded.zero_() - - max_phone_len = max([x[2].size(0) for x in batch]) - phone_lengths = torch.LongTensor(len(batch)) - phone_padded = torch.FloatTensor( - len(batch), max_phone_len, batch[0][2].shape[1] - ) - phone_padded.zero_() - sid = torch.LongTensor(len(batch)) - - for i in range(len(ids_sorted_decreasing)): - row = batch[ids_sorted_decreasing[i]] - - spec = row[0] - spec_padded[i, :, : spec.size(1)] = spec - spec_lengths[i] = spec.size(1) - - wave = row[1] - wave_padded[i, :, : wave.size(1)] = wave - wave_lengths[i] = wave.size(1) - - phone = row[2] - phone_padded[i, : phone.size(0), :] = phone - phone_lengths[i] = phone.size(0) - - sid[i] = row[3] - - return ( - phone_padded, - phone_lengths, - spec_padded, - spec_lengths, - wave_padded, - wave_lengths, - sid, - ) - - -class DistributedBucketSampler(torch.utils.data.distributed.DistributedSampler): - """ - Maintain similar input lengths in a batch. - Length groups are specified by boundaries. - Ex) boundaries = [b1, b2, b3] -> any batch is included either {x | b1 < length(x) <=b2} or {x | b2 < length(x) <= b3}. - - It removes samples which are not included in the boundaries. - Ex) boundaries = [b1, b2, b3] -> any x s.t. length(x) <= b1 or length(x) > b3 are discarded. 
- """ - - def __init__( - self, - dataset, - batch_size, - boundaries, - num_replicas=None, - rank=None, - shuffle=True, - ): - super().__init__(dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) - self.lengths = dataset.lengths - self.batch_size = batch_size - self.boundaries = boundaries - - self.buckets, self.num_samples_per_bucket = self._create_buckets() - self.total_size = sum(self.num_samples_per_bucket) - self.num_samples = self.total_size // self.num_replicas - - def _create_buckets(self): - buckets = [[] for _ in range(len(self.boundaries) - 1)] - for i in range(len(self.lengths)): - length = self.lengths[i] - idx_bucket = self._bisect(length) - if idx_bucket != -1: - buckets[idx_bucket].append(i) - - for i in range(len(buckets) - 1, -1, -1): # - if len(buckets[i]) == 0: - buckets.pop(i) - self.boundaries.pop(i + 1) - - num_samples_per_bucket = [] - for i in range(len(buckets)): - len_bucket = len(buckets[i]) - total_batch_size = self.num_replicas * self.batch_size - rem = ( - total_batch_size - (len_bucket % total_batch_size) - ) % total_batch_size - num_samples_per_bucket.append(len_bucket + rem) - return buckets, num_samples_per_bucket - - def __iter__(self): - # deterministically shuffle based on epoch - g = torch.Generator() - g.manual_seed(self.epoch) - - indices = [] - if self.shuffle: - for bucket in self.buckets: - indices.append(torch.randperm(len(bucket), generator=g).tolist()) - else: - for bucket in self.buckets: - indices.append(list(range(len(bucket)))) - - batches = [] - for i in range(len(self.buckets)): - bucket = self.buckets[i] - len_bucket = len(bucket) - ids_bucket = indices[i] - num_samples_bucket = self.num_samples_per_bucket[i] - - # add extra samples to make it evenly divisible - rem = num_samples_bucket - len_bucket - ids_bucket = ( - ids_bucket - + ids_bucket * (rem // len_bucket) - + ids_bucket[: (rem % len_bucket)] - ) - - # subsample - ids_bucket = ids_bucket[self.rank :: self.num_replicas] - - # batching - for j in range(len(ids_bucket) // self.batch_size): - batch = [ - bucket[idx] - for idx in ids_bucket[ - j * self.batch_size : (j + 1) * self.batch_size - ] - ] - batches.append(batch) - - if self.shuffle: - batch_ids = torch.randperm(len(batches), generator=g).tolist() - batches = [batches[i] for i in batch_ids] - self.batches = batches - - assert len(self.batches) * self.batch_size == self.num_samples - return iter(self.batches) - - def _bisect(self, x, lo=0, hi=None): - if hi is None: - hi = len(self.boundaries) - 1 - - if hi > lo: - mid = (hi + lo) // 2 - if self.boundaries[mid] < x and x <= self.boundaries[mid + 1]: - return mid - elif x <= self.boundaries[mid]: - return self._bisect(x, lo, mid) - else: - return self._bisect(x, mid + 1, hi) - else: - return -1 - - def __len__(self): - return self.num_samples // self.batch_size diff --git a/lib/train/losses.py b/lib/train/losses.py deleted file mode 100644 index aa7bd81..0000000 --- a/lib/train/losses.py +++ /dev/null @@ -1,58 +0,0 @@ -import torch - - -def feature_loss(fmap_r, fmap_g): - loss = 0 - for dr, dg in zip(fmap_r, fmap_g): - for rl, gl in zip(dr, dg): - rl = rl.float().detach() - gl = gl.float() - loss += torch.mean(torch.abs(rl - gl)) - - return loss * 2 - - -def discriminator_loss(disc_real_outputs, disc_generated_outputs): - loss = 0 - r_losses = [] - g_losses = [] - for dr, dg in zip(disc_real_outputs, disc_generated_outputs): - dr = dr.float() - dg = dg.float() - r_loss = torch.mean((1 - dr) ** 2) - g_loss = torch.mean(dg**2) - loss += r_loss + g_loss - 
r_losses.append(r_loss.item()) - g_losses.append(g_loss.item()) - - return loss, r_losses, g_losses - - -def generator_loss(disc_outputs): - loss = 0 - gen_losses = [] - for dg in disc_outputs: - dg = dg.float() - l = torch.mean((1 - dg) ** 2) - gen_losses.append(l) - loss += l - - return loss, gen_losses - - -def kl_loss(z_p, logs_q, m_p, logs_p, z_mask): - """ - z_p, logs_q: [b, h, t_t] - m_p, logs_p: [b, h, t_t] - """ - z_p = z_p.float() - logs_q = logs_q.float() - m_p = m_p.float() - logs_p = logs_p.float() - z_mask = z_mask.float() - - kl = logs_p - logs_q - 0.5 - kl += 0.5 * ((z_p - m_p) ** 2) * torch.exp(-2.0 * logs_p) - kl = torch.sum(kl * z_mask) - l = kl / torch.sum(z_mask) - return l diff --git a/lib/train/mel_processing.py b/lib/train/mel_processing.py deleted file mode 100644 index 3cc3687..0000000 --- a/lib/train/mel_processing.py +++ /dev/null @@ -1,130 +0,0 @@ -import torch -import torch.utils.data -from librosa.filters import mel as librosa_mel_fn - - -MAX_WAV_VALUE = 32768.0 - - -def dynamic_range_compression_torch(x, C=1, clip_val=1e-5): - """ - PARAMS - ------ - C: compression factor - """ - return torch.log(torch.clamp(x, min=clip_val) * C) - - -def dynamic_range_decompression_torch(x, C=1): - """ - PARAMS - ------ - C: compression factor used to compress - """ - return torch.exp(x) / C - - -def spectral_normalize_torch(magnitudes): - return dynamic_range_compression_torch(magnitudes) - - -def spectral_de_normalize_torch(magnitudes): - return dynamic_range_decompression_torch(magnitudes) - - -# Reusable banks -mel_basis = {} -hann_window = {} - - -def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False): - """Convert waveform into Linear-frequency Linear-amplitude spectrogram. - - Args: - y :: (B, T) - Audio waveforms - n_fft - sampling_rate - hop_size - win_size - center - Returns: - :: (B, Freq, Frame) - Linear-frequency Linear-amplitude spectrogram - """ - # Validation - if torch.min(y) < -1.07: - print("min value is ", torch.min(y)) - if torch.max(y) > 1.07: - print("max value is ", torch.max(y)) - - # Window - Cache if needed - global hann_window - dtype_device = str(y.dtype) + "_" + str(y.device) - wnsize_dtype_device = str(win_size) + "_" + dtype_device - if wnsize_dtype_device not in hann_window: - hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to( - dtype=y.dtype, device=y.device - ) - - # Padding - y = torch.nn.functional.pad( - y.unsqueeze(1), - (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)), - mode="reflect", - ) - y = y.squeeze(1) - - # Complex Spectrogram :: (B, T) -> (B, Freq, Frame, RealComplex=2) - spec = torch.stft( - y, - n_fft, - hop_length=hop_size, - win_length=win_size, - window=hann_window[wnsize_dtype_device], - center=center, - pad_mode="reflect", - normalized=False, - onesided=True, - return_complex=False, - ) - - # Linear-frequency Linear-amplitude spectrogram :: (B, Freq, Frame, RealComplex=2) -> (B, Freq, Frame) - spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6) - return spec - - -def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax): - # MelBasis - Cache if needed - global mel_basis - dtype_device = str(spec.dtype) + "_" + str(spec.device) - fmax_dtype_device = str(fmax) + "_" + dtype_device - if fmax_dtype_device not in mel_basis: - mel = librosa_mel_fn( - sr=sampling_rate, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax - ) - mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to( - dtype=spec.dtype, device=spec.device - ) - - # Mel-frequency Log-amplitude 
spectrogram :: (B, Freq=num_mels, Frame) - melspec = torch.matmul(mel_basis[fmax_dtype_device], spec) - melspec = spectral_normalize_torch(melspec) - return melspec - - -def mel_spectrogram_torch( - y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False -): - """Convert waveform into Mel-frequency Log-amplitude spectrogram. - - Args: - y :: (B, T) - Waveforms - Returns: - melspec :: (B, Freq, Frame) - Mel-frequency Log-amplitude spectrogram - """ - # Linear-frequency Linear-amplitude spectrogram :: (B, T) -> (B, Freq, Frame) - spec = spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center) - - # Mel-frequency Log-amplitude spectrogram :: (B, Freq, Frame) -> (B, Freq=num_mels, Frame) - melspec = spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax) - - return melspec diff --git a/lib/train/process_ckpt.py b/lib/train/process_ckpt.py deleted file mode 100644 index 324d5a5..0000000 --- a/lib/train/process_ckpt.py +++ /dev/null @@ -1,259 +0,0 @@ -import torch, traceback, os, sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from collections import OrderedDict -from i18n import I18nAuto - -i18n = I18nAuto() - - -def savee(ckpt, sr, if_f0, name, epoch, version, hps): - try: - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - opt["config"] = [ - hps.data.filter_length // 2 + 1, - 32, - hps.model.inter_channels, - hps.model.hidden_channels, - hps.model.filter_channels, - hps.model.n_heads, - hps.model.n_layers, - hps.model.kernel_size, - hps.model.p_dropout, - hps.model.resblock, - hps.model.resblock_kernel_sizes, - hps.model.resblock_dilation_sizes, - hps.model.upsample_rates, - hps.model.upsample_initial_channel, - hps.model.upsample_kernel_sizes, - hps.model.spk_embed_dim, - hps.model.gin_channels, - hps.data.sampling_rate, - ] - opt["info"] = "%sepoch" % epoch - opt["sr"] = sr - opt["f0"] = if_f0 - opt["version"] = version - torch.save(opt, "weights/%s.pth" % name) - return "Success." 
- except: - return traceback.format_exc() - - -def show_info(path): - try: - a = torch.load(path, map_location="cpu") - return "模型信息:%s\n采样率:%s\n模型是否输入音高引导:%s\n版本:%s" % ( - a.get("info", "None"), - a.get("sr", "None"), - a.get("f0", "None"), - a.get("version", "None"), - ) - except: - return traceback.format_exc() - - -def extract_small_model(path, name, sr, if_f0, info, version): - try: - ckpt = torch.load(path, map_location="cpu") - if "model" in ckpt: - ckpt = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in ckpt.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = ckpt[key].half() - if sr == "40k": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 10, 2, 2], - 512, - [16, 16, 4, 4], - 109, - 256, - 40000, - ] - elif sr == "48k": - if version == "v1": - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 6, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 48000, - ] - else: - opt["config"] = [ - 1025, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [12, 10, 2, 2], - 512, - [24, 20, 4, 4], - 109, - 256, - 48000, - ] - elif sr == "32k": - if version == "v1": - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 4, 2, 2, 2], - 512, - [16, 16, 4, 4, 4], - 109, - 256, - 32000, - ] - else: - opt["config"] = [ - 513, - 32, - 192, - 192, - 768, - 2, - 6, - 3, - 0, - "1", - [3, 7, 11], - [[1, 3, 5], [1, 3, 5], [1, 3, 5]], - [10, 8, 2, 2], - 512, - [20, 16, 4, 4], - 109, - 256, - 32000, - ] - if info == "": - info = "Extracted model." - opt["info"] = info - opt["version"] = version - opt["sr"] = sr - opt["f0"] = int(if_f0) - torch.save(opt, "weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() - - -def change_info(path, info, name): - try: - ckpt = torch.load(path, map_location="cpu") - ckpt["info"] = info - if name == "": - name = os.path.basename(path) - torch.save(ckpt, "weights/%s" % name) - return "Success." - except: - return traceback.format_exc() - - -def merge(path1, path2, alpha1, sr, f0, info, name, version): - try: - - def extract(ckpt): - a = ckpt["model"] - opt = OrderedDict() - opt["weight"] = {} - for key in a.keys(): - if "enc_q" in key: - continue - opt["weight"][key] = a[key] - return opt - - ckpt1 = torch.load(path1, map_location="cpu") - ckpt2 = torch.load(path2, map_location="cpu") - cfg = ckpt1["config"] - if "model" in ckpt1: - ckpt1 = extract(ckpt1) - else: - ckpt1 = ckpt1["weight"] - if "model" in ckpt2: - ckpt2 = extract(ckpt2) - else: - ckpt2 = ckpt2["weight"] - if sorted(list(ckpt1.keys())) != sorted(list(ckpt2.keys())): - return "Fail to merge the models. The model architectures are not the same." 
- opt = OrderedDict() - opt["weight"] = {} - for key in ckpt1.keys(): - # try: - if key == "emb_g.weight" and ckpt1[key].shape != ckpt2[key].shape: - min_shape0 = min(ckpt1[key].shape[0], ckpt2[key].shape[0]) - opt["weight"][key] = ( - alpha1 * (ckpt1[key][:min_shape0].float()) - + (1 - alpha1) * (ckpt2[key][:min_shape0].float()) - ).half() - else: - opt["weight"][key] = ( - alpha1 * (ckpt1[key].float()) + (1 - alpha1) * (ckpt2[key].float()) - ).half() - # except: - # pdb.set_trace() - opt["config"] = cfg - """ - if(sr=="40k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 10, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 40000] - elif(sr=="48k"):opt["config"] = [1025, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10,6,2,2,2], 512, [16, 16, 4, 4], 109, 256, 48000] - elif(sr=="32k"):opt["config"] = [513, 32, 192, 192, 768, 2, 6, 3, 0, "1", [3, 7, 11], [[1, 3, 5], [1, 3, 5], [1, 3, 5]], [10, 4, 2, 2, 2], 512, [16, 16, 4, 4,4], 109, 256, 32000] - """ - opt["sr"] = sr - opt["f0"] = 1 if f0 == i18n("是") else 0 - opt["version"] = version - opt["info"] = info - torch.save(opt, "weights/%s.pth" % name) - return "Success." - except: - return traceback.format_exc() diff --git a/lib/train/utils.py b/lib/train/utils.py deleted file mode 100644 index 9c0fb5c..0000000 --- a/lib/train/utils.py +++ /dev/null @@ -1,487 +0,0 @@ -import os, traceback -import glob -import sys -import argparse -import logging -import json -import subprocess -import numpy as np -from scipy.io.wavfile import read -import torch - -MATPLOTLIB_FLAG = False - -logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) -logger = logging - - -def load_checkpoint_d(checkpoint_path, combd, sbd, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - ################## - def go(model, bkey): - saved_state_dict = checkpoint_dict[bkey] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - print( - "shape-%s-mismatch|need-%s|get-%s" - % (k, state_dict[k].shape, saved_state_dict[k].shape) - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - return model - - go(combd, "combd") - model = go(sbd, "sbd") - ############# - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -# def load_checkpoint(checkpoint_path, model, optimizer=None): -# assert os.path.isfile(checkpoint_path) -# checkpoint_dict = torch.load(checkpoint_path, map_location='cpu') -# iteration = checkpoint_dict['iteration'] -# learning_rate = checkpoint_dict['learning_rate'] 
-# if optimizer is not None: -# optimizer.load_state_dict(checkpoint_dict['optimizer']) -# # print(1111) -# saved_state_dict = checkpoint_dict['model'] -# # print(1111) -# -# if hasattr(model, 'module'): -# state_dict = model.module.state_dict() -# else: -# state_dict = model.state_dict() -# new_state_dict= {} -# for k, v in state_dict.items(): -# try: -# new_state_dict[k] = saved_state_dict[k] -# except: -# logger.info("%s is not in the checkpoint" % k) -# new_state_dict[k] = v -# if hasattr(model, 'module'): -# model.module.load_state_dict(new_state_dict) -# else: -# model.load_state_dict(new_state_dict) -# logger.info("Loaded checkpoint '{}' (epoch {})" .format( -# checkpoint_path, iteration)) -# return model, optimizer, learning_rate, iteration -def load_checkpoint(checkpoint_path, model, optimizer=None, load_opt=1): - assert os.path.isfile(checkpoint_path) - checkpoint_dict = torch.load(checkpoint_path, map_location="cpu") - - saved_state_dict = checkpoint_dict["model"] - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - new_state_dict = {} - for k, v in state_dict.items(): # 模型需要的shape - try: - new_state_dict[k] = saved_state_dict[k] - if saved_state_dict[k].shape != state_dict[k].shape: - print( - "shape-%s-mismatch|need-%s|get-%s" - % (k, state_dict[k].shape, saved_state_dict[k].shape) - ) # - raise KeyError - except: - # logger.info(traceback.format_exc()) - logger.info("%s is not in the checkpoint" % k) # pretrain缺失的 - new_state_dict[k] = v # 模型自带的随机值 - if hasattr(model, "module"): - model.module.load_state_dict(new_state_dict, strict=False) - else: - model.load_state_dict(new_state_dict, strict=False) - logger.info("Loaded model weights") - - iteration = checkpoint_dict["iteration"] - learning_rate = checkpoint_dict["learning_rate"] - if ( - optimizer is not None and load_opt == 1 - ): ###加载不了,如果是空的的话,重新初始化,可能还会影响lr时间表的更新,因此在train文件最外围catch - # try: - optimizer.load_state_dict(checkpoint_dict["optimizer"]) - # except: - # traceback.print_exc() - logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, iteration)) - return model, optimizer, learning_rate, iteration - - -def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(model, "module"): - state_dict = model.module.state_dict() - else: - state_dict = model.state_dict() - torch.save( - { - "model": state_dict, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def save_checkpoint_d(combd, sbd, optimizer, learning_rate, iteration, checkpoint_path): - logger.info( - "Saving model and optimizer state at epoch {} to {}".format( - iteration, checkpoint_path - ) - ) - if hasattr(combd, "module"): - state_dict_combd = combd.module.state_dict() - else: - state_dict_combd = combd.state_dict() - if hasattr(sbd, "module"): - state_dict_sbd = sbd.module.state_dict() - else: - state_dict_sbd = sbd.state_dict() - torch.save( - { - "combd": state_dict_combd, - "sbd": state_dict_sbd, - "iteration": iteration, - "optimizer": optimizer.state_dict(), - "learning_rate": learning_rate, - }, - checkpoint_path, - ) - - -def summarize( - writer, - global_step, - scalars={}, - histograms={}, - images={}, - audios={}, - audio_sampling_rate=22050, -): - for k, v in scalars.items(): - writer.add_scalar(k, v, global_step) - for k, v in 
histograms.items(): - writer.add_histogram(k, v, global_step) - for k, v in images.items(): - writer.add_image(k, v, global_step, dataformats="HWC") - for k, v in audios.items(): - writer.add_audio(k, v, global_step, audio_sampling_rate) - - -def latest_checkpoint_path(dir_path, regex="G_*.pth"): - f_list = glob.glob(os.path.join(dir_path, regex)) - f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f)))) - x = f_list[-1] - print(x) - return x - - -def plot_spectrogram_to_numpy(spectrogram): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(10, 2)) - im = ax.imshow(spectrogram, aspect="auto", origin="lower", interpolation="none") - plt.colorbar(im, ax=ax) - plt.xlabel("Frames") - plt.ylabel("Channels") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def plot_alignment_to_numpy(alignment, info=None): - global MATPLOTLIB_FLAG - if not MATPLOTLIB_FLAG: - import matplotlib - - matplotlib.use("Agg") - MATPLOTLIB_FLAG = True - mpl_logger = logging.getLogger("matplotlib") - mpl_logger.setLevel(logging.WARNING) - import matplotlib.pylab as plt - import numpy as np - - fig, ax = plt.subplots(figsize=(6, 4)) - im = ax.imshow( - alignment.transpose(), aspect="auto", origin="lower", interpolation="none" - ) - fig.colorbar(im, ax=ax) - xlabel = "Decoder timestep" - if info is not None: - xlabel += "\n\n" + info - plt.xlabel(xlabel) - plt.ylabel("Encoder timestep") - plt.tight_layout() - - fig.canvas.draw() - data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep="") - data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,)) - plt.close() - return data - - -def load_wav_to_torch(full_path): - sampling_rate, data = read(full_path) - return torch.FloatTensor(data.astype(np.float32)), sampling_rate - - -def load_filepaths_and_text(filename, split="|"): - with open(filename, encoding="utf-8") as f: - filepaths_and_text = [line.strip().split(split) for line in f] - return filepaths_and_text - - -def get_hparams(init=True): - """ - todo: - 结尾七人组: - 保存频率、总epoch done - bs done - pretrainG、pretrainD done - 卡号:os.en["CUDA_VISIBLE_DEVICES"] done - if_latest done - 模型:if_f0 done - 采样率:自动选择config done - 是否缓存数据集进GPU:if_cache_data_in_gpu done - - -m: - 自动决定training_files路径,改掉train_nsf_load_pretrain.py里的hps.data.training_files done - -c不要了 - """ - parser = argparse.ArgumentParser() - # parser.add_argument('-c', '--config', type=str, default="configs/40k.json",help='JSON file for configuration') - parser.add_argument( - "-se", - "--save_every_epoch", - type=int, - required=True, - help="checkpoint save frequency (epoch)", - ) - parser.add_argument( - "-te", "--total_epoch", type=int, required=True, help="total_epoch" - ) - parser.add_argument( - "-pg", "--pretrainG", type=str, default="", help="Pretrained Discriminator path" - ) - parser.add_argument( - "-pd", "--pretrainD", type=str, default="", help="Pretrained Generator path" - ) - parser.add_argument("-g", "--gpus", type=str, default="0", help="split by -") - parser.add_argument( - "-bs", "--batch_size", type=int, required=True, help="batch size" - ) - parser.add_argument( - "-e", "--experiment_dir", type=str, required=True, 
help="experiment dir" - ) # -m - parser.add_argument( - "-sr", "--sample_rate", type=str, required=True, help="sample rate, 32k/40k/48k" - ) - parser.add_argument( - "-sw", - "--save_every_weights", - type=str, - default="0", - help="save the extracted model in weights directory when saving checkpoints", - ) - parser.add_argument( - "-v", "--version", type=str, required=True, help="model version" - ) - parser.add_argument( - "-f0", - "--if_f0", - type=int, - required=True, - help="use f0 as one of the inputs of the model, 1 or 0", - ) - parser.add_argument( - "-l", - "--if_latest", - type=int, - required=True, - help="if only save the latest G/D pth file, 1 or 0", - ) - parser.add_argument( - "-c", - "--if_cache_data_in_gpu", - type=int, - required=True, - help="if caching the dataset in GPU memory, 1 or 0", - ) - - args = parser.parse_args() - name = args.experiment_dir - experiment_dir = os.path.join("./logs", args.experiment_dir) - - if not os.path.exists(experiment_dir): - os.makedirs(experiment_dir) - - if args.version == "v1" or args.sample_rate == "40k": - config_path = "configs/%s.json" % args.sample_rate - else: - config_path = "configs/%s_v2.json" % args.sample_rate - config_save_path = os.path.join(experiment_dir, "config.json") - if init: - with open(config_path, "r") as f: - data = f.read() - with open(config_save_path, "w") as f: - f.write(data) - else: - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = hparams.experiment_dir = experiment_dir - hparams.save_every_epoch = args.save_every_epoch - hparams.name = name - hparams.total_epoch = args.total_epoch - hparams.pretrainG = args.pretrainG - hparams.pretrainD = args.pretrainD - hparams.version = args.version - hparams.gpus = args.gpus - hparams.train.batch_size = args.batch_size - hparams.sample_rate = args.sample_rate - hparams.if_f0 = args.if_f0 - hparams.if_latest = args.if_latest - hparams.save_every_weights = args.save_every_weights - hparams.if_cache_data_in_gpu = args.if_cache_data_in_gpu - hparams.data.training_files = "%s/filelist.txt" % experiment_dir - return hparams - - -def get_hparams_from_dir(model_dir): - config_save_path = os.path.join(model_dir, "config.json") - with open(config_save_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - hparams.model_dir = model_dir - return hparams - - -def get_hparams_from_file(config_path): - with open(config_path, "r") as f: - data = f.read() - config = json.loads(data) - - hparams = HParams(**config) - return hparams - - -def check_git_hash(model_dir): - source_dir = os.path.dirname(os.path.realpath(__file__)) - if not os.path.exists(os.path.join(source_dir, ".git")): - logger.warn( - "{} is not a git repository, therefore hash value comparison will be ignored.".format( - source_dir - ) - ) - return - - cur_hash = subprocess.getoutput("git rev-parse HEAD") - - path = os.path.join(model_dir, "githash") - if os.path.exists(path): - saved_hash = open(path).read() - if saved_hash != cur_hash: - logger.warn( - "git hash values are different. 
{}(saved) != {}(current)".format( - saved_hash[:8], cur_hash[:8] - ) - ) - else: - open(path, "w").write(cur_hash) - - -def get_logger(model_dir, filename="train.log"): - global logger - logger = logging.getLogger(os.path.basename(model_dir)) - logger.setLevel(logging.DEBUG) - - formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s") - if not os.path.exists(model_dir): - os.makedirs(model_dir) - h = logging.FileHandler(os.path.join(model_dir, filename)) - h.setLevel(logging.DEBUG) - h.setFormatter(formatter) - logger.addHandler(h) - return logger - - -class HParams: - def __init__(self, **kwargs): - for k, v in kwargs.items(): - if type(v) == dict: - v = HParams(**v) - self[k] = v - - def keys(self): - return self.__dict__.keys() - - def items(self): - return self.__dict__.items() - - def values(self): - return self.__dict__.values() - - def __len__(self): - return len(self.__dict__) - - def __getitem__(self, key): - return getattr(self, key) - - def __setitem__(self, key, value): - return setattr(self, key, value) - - def __contains__(self, key): - return key in self.__dict__ - - def __repr__(self): - return self.__dict__.__repr__() diff --git a/lib/train/vc_infer_pipeline.py b/lib/train/vc_infer_pipeline.py deleted file mode 100644 index 980fc21..0000000 --- a/lib/train/vc_infer_pipeline.py +++ /dev/null @@ -1,449 +0,0 @@ -import numpy as np, parselmouth, torch, pdb, sys, os -from time import time as ttime -import torch.nn.functional as F -import scipy.signal as signal -import pyworld, os, traceback, faiss, librosa, torchcrepe -from scipy import signal -from functools import lru_cache - -now_dir = os.getcwd() -sys.path.append(now_dir) - -bh, ah = signal.butter(N=5, Wn=48, btype="high", fs=16000) - -input_audio_path2wav = {} - - -@lru_cache -def cache_harvest_f0(input_audio_path, fs, f0max, f0min, frame_period): - audio = input_audio_path2wav[input_audio_path] - f0, t = pyworld.harvest( - audio, - fs=fs, - f0_ceil=f0max, - f0_floor=f0min, - frame_period=frame_period, - ) - f0 = pyworld.stonemask(audio, f0, t, fs) - return f0 - - -def change_rms(data1, sr1, data2, sr2, rate): # 1是输入音频,2是输出音频,rate是2的占比 - # print(data1.max(),data2.max()) - rms1 = librosa.feature.rms( - y=data1, frame_length=sr1 // 2 * 2, hop_length=sr1 // 2 - ) # 每半秒一个点 - rms2 = librosa.feature.rms(y=data2, frame_length=sr2 // 2 * 2, hop_length=sr2 // 2) - rms1 = torch.from_numpy(rms1) - rms1 = F.interpolate( - rms1.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.from_numpy(rms2) - rms2 = F.interpolate( - rms2.unsqueeze(0), size=data2.shape[0], mode="linear" - ).squeeze() - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-6) - data2 *= ( - torch.pow(rms1, torch.tensor(1 - rate)) - * torch.pow(rms2, torch.tensor(rate - 1)) - ).numpy() - return data2 - - -class VC(object): - def __init__(self, tgt_sr, config): - self.x_pad, self.x_query, self.x_center, self.x_max, self.is_half = ( - config.x_pad, - config.x_query, - config.x_center, - config.x_max, - config.is_half, - ) - self.sr = 16000 # hubert输入采样率 - self.window = 160 # 每帧点数 - self.t_pad = self.sr * self.x_pad # 每条前后pad时间 - self.t_pad_tgt = tgt_sr * self.x_pad - self.t_pad2 = self.t_pad * 2 - self.t_query = self.sr * self.x_query # 查询切点前后查询时间 - self.t_center = self.sr * self.x_center # 查询切点位置 - self.t_max = self.sr * self.x_max # 免查询时长阈值 - self.device = config.device - - def get_f0( - self, - input_audio_path, - x, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0=None, - ): - global input_audio_path2wav 
- time_step = self.window / self.sr * 1000 - f0_min = 50 - f0_max = 1100 - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - if f0_method == "pm": - f0 = ( - parselmouth.Sound(x, self.sr) - .to_pitch_ac( - time_step=time_step / 1000, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=f0_max, - ) - .selected_array["frequency"] - ) - pad_size = (p_len - len(f0) + 1) // 2 - if pad_size > 0 or p_len - len(f0) - pad_size > 0: - f0 = np.pad( - f0, [[pad_size, p_len - len(f0) - pad_size]], mode="constant" - ) - elif f0_method == "harvest": - input_audio_path2wav[input_audio_path] = x.astype(np.double) - f0 = cache_harvest_f0(input_audio_path, self.sr, f0_max, f0_min, 10) - if filter_radius > 2: - f0 = signal.medfilt(f0, 3) - elif f0_method == "crepe": - model = "full" - # Pick a batch size that doesn't cause memory errors on your gpu - batch_size = 512 - # Compute pitch using first gpu - audio = torch.tensor(np.copy(x))[None].float() - f0, pd = torchcrepe.predict( - audio, - self.sr, - self.window, - f0_min, - f0_max, - model, - batch_size=batch_size, - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - elif f0_method == "rmvpe": - if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE - - print("loading rmvpe model") - self.model_rmvpe = RMVPE( - "rmvpe.pt", is_half=self.is_half, device=self.device - ) - - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - if "privateuseone" in str(self.device): # clean ortruntime memory - del self.model_rmvpe.model - del self.model_rmvpe - print("cleaning ortruntime memory") - - f0 *= pow(2, f0_up_key / 12) - # with open("test.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - tf0 = self.sr // self.window # 每秒f0点数 - if inp_f0 is not None: - delta_t = np.round( - (inp_f0[:, 0].max() - inp_f0[:, 0].min()) * tf0 + 1 - ).astype("int16") - replace_f0 = np.interp( - list(range(delta_t)), inp_f0[:, 0] * 100, inp_f0[:, 1] - ) - shape = f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)].shape[0] - f0[self.x_pad * tf0 : self.x_pad * tf0 + len(replace_f0)] = replace_f0[ - :shape - ] - # with open("test_opt.txt","w")as f:f.write("\n".join([str(i)for i in f0.tolist()])) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak # 1-0 - - def vc( - self, - model, - net_g, - sid, - audio0, - pitch, - pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - ): # ,file_index,file_big_npy - feats = torch.from_numpy(audio0) - if self.is_half: - feats = feats.half() - else: - feats = feats.float() - if feats.dim() == 2: # double channels - feats = feats.mean(-1) - assert feats.dim() == 1, feats.dim() - feats = feats.view(1, -1) - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - - inputs = { - "source": feats.to(self.device), - "padding_mask": padding_mask, - "output_layer": 9 if version == "v1" else 12, - } - t0 = ttime() - with torch.no_grad(): - logits = model.extract_features(**inputs) - feats = model.final_proj(logits[0]) if version == "v1" else logits[0] - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = feats.clone() - if ( - isinstance(index, type(None)) == False - and 
isinstance(big_npy, type(None)) == False - and index_rate != 0 - ): - npy = feats[0].cpu().numpy() - if self.is_half: - npy = npy.astype("float32") - - # _, I = index.search(npy, 1) - # npy = big_npy[I.squeeze()] - - score, ix = index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - - if self.is_half: - npy = npy.astype("float16") - feats = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * index_rate - + (1 - index_rate) * feats - ) - - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - if protect < 0.5 and pitch != None and pitchf != None: - feats0 = F.interpolate(feats0.permute(0, 2, 1), scale_factor=2).permute( - 0, 2, 1 - ) - t1 = ttime() - p_len = audio0.shape[0] // self.window - if feats.shape[1] < p_len: - p_len = feats.shape[1] - if pitch != None and pitchf != None: - pitch = pitch[:, :p_len] - pitchf = pitchf[:, :p_len] - - if protect < 0.5 and pitch != None and pitchf != None: - pitchff = pitchf.clone() - pitchff[pitchf > 0] = 1 - pitchff[pitchf < 1] = protect - pitchff = pitchff.unsqueeze(-1) - feats = feats * pitchff + feats0 * (1 - pitchff) - feats = feats.to(feats0.dtype) - p_len = torch.tensor([p_len], device=self.device).long() - with torch.no_grad(): - if pitch != None and pitchf != None: - audio1 = ( - (net_g.infer(feats, p_len, pitch, pitchf, sid)[0][0, 0]) - .data.cpu() - .float() - .numpy() - ) - else: - audio1 = ( - (net_g.infer(feats, p_len, sid)[0][0, 0]).data.cpu().float().numpy() - ) - del feats, p_len, padding_mask - if torch.cuda.is_available(): - torch.cuda.empty_cache() - t2 = ttime() - times[0] += t1 - t0 - times[2] += t2 - t1 - return audio1 - - def pipeline( - self, - model, - net_g, - sid, - audio, - input_audio_path, - times, - f0_up_key, - f0_method, - file_index, - # file_big_npy, - index_rate, - if_f0, - filter_radius, - tgt_sr, - resample_sr, - rms_mix_rate, - version, - protect, - f0_file=None, - ): - if ( - file_index != "" - # and file_big_npy != "" - # and os.path.exists(file_big_npy) == True - and os.path.exists(file_index) == True - and index_rate != 0 - ): - try: - index = faiss.read_index(file_index) - # big_npy = np.load(file_big_npy) - big_npy = index.reconstruct_n(0, index.ntotal) - except: - traceback.print_exc() - index = big_npy = None - else: - index = big_npy = None - audio = signal.filtfilt(bh, ah, audio) - audio_pad = np.pad(audio, (self.window // 2, self.window // 2), mode="reflect") - opt_ts = [] - if audio_pad.shape[0] > self.t_max: - audio_sum = np.zeros_like(audio) - for i in range(self.window): - audio_sum += audio_pad[i : i - self.window] - for t in range(self.t_center, audio.shape[0], self.t_center): - opt_ts.append( - t - - self.t_query - + np.where( - np.abs(audio_sum[t - self.t_query : t + self.t_query]) - == np.abs(audio_sum[t - self.t_query : t + self.t_query]).min() - )[0][0] - ) - s = 0 - audio_opt = [] - t = None - t1 = ttime() - audio_pad = np.pad(audio, (self.t_pad, self.t_pad), mode="reflect") - p_len = audio_pad.shape[0] // self.window - inp_f0 = None - if hasattr(f0_file, "name") == True: - try: - with open(f0_file.name, "r") as f: - lines = f.read().strip("\n").split("\n") - inp_f0 = [] - for line in lines: - inp_f0.append([float(i) for i in line.split(",")]) - inp_f0 = np.array(inp_f0, dtype="float32") - except: - traceback.print_exc() - sid = torch.tensor(sid, device=self.device).unsqueeze(0).long() - pitch, pitchf = None, None - if if_f0 == 1: - pitch, 
pitchf = self.get_f0( - input_audio_path, - audio_pad, - p_len, - f0_up_key, - f0_method, - filter_radius, - inp_f0, - ) - pitch = pitch[:p_len] - pitchf = pitchf[:p_len] - if self.device == "mps": - pitchf = pitchf.astype(np.float32) - pitch = torch.tensor(pitch, device=self.device).unsqueeze(0).long() - pitchf = torch.tensor(pitchf, device=self.device).unsqueeze(0).float() - t2 = ttime() - times[1] += t2 - t1 - for t in opt_ts: - t = t // self.window * self.window - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - pitch[:, s // self.window : (t + self.t_pad2) // self.window], - pitchf[:, s // self.window : (t + self.t_pad2) // self.window], - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[s : t + self.t_pad2 + self.window], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - s = t - if if_f0 == 1: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - pitch[:, t // self.window :] if t is not None else pitch, - pitchf[:, t // self.window :] if t is not None else pitchf, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - else: - audio_opt.append( - self.vc( - model, - net_g, - sid, - audio_pad[t:], - None, - None, - times, - index, - big_npy, - index_rate, - version, - protect, - )[self.t_pad_tgt : -self.t_pad_tgt] - ) - audio_opt = np.concatenate(audio_opt) - if rms_mix_rate != 1: - audio_opt = change_rms(audio, 16000, audio_opt, tgt_sr, rms_mix_rate) - if resample_sr >= 16000 and tgt_sr != resample_sr: - audio_opt = librosa.resample( - audio_opt, orig_sr=tgt_sr, target_sr=resample_sr - ) - audio_max = np.abs(audio_opt).max() / 0.99 - max_int16 = 32768 - if audio_max > 1: - max_int16 /= audio_max - audio_opt = (audio_opt * max_int16).astype(np.int16) - del pitch, pitchf, sid - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return audio_opt From e1e947cf2231b414383a0d10e332a24e90dc1a56 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:00:51 +0900 Subject: [PATCH 43/65] update readme --- README.md | 8 ++++---- docs/en/README.en.md | 10 +++++++--- docs/jp/README.ja.md | 10 +++++++--- docs/kr/README.ko.han.md | 10 +++++++--- docs/kr/README.ko.md | 10 +++++++--- docs/tr/README.tr.md | 8 ++++---- 6 files changed, 36 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 74680cb..795a8bb 100644 --- a/README.md +++ b/README.md @@ -89,15 +89,15 @@ RVC需要其他一些预模型来推理和训练。 以下是一份清单,包括了所有RVC所需的预模型和其他文件的名称: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights 想测试v2版本模型的话,需要额外下载 -./pretrained_v2 +./assets/pretrained_v2 如果你正在使用Windows,则你可能需要这个文件,若ffmpeg和ffprobe已安装则跳过; ubuntu/debian 用户可以通过apt install ffmpeg来安装这2个库, Mac 用户则可以通过brew install ffmpeg来安装 (需要预先安装brew) diff --git a/docs/en/README.en.md b/docs/en/README.en.md index 9085eef..806c15d 100644 --- a/docs/en/README.en.md +++ b/docs/en/README.en.md @@ -91,11 +91,15 @@ You need to download them from our [Huggingface space](https://huggingface.co/lj Here's a list of Pre-models and other files that RVC needs: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +Additional 
downloads are required if you want to test the v2 version of the model. + +./assets/pretrained_v2 If you want to test the v2 version model (the v2 version model has changed the input from the 256 dimensional feature of 9-layer Hubert+final_proj to the 768 dimensional feature of 12-layer Hubert, and has added 3 period discriminators), you will need to download additional features diff --git a/docs/jp/README.ja.md b/docs/jp/README.ja.md index 151959e..6200fda 100644 --- a/docs/jp/README.ja.md +++ b/docs/jp/README.ja.md @@ -72,11 +72,15 @@ modelsは[Hugging Face space](https://huggingface.co/lj1995/VoiceConversionWebUI 以下は、RVCに必要な基底モデルやその他のファイルの一覧です。 ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2のモデルを使用するには、追加でファイルをダウンロードする必要があります + +./assets/pretrained_v2 # ffmpegがすでにinstallされている場合は省略 ./ffmpeg diff --git a/docs/kr/README.ko.han.md b/docs/kr/README.ko.han.md index 54ecf5d..78c3c47 100644 --- a/docs/kr/README.ko.han.md +++ b/docs/kr/README.ko.han.md @@ -69,11 +69,15 @@ RVC 모델은 推論과訓練을 依하여 다른 預備모델이 必要합니 다음은 RVC에 必要한 預備모델 및 其他 파일 目錄입니다: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다. + +./assets/pretrained_v2 # Windows를 使用하는境遇 이 사전도 必要할 수 있습니다. FFmpeg가 設置되어 있으면 건너뛰어도 됩니다. ffmpeg.exe diff --git a/docs/kr/README.ko.md b/docs/kr/README.ko.md index 748474c..5ea73e0 100644 --- a/docs/kr/README.ko.md +++ b/docs/kr/README.ko.md @@ -77,11 +77,15 @@ RVC 모델은 추론과 훈련을 위하여 다른 사전 모델이 필요합니 다음은 RVC에 필요한 사전 모델 및 기타 파일 목록입니다: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights + +V2 버전 모델을 테스트하려면 추가 다운로드가 필요합니다. + +./assets/pretrained_v2 # Windows를 사용하는 경우 이 사전도 필요할 수 있습니다. FFmpeg가 설치되어 있으면 건너뛰어도 됩니다. ffmpeg.exe diff --git a/docs/tr/README.tr.md b/docs/tr/README.tr.md index 62cfa05..8c0c2b1 100644 --- a/docs/tr/README.tr.md +++ b/docs/tr/README.tr.md @@ -88,15 +88,15 @@ Onları [Huggingface alanımızdan](https://huggingface.co/lj1995/VoiceConversio İşte RVC'nin ihtiyaç duyduğu Diğer Ön-Modellerin ve diğer dosyaların listesi: ```bash -hubert_base.pt +./assets/hubert/hubert_base.pt -./pretrained +./assets/pretrained -./uvr5_weights +./assets/uvr5_weights V2 sürümü modelini test etmek istiyorsanız (v2 sürümü modeli girişi 256 boyutlu 9 katmanlı Hubert+final_proj'dan 768 boyutlu 12 katmanlı Hubert'ın özelliğine ve 3 dönem ayrımına değiştirilmiştir), ek özellikleri indirmeniz gerekecektir. 
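# A minimal sketch of populating the relocated ./assets layout by hand, mirroring the
# aria2c commands in the Dockerfile change that follows (URLs copied from there); which
# .pth files you actually need depends on the model version and sample rate you run.
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth
aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth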
-./pretrained_v2 +./assets/pretrained_v2 #Eğer Windows kullanıyorsanız, FFmpeg yüklü değilse bu dictionariyaya da ihtiyacınız olabilir, FFmpeg yüklüyse atlayın ffmpeg.exe From d70b3d6f4615eccd218affca119b3fa98825a092 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:06:37 +0900 Subject: [PATCH 44/65] docker path --- Dockerfile | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Dockerfile b/Dockerfile index 65ffbc1..fc1ee26 100644 --- a/Dockerfile +++ b/Dockerfile @@ -12,17 +12,17 @@ RUN apt update && apt install -y -qq ffmpeg aria2 RUN pip3 install -r requirements.txt -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d pretrained_v2/ -o D40k.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d pretrained_v2/ -o G40k.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d pretrained_v2/ -o f0D40k.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d pretrained_v2/ -o f0G40k.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/D40k.pth -d assets/pretrained_v2/ -o D40k.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/G40k.pth -d assets/pretrained_v2/ -o G40k.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0D40k.pth -d assets/pretrained_v2/ -o f0D40k.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/pretrained_v2/f0G40k.pth -d assets/pretrained_v2/ -o f0G40k.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP2-人声vocals+非人声instrumentals.pth -d assets/uvr5_weights/ -o HP2-人声vocals+非人声instrumentals.pth +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/uvr5_weights/HP5-主旋律人声vocals+其他instrumentals.pth -d assets/uvr5_weights/ -o HP5-主旋律人声vocals+其他instrumentals.pth -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -o hubert_base.pt +RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt -d assets/hubert -o hubert_base.pt -RUN aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -o rmvpe.pt +RUN aria2c --console-log-level=error -c -x 16 -s 
16 -k 1M https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/rmvpe.pt -d assets/hubert -o rmvpe.pt VOLUME [ "/app/weights", "/app/opt" ] From 3c7f1f1407c2acefd08247971ea645d25d8c8d73 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:07:41 +0900 Subject: [PATCH 45/65] replace files --- MDXNet.py | 285 ------------------ app.py => tools/app.py | 8 +- tools/infer_cli.py | 2 +- .../rvc_for_realtime.py | 27 +- 4 files changed, 24 insertions(+), 298 deletions(-) delete mode 100644 MDXNet.py rename app.py => tools/app.py (99%) rename rvc_for_realtime.py => tools/rvc_for_realtime.py (98%) diff --git a/MDXNet.py b/MDXNet.py deleted file mode 100644 index c519e25..0000000 --- a/MDXNet.py +++ /dev/null @@ -1,285 +0,0 @@ -import soundfile as sf -import torch, pdb, os, warnings, librosa -import numpy as np -from tqdm import tqdm -import torch - -dim_c = 4 - - -class Conv_TDF_net_trim: - def __init__( - self, device, model_name, target_name, L, dim_f, dim_t, n_fft, hop=1024 - ): - super(Conv_TDF_net_trim, self).__init__() - - self.dim_f = dim_f - self.dim_t = 2**dim_t - self.n_fft = n_fft - self.hop = hop - self.n_bins = self.n_fft // 2 + 1 - self.chunk_size = hop * (self.dim_t - 1) - self.window = torch.hann_window(window_length=self.n_fft, periodic=True).to( - device - ) - self.target_name = target_name - self.blender = "blender" in model_name - - out_c = dim_c * 4 if target_name == "*" else dim_c - self.freq_pad = torch.zeros( - [1, out_c, self.n_bins - self.dim_f, self.dim_t] - ).to(device) - - self.n = L // 2 - - def stft(self, x): - x = x.reshape([-1, self.chunk_size]) - x = torch.stft( - x, - n_fft=self.n_fft, - hop_length=self.hop, - window=self.window, - center=True, - return_complex=True, - ) - x = torch.view_as_real(x) - x = x.permute([0, 3, 1, 2]) - x = x.reshape([-1, 2, 2, self.n_bins, self.dim_t]).reshape( - [-1, dim_c, self.n_bins, self.dim_t] - ) - return x[:, :, : self.dim_f] - - def istft(self, x, freq_pad=None): - freq_pad = ( - self.freq_pad.repeat([x.shape[0], 1, 1, 1]) - if freq_pad is None - else freq_pad - ) - x = torch.cat([x, freq_pad], -2) - c = 4 * 2 if self.target_name == "*" else 2 - x = x.reshape([-1, c, 2, self.n_bins, self.dim_t]).reshape( - [-1, 2, self.n_bins, self.dim_t] - ) - x = x.permute([0, 2, 3, 1]) - x = x.contiguous() - x = torch.view_as_complex(x) - x = torch.istft( - x, n_fft=self.n_fft, hop_length=self.hop, window=self.window, center=True - ) - return x.reshape([-1, c, self.chunk_size]) - - -def get_models(device, dim_f, dim_t, n_fft): - return Conv_TDF_net_trim( - device=device, - model_name="Conv-TDF", - target_name="vocals", - L=11, - dim_f=dim_f, - dim_t=dim_t, - n_fft=n_fft, - ) - - -warnings.filterwarnings("ignore") -import sys - -now_dir = os.getcwd() -sys.path.append(now_dir) -from config import Config - -cpu = torch.device("cpu") -device = Config().device -# if torch.cuda.is_available(): -# device = torch.device("cuda:0") -# elif torch.backends.mps.is_available(): -# device = torch.device("mps") -# else: -# device = torch.device("cpu") - - -class Predictor: - def __init__(self, args): - self.args = args - self.model_ = get_models( - device=cpu, dim_f=args.dim_f, dim_t=args.dim_t, n_fft=args.n_fft - ) - import onnxruntime as ort - - print(ort.get_available_providers()) - self.model = ort.InferenceSession( - os.path.join(args.onnx, self.model_.target_name + ".onnx"), - providers=[ - "CUDAExecutionProvider", - "DmlExecutionProvider", - "CPUExecutionProvider", - ], - ) - print("onnx load 
done") - - def demix(self, mix): - samples = mix.shape[-1] - margin = self.args.margin - chunk_size = self.args.chunks * 44100 - assert not margin == 0, "margin cannot be zero!" - if margin > chunk_size: - margin = chunk_size - - segmented_mix = {} - - if self.args.chunks == 0 or samples < chunk_size: - chunk_size = samples - - counter = -1 - for skip in range(0, samples, chunk_size): - counter += 1 - - s_margin = 0 if counter == 0 else margin - end = min(skip + chunk_size + margin, samples) - - start = skip - s_margin - - segmented_mix[skip] = mix[:, start:end].copy() - if end == samples: - break - - sources = self.demix_base(segmented_mix, margin_size=margin) - """ - mix:(2,big_sample) - segmented_mix:offset->(2,small_sample) - sources:(1,2,big_sample) - """ - return sources - - def demix_base(self, mixes, margin_size): - chunked_sources = [] - progress_bar = tqdm(total=len(mixes)) - progress_bar.set_description("Processing") - for mix in mixes: - cmix = mixes[mix] - sources = [] - n_sample = cmix.shape[1] - model = self.model_ - trim = model.n_fft // 2 - gen_size = model.chunk_size - 2 * trim - pad = gen_size - n_sample % gen_size - mix_p = np.concatenate( - (np.zeros((2, trim)), cmix, np.zeros((2, pad)), np.zeros((2, trim))), 1 - ) - mix_waves = [] - i = 0 - while i < n_sample + pad: - waves = np.array(mix_p[:, i : i + model.chunk_size]) - mix_waves.append(waves) - i += gen_size - mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(cpu) - with torch.no_grad(): - _ort = self.model - spek = model.stft(mix_waves) - if self.args.denoise: - spec_pred = ( - -_ort.run(None, {"input": -spek.cpu().numpy()})[0] * 0.5 - + _ort.run(None, {"input": spek.cpu().numpy()})[0] * 0.5 - ) - tar_waves = model.istft(torch.tensor(spec_pred)) - else: - tar_waves = model.istft( - torch.tensor(_ort.run(None, {"input": spek.cpu().numpy()})[0]) - ) - tar_signal = ( - tar_waves[:, :, trim:-trim] - .transpose(0, 1) - .reshape(2, -1) - .numpy()[:, :-pad] - ) - - start = 0 if mix == 0 else margin_size - end = None if mix == list(mixes.keys())[::-1][0] else -margin_size - if margin_size == 0: - end = None - sources.append(tar_signal[:, start:end]) - - progress_bar.update(1) - - chunked_sources.append(sources) - _sources = np.concatenate(chunked_sources, axis=-1) - # del self.model - progress_bar.close() - return _sources - - def prediction(self, m, vocal_root, others_root, format): - os.makedirs(vocal_root, exist_ok=True) - os.makedirs(others_root, exist_ok=True) - basename = os.path.basename(m) - mix, rate = librosa.load(m, mono=False, sr=44100) - if mix.ndim == 1: - mix = np.asfortranarray([mix, mix]) - mix = mix.T - sources = self.demix(mix.T) - opt = sources[0].T - if format in ["wav", "flac"]: - sf.write( - "%s/%s_main_vocal.%s" % (vocal_root, basename, format), mix - opt, rate - ) - sf.write("%s/%s_others.%s" % (others_root, basename, format), opt, rate) - else: - path_vocal = "%s/%s_main_vocal.wav" % (vocal_root, basename) - path_other = "%s/%s_others.wav" % (others_root, basename) - sf.write(path_vocal, mix - opt, rate) - sf.write(path_other, opt, rate) - if os.path.exists(path_vocal): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_vocal, path_vocal[:-4] + ".%s" % format) - ) - if os.path.exists(path_other): - os.system( - "ffmpeg -i %s -vn %s -q:a 2 -y" - % (path_other, path_other[:-4] + ".%s" % format) - ) - - -class MDXNetDereverb: - def __init__(self, chunks): - self.onnx = "uvr5_weights/onnx_dereverb_By_FoxJoy" - self.shifts = 10 #'Predict with randomised equivariant stabilisation' - 
self.mixing = "min_mag" # ['default','min_mag','max_mag'] - self.chunks = chunks - self.margin = 44100 - self.dim_t = 9 - self.dim_f = 3072 - self.n_fft = 6144 - self.denoise = True - self.pred = Predictor(self) - - def _path_audio_(self, input, vocal_root, others_root, format): - self.pred.prediction(input, vocal_root, others_root, format) - - -if __name__ == "__main__": - dereverb = MDXNetDereverb(15) - from time import time as ttime - - t0 = ttime() - dereverb._path_audio_( - "雪雪伴奏对消HP5.wav", - "vocal", - "others", - ) - t1 = ttime() - print(t1 - t0) - - -""" - -runtime\python.exe MDXNet.py - -6G: -15/9:0.8G->6.8G -14:0.8G->6.5G -25:炸 - -half15:0.7G->6.6G,22.69s -fp32-15:0.7G->6.6G,20.85s - -""" diff --git a/app.py b/tools/app.py similarity index 99% rename from app.py rename to tools/app.py index e4a6415..76a9a83 100644 --- a/app.py +++ b/tools/app.py @@ -1,14 +1,12 @@ +import logging import os # os.system("wget -P cvec/ https://huggingface.co/lj1995/VoiceConversionWebUI/resolve/main/hubert_base.pt") import gradio as gr -import logging - -from configs.config import Config - -from i18n.i18n import I18nAuto from dotenv import load_dotenv +from configs.config import Config +from i18n.i18n import I18nAuto from infer.modules.vc.modules import VC logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/tools/infer_cli.py b/tools/infer_cli.py index c885a07..bbe0a53 100644 --- a/tools/infer_cli.py +++ b/tools/infer_cli.py @@ -4,11 +4,11 @@ import sys now_dir = os.getcwd() sys.path.append(now_dir) +from dotenv import load_dotenv from scipy.io import wavfile from configs.config import Config from infer.modules.vc.modules import VC -from dotenv import load_dotenv #### # USAGE diff --git a/rvc_for_realtime.py b/tools/rvc_for_realtime.py similarity index 98% rename from rvc_for_realtime.py rename to tools/rvc_for_realtime.py index 548472d..32316c3 100644 --- a/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -1,21 +1,34 @@ -import os, sys -import faiss, torch, traceback, parselmouth, numpy as np, torchcrepe, torch.nn as nn, pyworld +import os +import sys +import traceback +from time import time as ttime + import fairseq -from lib.infer_pack.models import ( +import faiss +import numpy as np +import parselmouth +import pyworld +import scipy.signal as signal +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchcrepe + +from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, SynthesizerTrnMs768NSFsid, SynthesizerTrnMs768NSFsid_nono, ) -from time import time as ttime -import torch.nn.functional as F -import scipy.signal as signal now_dir = os.getcwd() sys.path.append(now_dir) -from config import defaultconfig as config from multiprocessing import Manager as M +from configs.config import Config + +Config() + mm = M() if config.dml == True: From 58e32b6def8a19dbeb04afd2a036784964898548 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:08:31 +0900 Subject: [PATCH 46/65] format --- configs/config.py | 8 ++-- gui_v1.py | 32 +++++++++------ i18n/i18n.py | 2 +- i18n/locale/scan_i18n.py | 1 - infer/lib/audio.py | 2 +- infer/lib/infer_pack/attentions.py | 4 +- infer/lib/infer_pack/commons.py | 1 + infer/lib/infer_pack/models.py | 20 ++++----- infer/lib/infer_pack/models_onnx.py | 20 ++++----- infer/lib/infer_pack/modules.py | 9 ++-- .../modules/F0Predictor/DioF0Predictor.py | 5 ++- .../modules/F0Predictor/HarvestF0Predictor.py | 5 ++- 
.../modules/F0Predictor/PMF0Predictor.py | 5 ++- infer/lib/infer_pack/onnx_inference.py | 2 +- infer/lib/infer_pack/transforms.py | 4 +- infer/lib/rmvpe.py | 13 +++--- infer/lib/train/data_utils.py | 6 ++- infer/lib/train/mel_processing.py | 1 - infer/lib/train/process_ckpt.py | 9 ++-- infer/lib/train/utils.py | 12 +++--- infer/lib/uvr5_pack/lib_v5/layers.py | 2 +- .../lib/uvr5_pack/lib_v5/layers_123812KB .py | 2 +- infer/lib/uvr5_pack/lib_v5/layers_123821KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/layers_33966KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/layers_537227KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/layers_537238KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/layers_new.py | 2 +- infer/lib/uvr5_pack/lib_v5/nets.py | 8 ++-- infer/lib/uvr5_pack/lib_v5/nets_123812KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/nets_123821KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/nets_33966KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/nets_537227KB.py | 4 +- infer/lib/uvr5_pack/lib_v5/nets_537238KB.py | 4 +- infer/lib/uvr5_pack/lib_v5/nets_61968KB.py | 2 +- infer/lib/uvr5_pack/lib_v5/nets_new.py | 3 +- infer/lib/uvr5_pack/lib_v5/spec_utils.py | 13 ++++-- infer/lib/uvr5_pack/utils.py | 7 ++-- .../modules/train/extract/extract_f0_print.py | 12 ++++-- .../modules/train/extract/extract_f0_rmvpe.py | 12 ++++-- .../train/extract/extract_f0_rmvpe_dml.py | 12 ++++-- infer/modules/train/extract_feature_print.py | 10 +++-- infer/modules/train/preprocess.py | 16 +++++--- infer/modules/train/train.py | 41 +++++++++++-------- infer/modules/uvr5/mdxnet.py | 4 +- infer/modules/uvr5/modules.py | 4 +- infer/modules/uvr5/preprocess.py | 11 +++-- infer/modules/vc/modules.py | 4 +- infer/modules/vc/pipeline.py | 9 +++- tools/calc_rvc_model_similarity.py | 4 +- tools/export_onnx.py | 2 +- tools/infer/infer-pm-index256.py | 26 ++++++------ tools/infer/train-index-v2.py | 7 +++- tools/infer/train-index.py | 5 ++- tools/infer/trans_weights.py | 4 +- tools/onnx_inference_demo.py | 1 + 55 files changed, 237 insertions(+), 169 deletions(-) diff --git a/configs/config.py b/configs/config.py index 250b1ea..90b9bc0 100644 --- a/configs/config.py +++ b/configs/config.py @@ -1,9 +1,10 @@ -import os import argparse +import os import sys -import torch from multiprocessing import cpu_count +import torch + def use_fp32_config(): for config_file in [ @@ -198,6 +199,3 @@ class Config: except: pass return x_pad, x_query, x_center, x_max - - -defaultconfig = Config() diff --git a/gui_v1.py b/gui_v1.py index 9486508..07cf5be 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -1,4 +1,6 @@ -import os, sys, pdb +import os +import pdb +import sys os.environ["OMP_NUM_THREADS"] = "2" if sys.platform == "darwin": @@ -16,7 +18,8 @@ class Harvest(multiprocessing.Process): self.opt_q = opt_q def run(self): - import numpy as np, pyworld + import numpy as np + import pyworld while 1: idx, x, res_f0, n_cpu, ts = self.inp_q.get() @@ -33,21 +36,26 @@ class Harvest(multiprocessing.Process): if __name__ == "__main__": - from multiprocessing import Queue - from queue import Empty - import numpy as np - import multiprocessing - import traceback, re import json - import PySimpleGUI as sg - import sounddevice as sd + import multiprocessing + import re + import threading + import time + import traceback + from multiprocessing import Queue, cpu_count + from queue import Empty + + import librosa import noisereduce as nr - from multiprocessing import cpu_count - import librosa, torch, time, threading + import numpy as np + import PySimpleGUI as sg + import rvc_for_realtime + import sounddevice as sd + import 
torch import torch.nn.functional as F import torchaudio.transforms as tat + from i18n import I18nAuto - import rvc_for_realtime i18n = I18nAuto() device = rvc_for_realtime.config.device diff --git a/i18n/i18n.py b/i18n/i18n.py index 28b17c7..f4fb9b3 100644 --- a/i18n/i18n.py +++ b/i18n/i18n.py @@ -1,5 +1,5 @@ -import locale import json +import locale import os diff --git a/i18n/locale/scan_i18n.py b/i18n/locale/scan_i18n.py index ce875c9..b5fe055 100644 --- a/i18n/locale/scan_i18n.py +++ b/i18n/locale/scan_i18n.py @@ -1,7 +1,6 @@ import ast import glob import json - from collections import OrderedDict diff --git a/infer/lib/audio.py b/infer/lib/audio.py index 61db726..045055c 100644 --- a/infer/lib/audio.py +++ b/infer/lib/audio.py @@ -1,5 +1,5 @@ -import librosa import ffmpeg +import librosa import numpy as np diff --git a/infer/lib/infer_pack/attentions.py b/infer/lib/infer_pack/attentions.py index fc3538b..2b6060c 100644 --- a/infer/lib/infer_pack/attentions.py +++ b/infer/lib/infer_pack/attentions.py @@ -1,12 +1,12 @@ import copy import math + import numpy as np import torch from torch import nn from torch.nn import functional as F -from infer.lib.infer_pack import commons -from infer.lib.infer_pack import modules +from infer.lib.infer_pack import commons, modules from infer.lib.infer_pack.modules import LayerNorm diff --git a/infer/lib/infer_pack/commons.py b/infer/lib/infer_pack/commons.py index 4937729..7ba7d21 100644 --- a/infer/lib/infer_pack/commons.py +++ b/infer/lib/infer_pack/commons.py @@ -1,4 +1,5 @@ import math + import numpy as np import torch from torch import nn diff --git a/infer/lib/infer_pack/models.py b/infer/lib/infer_pack/models.py index 8c598cf..9878048 100644 --- a/infer/lib/infer_pack/models.py +++ b/infer/lib/infer_pack/models.py @@ -1,17 +1,17 @@ -import math, pdb, os +import math +import os +import pdb from time import time as ttime + +import numpy as np import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from infer.lib.infer_pack import modules -from infer.lib.infer_pack import attentions -from infer.lib.infer_pack import commons -from infer.lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm -from infer.lib.infer_pack.commons import init_weights -import numpy as np -from infer.lib.infer_pack import commons +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights class TextEncoder256(nn.Module): diff --git a/infer/lib/infer_pack/models_onnx.py b/infer/lib/infer_pack/models_onnx.py index f4b2a15..4642a90 100644 --- a/infer/lib/infer_pack/models_onnx.py +++ b/infer/lib/infer_pack/models_onnx.py @@ -1,17 +1,17 @@ -import math, pdb, os +import math +import os +import pdb from time import time as ttime + +import numpy as np import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F -from infer.lib.infer_pack import modules -from infer.lib.infer_pack import attentions -from infer.lib.infer_pack import commons -from infer.lib.infer_pack.commons import init_weights, get_padding -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm 
-from infer.lib.infer_pack.commons import init_weights -import numpy as np -from infer.lib.infer_pack import commons +from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm + +from infer.lib.infer_pack import attentions, commons, modules +from infer.lib.infer_pack.commons import get_padding, init_weights class TextEncoder256(nn.Module): diff --git a/infer/lib/infer_pack/modules.py b/infer/lib/infer_pack/modules.py index 386f7a2..edf2207 100644 --- a/infer/lib/infer_pack/modules.py +++ b/infer/lib/infer_pack/modules.py @@ -1,19 +1,18 @@ import copy import math + import numpy as np import scipy import torch from torch import nn +from torch.nn import AvgPool1d, Conv1d, Conv2d, ConvTranspose1d from torch.nn import functional as F - -from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d -from torch.nn.utils import weight_norm, remove_weight_norm +from torch.nn.utils import remove_weight_norm, weight_norm from infer.lib.infer_pack import commons -from infer.lib.infer_pack.commons import init_weights, get_padding +from infer.lib.infer_pack.commons import get_padding, init_weights from infer.lib.infer_pack.transforms import piecewise_rational_quadratic_transform - LRELU_SLOPE = 0.1 diff --git a/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py index e82a7fe..e69a603 100644 --- a/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/DioF0Predictor.py @@ -1,6 +1,7 @@ -from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class DioF0Predictor(F0Predictor): diff --git a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py index eb96c52..27f3356 100644 --- a/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/HarvestF0Predictor.py @@ -1,6 +1,7 @@ -from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import pyworld import numpy as np +import pyworld + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class HarvestF0Predictor(F0Predictor): diff --git a/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py index 384ff4c..957ec46 100644 --- a/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py +++ b/infer/lib/infer_pack/modules/F0Predictor/PMF0Predictor.py @@ -1,6 +1,7 @@ -from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor -import parselmouth import numpy as np +import parselmouth + +from infer.lib.infer_pack.modules.F0Predictor.F0Predictor import F0Predictor class PMF0Predictor(F0Predictor): diff --git a/infer/lib/infer_pack/onnx_inference.py b/infer/lib/infer_pack/onnx_inference.py index b4aba75..3901d76 100644 --- a/infer/lib/infer_pack/onnx_inference.py +++ b/infer/lib/infer_pack/onnx_inference.py @@ -1,6 +1,6 @@ -import onnxruntime import librosa import numpy as np +import onnxruntime import soundfile diff --git a/infer/lib/infer_pack/transforms.py b/infer/lib/infer_pack/transforms.py index 7d93c48..6d07b3b 100644 --- a/infer/lib/infer_pack/transforms.py +++ b/infer/lib/infer_pack/transforms.py @@ -1,9 +1,7 @@ +import numpy as np import torch from torch.nn import functional as F -import numpy as np - - DEFAULT_MIN_BIN_WIDTH = 
1e-3 DEFAULT_MIN_BIN_HEIGHT = 1e-3 DEFAULT_MIN_DERIVATIVE = 1e-3 diff --git a/infer/lib/rmvpe.py b/infer/lib/rmvpe.py index e5fa613..0c288b2 100644 --- a/infer/lib/rmvpe.py +++ b/infer/lib/rmvpe.py @@ -1,11 +1,11 @@ -import torch, numpy as np, pdb +import pdb + +import numpy as np +import torch import torch.nn as nn import torch.nn.functional as F -import torch, pdb -import numpy as np -import torch.nn.functional as F +from librosa.util import normalize, pad_center, tiny from scipy.signal import get_window -from librosa.util import pad_center, tiny, normalize ###stft codes from https://github.com/pseeth/torch-stft/blob/master/torch_stft/util.py @@ -670,7 +670,8 @@ class RMVPE: if __name__ == "__main__": - import soundfile as sf, librosa + import librosa + import soundfile as sf audio, sampling_rate = sf.read(r"C:\Users\liujing04\Desktop\Z\冬之花clip1.wav") if len(audio.shape) > 1: diff --git a/infer/lib/train/data_utils.py b/infer/lib/train/data_utils.py index 7793f15..db4e78f 100644 --- a/infer/lib/train/data_utils.py +++ b/infer/lib/train/data_utils.py @@ -1,10 +1,12 @@ -import os, traceback +import os +import traceback + import numpy as np import torch import torch.utils.data from infer.lib.train.mel_processing import spectrogram_torch -from infer.lib.train.utils import load_wav_to_torch, load_filepaths_and_text +from infer.lib.train.utils import load_filepaths_and_text, load_wav_to_torch class TextAudioLoaderMultiNSFsid(torch.utils.data.Dataset): diff --git a/infer/lib/train/mel_processing.py b/infer/lib/train/mel_processing.py index 3cc3687..85342c4 100644 --- a/infer/lib/train/mel_processing.py +++ b/infer/lib/train/mel_processing.py @@ -2,7 +2,6 @@ import torch import torch.utils.data from librosa.filters import mel as librosa_mel_fn - MAX_WAV_VALUE = 32768.0 diff --git a/infer/lib/train/process_ckpt.py b/infer/lib/train/process_ckpt.py index f2d73af..887dc71 100644 --- a/infer/lib/train/process_ckpt.py +++ b/infer/lib/train/process_ckpt.py @@ -1,7 +1,10 @@ -import torch, traceback, os, sys - - +import os +import sys +import traceback from collections import OrderedDict + +import torch + from i18n.i18n import I18nAuto i18n = I18nAuto() diff --git a/infer/lib/train/utils.py b/infer/lib/train/utils.py index 337422b..314eee7 100644 --- a/infer/lib/train/utils.py +++ b/infer/lib/train/utils.py @@ -1,13 +1,15 @@ -import os, traceback -import glob -import sys import argparse -import logging +import glob import json +import logging +import os import subprocess +import sys +import traceback + import numpy as np -from scipy.io.wavfile import read import torch +from scipy.io.wavfile import read MATPLOTLIB_FLAG = False diff --git a/infer/lib/uvr5_pack/lib_v5/layers.py b/infer/lib/uvr5_pack/lib_v5/layers.py index b82f06b..4fc1b5c 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers.py +++ b/infer/lib/uvr5_pack/lib_v5/layers.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py index b82f06b..4fc1b5c 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +++ b/infer/lib/uvr5_pack/lib_v5/layers_123812KB .py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . 
import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py index b82f06b..4fc1b5c 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_123821KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py index a38b7bb..9b127bc 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_33966KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py index a38b7bb..9b127bc 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_537227KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py index a38b7bb..9b127bc 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_537238KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/layers_new.py b/infer/lib/uvr5_pack/lib_v5/layers_new.py index 0c13e60..44153b6 100644 --- a/infer/lib/uvr5_pack/lib_v5/layers_new.py +++ b/infer/lib/uvr5_pack/lib_v5/layers_new.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/nets.py b/infer/lib/uvr5_pack/lib_v5/nets.py index db4c5e3..5da3948 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets.py +++ b/infer/lib/uvr5_pack/lib_v5/nets.py @@ -1,8 +1,8 @@ -import torch -from torch import nn -import torch.nn.functional as F - import layers +import torch +import torch.nn.functional as F +from torch import nn + from . import spec_utils diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py index becbfae..167d4cb 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_123812KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py index becbfae..167d4cb 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_123821KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py index b8986f9..73a5b83 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_33966KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . 
import layers_33966KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py index a1bb530..823b44f 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_537227KB.py @@ -1,7 +1,7 @@ -import torch import numpy as np -from torch import nn +import torch import torch.nn.functional as F +from torch import nn from . import layers_537238KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py index a1bb530..823b44f 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_537238KB.py @@ -1,7 +1,7 @@ -import torch import numpy as np -from torch import nn +import torch import torch.nn.functional as F +from torch import nn from . import layers_537238KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py index becbfae..167d4cb 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_61968KB.py @@ -1,6 +1,6 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn from . import layers_123821KB as layers diff --git a/infer/lib/uvr5_pack/lib_v5/nets_new.py b/infer/lib/uvr5_pack/lib_v5/nets_new.py index bfaf72e..1c0f4fa 100644 --- a/infer/lib/uvr5_pack/lib_v5/nets_new.py +++ b/infer/lib/uvr5_pack/lib_v5/nets_new.py @@ -1,6 +1,7 @@ import torch -from torch import nn import torch.nn.functional as F +from torch import nn + from . import layers_new diff --git a/infer/lib/uvr5_pack/lib_v5/spec_utils.py b/infer/lib/uvr5_pack/lib_v5/spec_utils.py index a3fd46d..a9634fd 100644 --- a/infer/lib/uvr5_pack/lib_v5/spec_utils.py +++ b/infer/lib/uvr5_pack/lib_v5/spec_utils.py @@ -1,8 +1,12 @@ -import os, librosa +import hashlib +import json +import math +import os + +import librosa import numpy as np import soundfile as sf from tqdm import tqdm -import json, math, hashlib def crop_center(h1, h2): @@ -519,10 +523,11 @@ def istft(spec, hl): if __name__ == "__main__": - import cv2 + import argparse import sys import time - import argparse + + import cv2 from model_param_init import ModelParameters p = argparse.ArgumentParser() diff --git a/infer/lib/uvr5_pack/utils.py b/infer/lib/uvr5_pack/utils.py index a04c001..f4805cd 100644 --- a/infer/lib/uvr5_pack/utils.py +++ b/infer/lib/uvr5_pack/utils.py @@ -1,8 +1,9 @@ -import torch -import numpy as np -from tqdm import tqdm import json +import numpy as np +import torch +from tqdm import tqdm + def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict: with open(file_name, "r") as f: diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py index d95548e..d97c766 100644 --- a/infer/modules/train/extract/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -1,10 +1,16 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) -from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging +from lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) from multiprocessing import Process diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py index 33517e0..0bb2a3e 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe.py +++ 
b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -1,10 +1,16 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) -from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging +from lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py index 744c69f..1616e72 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -1,10 +1,16 @@ -import os, traceback, sys, parselmouth +import os +import sys +import traceback + +import parselmouth now_dir = os.getcwd() sys.path.append(now_dir) -from lib.audio import load_audio +import logging + +import numpy as np import pyworld -import numpy as np, logging +from lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/infer/modules/train/extract_feature_print.py b/infer/modules/train/extract_feature_print.py index 32e0492..f8bfc2a 100644 --- a/infer/modules/train/extract_feature_print.py +++ b/infer/modules/train/extract_feature_print.py @@ -1,4 +1,6 @@ -import os, sys, traceback +import os +import sys +import traceback os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0" @@ -14,11 +16,11 @@ else: exp_dir = sys.argv[5] os.environ["CUDA_VISIBLE_DEVICES"] = str(i_gpu) version = sys.argv[6] +import fairseq +import numpy as np +import soundfile as sf import torch import torch.nn.functional as F -import soundfile as sf -import numpy as np -import fairseq if "privateuseone" not in device: device = "cpu" diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py index 62671ba..a7cab57 100644 --- a/infer/modules/train/preprocess.py +++ b/infer/modules/train/preprocess.py @@ -1,4 +1,7 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys + from scipy import signal now_dir = os.getcwd() @@ -9,12 +12,15 @@ sr = int(sys.argv[2]) n_p = int(sys.argv[3]) exp_dir = sys.argv[4] noparallel = sys.argv[5] == "True" -import numpy as np, os, traceback -from lib.slicer2 import Slicer -import librosa, traceback -from scipy.io import wavfile import multiprocessing +import os +import traceback + +import librosa +import numpy as np from lib.audio import load_audio +from lib.slicer2 import Slicer +from scipy.io import wavfile mutex = multiprocessing.Lock() f = open("%s/preprocess.log" % exp_dir, "a+") diff --git a/infer/modules/train/train.py b/infer/modules/train/train.py index 3dca6c7..ac52cf6 100644 --- a/infer/modules/train/train.py +++ b/infer/modules/train/train.py @@ -1,43 +1,47 @@ -import os, sys +import os +import sys now_dir = os.getcwd() sys.path.append(os.path.join(now_dir)) -from infer.lib.train import utils import datetime +from infer.lib.train import utils + hps = utils.get_hparams() os.environ["CUDA_VISIBLE_DEVICES"] = hps.gpus.replace("-", ",") n_gpus = len(hps.gpus.split("-")) -from random import shuffle, randint +from random import randint, shuffle import torch torch.backends.cudnn.deterministic = False torch.backends.cudnn.benchmark = False -from torch.nn import functional as F -from torch.utils.data import DataLoader -from torch.utils.tensorboard import SummaryWriter -import torch.multiprocessing as mp -import torch.distributed as dist -from torch.nn.parallel 
import DistributedDataParallel as DDP -from torch.cuda.amp import autocast, GradScaler -from infer.lib.infer_pack import commons from time import sleep from time import time as ttime + +import torch.distributed as dist +import torch.multiprocessing as mp +from torch.cuda.amp import GradScaler, autocast +from torch.nn import functional as F +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.utils.data import DataLoader +from torch.utils.tensorboard import SummaryWriter + +from infer.lib.infer_pack import commons from infer.lib.train.data_utils import ( - TextAudioLoaderMultiNSFsid, - TextAudioLoader, - TextAudioCollateMultiNSFsid, - TextAudioCollate, DistributedBucketSampler, + TextAudioCollate, + TextAudioCollateMultiNSFsid, + TextAudioLoader, + TextAudioLoaderMultiNSFsid, ) if hps.version == "v1": + from infer.lib.infer_pack.models import MultiPeriodDiscriminator + from infer.lib.infer_pack.models import SynthesizerTrnMs256NSFsid as RVC_Model_f0 from infer.lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid as RVC_Model_f0, SynthesizerTrnMs256NSFsid_nono as RVC_Model_nof0, - MultiPeriodDiscriminator, ) else: from infer.lib.infer_pack.models import ( @@ -45,10 +49,11 @@ else: SynthesizerTrnMs768NSFsid_nono as RVC_Model_nof0, MultiPeriodDiscriminatorV2 as MultiPeriodDiscriminator, ) + from infer.lib.train.losses import ( - generator_loss, discriminator_loss, feature_loss, + generator_loss, kl_loss, ) from infer.lib.train.mel_processing import mel_spectrogram_torch, spec_to_mel_torch diff --git a/infer/modules/uvr5/mdxnet.py b/infer/modules/uvr5/mdxnet.py index bd84f82..4a70469 100644 --- a/infer/modules/uvr5/mdxnet.py +++ b/infer/modules/uvr5/mdxnet.py @@ -1,12 +1,12 @@ import os import warnings -import soundfile as sf import librosa import numpy as np import onnxruntime as ort -from tqdm import tqdm +import soundfile as sf import torch +from tqdm import tqdm cpu = torch.device("cpu") diff --git a/infer/modules/uvr5/modules.py b/infer/modules/uvr5/modules.py index 4f5269a..16ad0a9 100644 --- a/infer/modules/uvr5/modules.py +++ b/infer/modules/uvr5/modules.py @@ -1,12 +1,12 @@ import os import traceback -import torch import ffmpeg +import torch from configs.config import Config -from infer.modules.uvr5.preprocess import AudioPre, AudioPreDeEcho from infer.modules.uvr5.mdxnet import MDXNetDereverb +from infer.modules.uvr5.preprocess import AudioPre, AudioPreDeEcho config = Config() diff --git a/infer/modules/uvr5/preprocess.py b/infer/modules/uvr5/preprocess.py index dae2739..26aeada 100644 --- a/infer/modules/uvr5/preprocess.py +++ b/infer/modules/uvr5/preprocess.py @@ -1,16 +1,15 @@ import os -import torch import librosa import numpy as np import soundfile as sf +import torch -from infer.lib.uvr5_pack.lib_v5 import spec_utils -from infer.lib.uvr5_pack.utils import inference -from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters - -from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet from infer.lib.uvr5_pack.lib_v5 import nets_61968KB as Nets +from infer.lib.uvr5_pack.lib_v5 import spec_utils +from infer.lib.uvr5_pack.lib_v5.model_param_init import ModelParameters +from infer.lib.uvr5_pack.lib_v5.nets_new import CascadedNet +from infer.lib.uvr5_pack.utils import inference class AudioPre: diff --git a/infer/modules/vc/modules.py b/infer/modules/vc/modules.py index f3626a9..ac37f44 100644 --- a/infer/modules/vc/modules.py +++ b/infer/modules/vc/modules.py @@ -1,9 +1,10 @@ import traceback import numpy as np -import torch import soundfile 
as sf +import torch +from infer.lib.audio import load_audio from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid, SynthesizerTrnMs256NSFsid_nono, @@ -12,7 +13,6 @@ from infer.lib.infer_pack.models import ( ) from infer.modules.vc.pipeline import Pipeline from infer.modules.vc.utils import * -from infer.lib.audio import load_audio class VC: diff --git a/infer/modules/vc/pipeline.py b/infer/modules/vc/pipeline.py index eed97e0..31e5399 100644 --- a/infer/modules/vc/pipeline.py +++ b/infer/modules/vc/pipeline.py @@ -1,13 +1,18 @@ +import os import sys +import traceback +from functools import lru_cache from time import time as ttime +import faiss +import librosa import numpy as np import parselmouth +import pyworld import torch import torch.nn.functional as F -import pyworld, os, traceback, faiss, librosa, torchcrepe +import torchcrepe from scipy import signal -from functools import lru_cache now_dir = os.getcwd() sys.path.append(now_dir) diff --git a/tools/calc_rvc_model_similarity.py b/tools/calc_rvc_model_similarity.py index edc1cf8..3f74ca5 100644 --- a/tools/calc_rvc_model_similarity.py +++ b/tools/calc_rvc_model_similarity.py @@ -1,6 +1,8 @@ # This code references https://huggingface.co/JosephusCheung/ASimilarityCalculatior/blob/main/qwerty.py # Fill in the path of the model to be queried and the root directory of the reference models, and this script will return the similarity between the model to be queried and all reference models. -import sys, os +import os +import sys + import torch import torch.nn as nn import torch.nn.functional as F diff --git a/tools/export_onnx.py b/tools/export_onnx.py index 2d334a6..a38f943 100644 --- a/tools/export_onnx.py +++ b/tools/export_onnx.py @@ -1,5 +1,5 @@ -from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM import torch +from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM if __name__ == "__main__": MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用 diff --git a/tools/infer/infer-pm-index256.py b/tools/infer/infer-pm-index256.py index 2ab44e1..92be320 100644 --- a/tools/infer/infer-pm-index256.py +++ b/tools/infer/infer-pm-index256.py @@ -2,34 +2,36 @@ 对源特征进行检索 """ -import torch, pdb, os, parselmouth +import os +import pdb + +import parselmouth +import torch os.environ["CUDA_VISIBLE_DEVICES"] = "0" +# import torchcrepe +from time import time as ttime + +# import pyworld +import librosa import numpy as np +import scipy.signal as signal import soundfile as sf +import torch.nn.functional as F +from fairseq import checkpoint_utils # from models import SynthesizerTrn256#hifigan_nonsf # from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf from lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as SynthesizerTrn256, ) # hifigan_nsf +from scipy.io import wavfile # from lib.infer_pack.models import SynthesizerTrnMs256NSFsid_sim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsim as SynthesizerTrn256#hifigan_nsf # from models import SynthesizerTrn256NSFsimFlow as SynthesizerTrn256#hifigan_nsf -from scipy.io import wavfile -from fairseq import checkpoint_utils - -# import pyworld -import librosa -import torch.nn.functional as F -import scipy.signal as signal - -# import torchcrepe -from time import time as ttime - device = torch.device("cuda" if torch.cuda.is_available() else "cpu") model_path = r"E:\codes\py39\vits_vc_gpu_train\hubert_base.pt" # print("load model(s) from {}".format(model_path)) diff --git a/tools/infer/train-index-v2.py 
b/tools/infer/train-index-v2.py index 77dfa0b..e72ffe7 100644 --- a/tools/infer/train-index-v2.py +++ b/tools/infer/train-index-v2.py @@ -1,11 +1,14 @@ """ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 """ -import faiss, numpy as np, os -from sklearn.cluster import MiniBatchKMeans +import os import traceback from multiprocessing import cpu_count +import faiss +import numpy as np +from sklearn.cluster import MiniBatchKMeans + # ###########如果是原始特征要先写save n_cpu = 0 if n_cpu == 0: diff --git a/tools/infer/train-index.py b/tools/infer/train-index.py index c49f24b..2446e4c 100644 --- a/tools/infer/train-index.py +++ b/tools/infer/train-index.py @@ -1,7 +1,10 @@ """ 格式:直接cid为自带的index位;aid放不下了,通过字典来查,反正就5w个 """ -import faiss, numpy as np, os +import os + +import faiss +import numpy as np # ###########如果是原始特征要先写save inp_root = r"E:\codes\py39\dataset\mi\2-co256" diff --git a/tools/infer/trans_weights.py b/tools/infer/trans_weights.py index e0f7f0c..a8ff3b0 100644 --- a/tools/infer/trans_weights.py +++ b/tools/infer/trans_weights.py @@ -1,4 +1,6 @@ -import torch, pdb +import pdb + +import torch # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-suc\G_1000.pth")["model"]#sim_nsf# # a=torch.load(r"E:\codes\py39\vits_vc_gpu_train\logs\ft-mi-freeze-vocoder-flow-enc_q\G_1000.pth")["model"]#sim_nsf# diff --git a/tools/onnx_inference_demo.py b/tools/onnx_inference_demo.py index a4a9490..03bb868 100644 --- a/tools/onnx_inference_demo.py +++ b/tools/onnx_inference_demo.py @@ -1,4 +1,5 @@ import soundfile + from ..lib.infer_pack.onnx_inference import OnnxRVC hop_size = 512 From 18d72f0c06a1fde560af3a072ddb2e0e8b10ad55 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:16:31 +0900 Subject: [PATCH 47/65] fix path --- docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index 4b40ec5..a5db88d 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,7 +6,7 @@ services: dockerfile: Dockerfile container_name: rvc volumes: - - ./weights:/app/weights + - ./weights:/app/assets/weights - ./opt:/app/opt # - ./dataset:/app/dataset # you can use this folder in order to provide your dataset for model training ports: From 29ae9d9d287331c1b741cb590956e11026936808 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:17:46 +0900 Subject: [PATCH 48/65] fix lib-path --- infer/modules/train/preprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py index a7cab57..7ff76ee 100644 --- a/infer/modules/train/preprocess.py +++ b/infer/modules/train/preprocess.py @@ -18,8 +18,8 @@ import traceback import librosa import numpy as np -from lib.audio import load_audio -from lib.slicer2 import Slicer +from infer.lib.audio import load_audio +from infer.lib.slicer2 import Slicer from scipy.io import wavfile mutex = multiprocessing.Lock() From 87a3a4ea38551cb09fd51f98f3ffff8e9b70b2c7 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:27:51 +0900 Subject: [PATCH 49/65] fix lib path --- infer/modules/train/extract/extract_f0_print.py | 4 ++-- infer/modules/train/extract/extract_f0_rmvpe.py | 4 ++-- infer/modules/train/extract/extract_f0_rmvpe_dml.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py 
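The lib-path fixes in these patches all converge on the same pattern: callers stop importing from the old top-level lib package and use package-absolute infer.lib imports instead, relying on the repository root being appended to sys.path at script start. A minimal sketch of that pattern follows; the input path and sample rate are placeholders, not values taken from any patch.

    import os
    import sys

    now_dir = os.getcwd()        # scripts are assumed to be launched from the repo root
    sys.path.append(now_dir)     # makes the top-level "infer" package importable

    # was: from lib.audio import load_audio
    from infer.lib.audio import load_audio

    # illustrative call; load_audio returns a float32 waveform at the requested rate
    audio = load_audio("dataset/example.wav", 16000)

The same substitution covers the lazy f0 imports further down (from infer.lib.rmvpe import RMVPE), so every entry point resolves modules the same way regardless of its own location in the tree.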
index d97c766..a3e9c9b 100644 --- a/infer/modules/train/extract/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -10,7 +10,7 @@ import logging import numpy as np import pyworld -from lib.audio import load_audio +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) from multiprocessing import Process @@ -82,7 +82,7 @@ class FeatureInput(object): f0 = pyworld.stonemask(x.astype(np.double), f0, t, self.fs) elif f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") self.model_rmvpe = RMVPE( diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py index 0bb2a3e..98a91e7 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -10,7 +10,7 @@ import logging import numpy as np import pyworld -from lib.audio import load_audio +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) @@ -45,7 +45,7 @@ class FeatureInput(object): # p_len = x.shape[0] // self.hop if f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") self.model_rmvpe = RMVPE( diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py index 1616e72..2d5c7f4 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -10,7 +10,7 @@ import logging import numpy as np import pyworld -from lib.audio import load_audio +from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) @@ -43,7 +43,7 @@ class FeatureInput(object): # p_len = x.shape[0] // self.hop if f0_method == "rmvpe": if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") self.model_rmvpe = RMVPE( From a1bbcd6fd546aa88a7d32a8682b47569b235c9a6 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 16:33:44 +0900 Subject: [PATCH 50/65] fix workflow --- .github/workflows/unitest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unitest.yml b/.github/workflows/unitest.yml index af677d6..eca7f70 100644 --- a/.github/workflows/unitest.yml +++ b/.github/workflows/unitest.yml @@ -33,4 +33,4 @@ jobs: python infer/modules/train/preprocess.py logs/mute/0_gt_wavs 48000 8 logs/mi-test True touch logs/mi-test/extract_f0_feature.log python infer/modules/train/extract/extract_f0_print.py logs/mi-test $(nproc) pm - python extract_feature_print.py cpu 1 0 0 logs/mi-test v1 + python infer/modules/train/extract_feature_print.py cpu 1 0 0 logs/mi-test v1 From 740625fd2b385b5633efeff7e64874f29c41902f Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 21:58:36 +0900 Subject: [PATCH 51/65] fix i18n --- i18n/i18n.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/i18n/i18n.py b/i18n/i18n.py index f4fb9b3..a64ee23 100644 --- a/i18n/i18n.py +++ b/i18n/i18n.py @@ -15,7 +15,7 @@ class I18nAuto: language = locale.getdefaultlocale()[ 0 ] # getlocale can't identify the system's language ((None, None)) - if not os.path.exists(f"./lib/i18n/{language}.json"): + if not 
os.path.exists(f"./i18n/locale/{language}.json"): language = "en_US" self.language = language # print("Use Language:", language) From 6e14c7b5f56bd01e207aaa912c94fb468647fa8a Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:56:39 +0900 Subject: [PATCH 52/65] rewrite oneclick_train --- i18n/locale/en_US.json | 1 + i18n/locale/es_ES.json | 1 + i18n/locale/it_IT.json | 1 + i18n/locale/ja_JP.json | 1 + i18n/locale/ru_RU.json | 1 + i18n/locale/tr_TR.json | 1 + i18n/locale/zh_CN.json | 1 + i18n/locale/zh_HK.json | 1 + i18n/locale/zh_SG.json | 1 + i18n/locale/zh_TW.json | 1 + infer-web.py | 281 ++--------------------------------------- 11 files changed, 21 insertions(+), 270 deletions(-) diff --git a/i18n/locale/en_US.json b/i18n/locale/en_US.json index c734dea..ef5fbd9 100644 --- a/i18n/locale/en_US.json +++ b/i18n/locale/en_US.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Enter the GPU index(es) separated by '-', e.g., 0-0-1 to use 2 processes in GPU0 and 1 process in GPU1", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Step 1: Fill in the experimental configuration. Experimental data is stored in the 'logs' folder, with each experiment having a separate folder. Manually enter the experiment name path, which contains the experimental configuration, logs, and trained model files.", "step1:正在处理数据": "Step 1: Processing data", + "step2:正在提取音高&正在提取特征": "step2:Pitch extraction & feature extraction", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Step 2a: Automatically traverse all files in the training folder that can be decoded into audio and perform slice normalization. Generates 2 wav folders in the experiment directory. Currently, only single-singer/speaker training is supported.", "step2a:无需提取音高": "Step 2a: Skipping pitch extraction", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Step 2b: Use CPU to extract pitch (if the model has pitch), use GPU to extract features (select GPU index):", diff --git a/i18n/locale/es_ES.json b/i18n/locale/es_ES.json index 6083ef9..ebcb860 100644 --- a/i18n/locale/es_ES.json +++ b/i18n/locale/es_ES.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "paso 1: Complete la configuración del experimento. Los datos del experimento se almacenan en el directorio 'logs', con cada experimento en una carpeta separada. La ruta del nombre del experimento debe ingresarse manualmente y debe contener la configuración del experimento, los registros y los archivos del modelo entrenado.", "step1:正在处理数据": "Paso 1: Procesando datos", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "paso 2a: recorra automáticamente la carpeta de capacitación y corte y normalice todos los archivos de audio que se pueden decodificar en audio. Se generarán dos carpetas 'wav' en el directorio del experimento. 
Actualmente, solo se admite la capacitación de una sola persona.", "step2a:无需提取音高": "Paso 2a: No es necesario extraer el tono", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "paso 2b: use la CPU para extraer el tono (si el modelo tiene guía de tono) y la GPU para extraer características (seleccione el número de tarjeta).", diff --git a/i18n/locale/it_IT.json b/i18n/locale/it_IT.json index e94f6b3..26736c9 100644 --- a/i18n/locale/it_IT.json +++ b/i18n/locale/it_IT.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Passaggio 1: compilare la configurazione sperimentale. ", "step1:正在处理数据": "Passaggio 1: elaborazione dei dati", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Passaggio 2a: attraversa automaticamente tutti i file nella cartella di addestramento che possono essere decodificati in audio ed esegui la normalizzazione delle sezioni. ", "step2a:无需提取音高": "Step 2a: Saltare l'estrazione del tono", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Passaggio 2b: utilizzare la CPU per estrarre il tono (se il modello ha il tono), utilizzare la GPU per estrarre le caratteristiche (selezionare l'indice GPU):", diff --git a/i18n/locale/ja_JP.json b/i18n/locale/ja_JP.json index 92bd344..12647ec 100644 --- a/i18n/locale/ja_JP.json +++ b/i18n/locale/ja_JP.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpeカード番号設定:異なるプロセスに使用するカード番号を入力する。例えば、0-0-1でカード0に2つのプロセス、カード1に1つのプロセスを実行する。", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "ステップ1:実験設定を入力します。実験データはlogsに保存され、各実験にはフォルダーがあります。実験名のパスを手動で入力する必要があり、実験設定、ログ、トレーニングされたモデルファイルが含まれます。", "step1:正在处理数据": "step1:処理中のデータ", + "step2:正在提取音高&正在提取特征": "step2:ピッチ抽出と特徴抽出", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "ステップ2a: 訓練フォルダー内のすべての音声ファイルを自動的に探索し、スライスと正規化を行い、2つのwavフォルダーを実験ディレクトリに生成します。現在は一人でのトレーニングのみをサポートしています。", "step2a:无需提取音高": "step2a:ピッチの抽出は不要", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "ステップ2b: CPUを使用して音高を抽出する(モデルに音高がある場合)、GPUを使用して特徴を抽出する(GPUの番号を選択する)", diff --git a/i18n/locale/ru_RU.json b/i18n/locale/ru_RU.json index 5dc4b27..d94216b 100644 --- a/i18n/locale/ru_RU.json +++ b/i18n/locale/ru_RU.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "Введите номера графических процессоров, разделенные символом «-», например, 0-0-1, чтобы запустить два процесса на GPU 0 и один процесс на GPU 1:", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Шаг 1. Конфигурирование модели. Данные обучения модели сохраняются в папку 'logs', и для каждой модели создаётся отдельная папка. Введите вручную путь к настройкам для модели, в которой находятся логи и тренировочные файлы.", "step1:正在处理数据": "Шаг 1. Переработка данных", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Шаг 2А. Автоматическая обработка исходных аудиозаписей для обучения и выполнение нормализации среза. Создаст 2 папки wav в папке модели. В данный момент поддерживается обучение только на одноголосных записях.", "step2a:无需提取音高": "Шаг 2А. Пропуск извлечения тональности", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Шаг 2Б. 
Оценка и извлечение тональности в аудиофайлах с помощью процессора (если включена поддержка изменения высоты звука), извлечение черт с помощью GPU (выберите номер GPU):", diff --git a/i18n/locale/tr_TR.json b/i18n/locale/tr_TR.json index 8e847c2..3b1b0eb 100644 --- a/i18n/locale/tr_TR.json +++ b/i18n/locale/tr_TR.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "Adım 1: Deneysel yapılandırmayı doldurun. Deneysel veriler 'logs' klasöründe saklanır ve her bir deney için ayrı bir klasör vardır. Deneysel adı yolu manuel olarak girin; bu yol, deneysel yapılandırmayı, günlükleri ve eğitilmiş model dosyalarını içerir.", "step1:正在处理数据": "Adım 1: Veri işleme", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "Adım 2a: Eğitim klasöründe ses dosyalarını otomatik olarak gezinerek dilimleme normalizasyonu yapın. Deney dizini içinde 2 wav klasörü oluşturur. Şu anda sadece tek kişilik eğitim desteklenmektedir.", "step2a:无需提取音高": "Adım 2a: Pitch çıkartma adımını atlama", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "Adım 2b: Ses yüksekliği (Pitch) çıkartmak için CPU kullanın (eğer model ses yüksekliği içeriyorsa), özellikleri çıkartmak için GPU kullanın (GPU indeksini seçin):", diff --git a/i18n/locale/zh_CN.json b/i18n/locale/zh_CN.json index 12f8738..96ca25b 100644 --- a/i18n/locale/zh_CN.json +++ b/i18n/locale/zh_CN.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)", diff --git a/i18n/locale/zh_HK.json b/i18n/locale/zh_HK.json index 52e0b40..a4ebff1 100644 --- a/i18n/locale/zh_HK.json +++ b/i18n/locale/zh_HK.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/i18n/locale/zh_SG.json b/i18n/locale/zh_SG.json index 52e0b40..a4ebff1 100644 --- a/i18n/locale/zh_SG.json +++ b/i18n/locale/zh_SG.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. 
": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/i18n/locale/zh_TW.json b/i18n/locale/zh_TW.json index 52e0b40..a4ebff1 100644 --- a/i18n/locale/zh_TW.json +++ b/i18n/locale/zh_TW.json @@ -15,6 +15,7 @@ "rmvpe卡号配置:以-分隔输入使用的不同进程卡号,例如0-0-1使用在卡0上跑2个进程并在卡1上跑1个进程": "rmvpe卡號配置:以-分隔輸入使用的不同進程卡號,例如0-0-1使用在卡0上跑2個進程並在卡1上跑1個進程", "step1: 填写实验配置. 实验数据放在logs下, 每个实验一个文件夹, 需手工输入实验名路径, 内含实验配置, 日志, 训练得到的模型文件. ": "step1:填寫實驗配置。實驗數據放在logs下,每個實驗一個資料夾,需手動輸入實驗名路徑,內含實驗配置、日誌、訓練得到的模型檔案。", "step1:正在处理数据": "step1:正在处理数据", + "step2:正在提取音高&正在提取特征": "step2:正在提取音高&正在提取特征", "step2a: 自动遍历训练文件夹下所有可解码成音频的文件并进行切片归一化, 在实验目录下生成2个wav文件夹; 暂时只支持单人训练. ": "step2a:自動遍歷訓練資料夾下所有可解碼成音頻的檔案並進行切片歸一化,在實驗目錄下生成2個wav資料夾;暫時只支援單人訓練。", "step2a:无需提取音高": "step2a:无需提取音高", "step2b: 使用CPU提取音高(如果模型带音高), 使用GPU提取特征(选择卡号)": "步驟2b: 使用CPU提取音高(如果模型帶音高), 使用GPU提取特徵(選擇卡號)", diff --git a/infer-web.py b/infer-web.py index d3a923b..9cfa723 100644 --- a/infer-web.py +++ b/infer-web.py @@ -705,280 +705,21 @@ def train1key( infos.append(strr) return "\n".join(infos) - model_log_dir = "%s/logs/%s" % (now_dir, exp_dir1) - preprocess_log_path = "%s/preprocess.log" % model_log_dir - extract_f0_feature_log_path = "%s/extract_f0_feature.log" % model_log_dir - gt_wavs_dir = "%s/0_gt_wavs" % model_log_dir - feature_dir = ( - "%s/3_feature256" % model_log_dir - if version19 == "v1" - else "%s/3_feature768" % model_log_dir - ) - - os.makedirs(model_log_dir, exist_ok=True) - #########step1:处理数据 - open(preprocess_log_path, "w").close() - cmd = ( - get_quoted_python_cmd() - + ' trainset_preprocess_pipeline_print.py "%s" %s %s "%s" ' - % (trainset_dir4, sr_dict[sr2], np7, model_log_dir) - + str(config.noparallel) - ) + ####### step1:处理数据 yield get_info_str(i18n("step1:正在处理数据")) - yield get_info_str(cmd) - p = Popen(cmd, shell=True) - p.wait() - with open(preprocess_log_path, "r") as f: - print(f.read()) - #########step2a:提取音高 - open(extract_f0_feature_log_path, "w") - if if_f0_3: - yield get_info_str("step2a:正在提取音高") - if f0method8 != "rmvpe_gpu": - cmd = config.python_cmd + ' extract_f0_print.py "%s" %s %s' % ( - model_log_dir, - np7, - f0method8, - ) - yield get_info_str(cmd) - p = Popen(cmd, shell=True, cwd=now_dir) - p.wait() - else: - if gpus_rmvpe != "-": - gpus_rmvpe = gpus_rmvpe.split("-") - leng = len(gpus_rmvpe) - ps = [] - for idx, n_g in enumerate(gpus_rmvpe): - cmd = ( - get_quoted_python_cmd() - + ' extract_f0_rmvpe.py %s %s %s "%s" %s ' - % ( - leng, - idx, - n_g, - model_log_dir, - config.is_half, - ) - ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - ps.append(p) - for p in ps: - p.wait() - else: # dml - cmd = config.python_cmd + ' extract_f0_rmvpe_dml.py "%s" ' % ( - model_log_dir - ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - p.wait() - with open(extract_f0_feature_log_path, "r") as f: - print(f.read()) - else: - yield get_info_str(i18n("step2a:无需提取音高")) - #######step2b:提取特征 - yield get_info_str(i18n("step2b:正在提取特征")) - gpus = gpus16.split("-") - leng = len(gpus) - ps = [] - for idx, n_g in 
enumerate(gpus): - cmd = ( - get_quoted_python_cmd() - + ' extract_feature_print.py %s %s %s %s "%s" %s' - % ( - config.device, - leng, - idx, - n_g, - model_log_dir, - version19, - ) - ) - yield get_info_str(cmd) - p = Popen( - cmd, shell=True, cwd=now_dir - ) # , shell=True, stdin=PIPE, stdout=PIPE, stderr=PIPE, cwd=now_dir - ps.append(p) - for p in ps: - p.wait() - with open(extract_f0_feature_log_path, "r") as f: - print(f.read()) - #######step3a:训练模型 + [get_info_str(_) for _ in preprocess_dataset(trainset_dir4, exp_dir1, sr2, np7)] + + ####### step2a:提取音高 + yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) + [get_info_str(_) for _ in extract_f0_feature(gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe)] + + ####### step3a:训练模型 yield get_info_str(i18n("step3a:正在训练模型")) - # 生成filelist - if if_f0_3: - f0_dir = "%s/2a_f0" % model_log_dir - f0nsf_dir = "%s/2b-f0nsf" % model_log_dir - names = ( - set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) - & set([name.split(".")[0] for name in os.listdir(feature_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0_dir)]) - & set([name.split(".")[0] for name in os.listdir(f0nsf_dir)]) - ) - else: - names = set([name.split(".")[0] for name in os.listdir(gt_wavs_dir)]) & set( - [name.split(".")[0] for name in os.listdir(feature_dir)] - ) - opt = [] - for name in names: - if if_f0_3: - opt.append( - "%s/%s.wav|%s/%s.npy|%s/%s.wav.npy|%s/%s.wav.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - f0_dir.replace("\\", "\\\\"), - name, - f0nsf_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - else: - opt.append( - "%s/%s.wav|%s/%s.npy|%s" - % ( - gt_wavs_dir.replace("\\", "\\\\"), - name, - feature_dir.replace("\\", "\\\\"), - name, - spk_id5, - ) - ) - fea_dim = 256 if version19 == "v1" else 768 - if if_f0_3: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s/logs/mute/2a_f0/mute.wav.npy|%s/logs/mute/2b-f0nsf/mute.wav.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, now_dir, now_dir, spk_id5) - ) - else: - for _ in range(2): - opt.append( - "%s/logs/mute/0_gt_wavs/mute%s.wav|%s/logs/mute/3_feature%s/mute.npy|%s" - % (now_dir, sr2, now_dir, fea_dim, spk_id5) - ) - shuffle(opt) - with open("%s/filelist.txt" % model_log_dir, "w") as f: - f.write("\n".join(opt)) - yield get_info_str("write filelist done") - if gpus16: - cmd = get_quoted_python_cmd() + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -g %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - gpus16, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - else: - cmd = ( - config.python_cmd - + ' train_nsf_sim_cache_sid_load_pretrain.py -e "%s" -sr %s -f0 %s -bs %s -te %s -se %s %s %s -l %s -c %s -sw %s -v %s' - % ( - exp_dir1, - sr2, - 1 if if_f0_3 else 0, - batch_size12, - total_epoch11, - save_epoch10, - "-pg %s" % pretrained_G14 if pretrained_G14 != "" else "", - "-pd %s" % pretrained_D15 if pretrained_D15 != "" else "", - 1 if if_save_latest13 == i18n("是") else 0, - 1 if if_cache_gpu17 == i18n("是") else 0, - 1 if if_save_every_weights18 == i18n("是") else 0, - version19, - ) - ) - yield get_info_str(cmd) - p = 
Popen(cmd, shell=True, cwd=now_dir) - p.wait() + click_train(exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17, if_save_every_weights18, version19) yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")) - #######step3b:训练索引 - npys = [] - listdir_res = list(os.listdir(feature_dir)) - for name in sorted(listdir_res): - phone = np.load("%s/%s" % (feature_dir, name)) - npys.append(phone) - big_npy = np.concatenate(npys, 0) - big_npy_idx = np.arange(big_npy.shape[0]) - np.random.shuffle(big_npy_idx) - big_npy = big_npy[big_npy_idx] - - if big_npy.shape[0] > 2e5: - # if(1): - info = "Trying doing kmeans %s shape to 10k centers." % big_npy.shape[0] - print(info) - yield get_info_str(info) - try: - big_npy = ( - MiniBatchKMeans( - n_clusters=10000, - verbose=True, - batch_size=256 * config.n_cpu, - compute_labels=False, - init="random", - ) - .fit(big_npy) - .cluster_centers_ - ) - except: - info = traceback.format_exc() - print(info) - yield get_info_str(info) - - np.save("%s/total_fea.npy" % model_log_dir, big_npy) - n_ivf = min(int(16 * np.sqrt(big_npy.shape[0])), big_npy.shape[0] // 39) - yield get_info_str("%s,%s" % (big_npy.shape, n_ivf)) - index = faiss.index_factory(256 if version19 == "v1" else 768, "IVF%s,Flat" % n_ivf) - yield get_info_str("training index") - index_ivf = faiss.extract_index_ivf(index) # - index_ivf.nprobe = 1 - index.train(big_npy) - faiss.write_index( - index, - "%s/trained_IVF%s_Flat_nprobe_%s_%s_%s.index" - % ( - model_log_dir.replace(now_dir + "/", ""), - n_ivf, - index_ivf.nprobe, - exp_dir1, - version19, - ), - ) - yield get_info_str("adding index") - batch_size_add = 8192 - for i in range(0, big_npy.shape[0], batch_size_add): - index.add(big_npy[i : i + batch_size_add]) - faiss.write_index( - index, - "%s/added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % ( - model_log_dir.replace(now_dir + "/", ""), - n_ivf, - index_ivf.nprobe, - exp_dir1, - version19, - ), - ) - yield get_info_str( - "成功构建索引, added_IVF%s_Flat_nprobe_%s_%s_%s.index" - % (n_ivf, index_ivf.nprobe, exp_dir1, version19) - ) + ####### step3b:训练索引 + [get_info_str(_) for _ in train_index(exp_dir1, version19)] yield get_info_str(i18n("全流程结束!")) From 12331b41f5c395164f6e80f47176e9d4feca7f0e Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:57:17 +0900 Subject: [PATCH 53/65] add opt and more --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 630c32e..22e9bf8 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,6 @@ __pycache__ hubert_base.pt /logs .venv +/opt +tools/aria2c/ +tools/flag.txt From 73627d22bb01b04cfc53b1cda1c7459a9b9ca645 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:57:32 +0900 Subject: [PATCH 54/65] fix import --- gui_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gui_v1.py b/gui_v1.py index 07cf5be..90b8817 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -49,12 +49,12 @@ if __name__ == "__main__": import noisereduce as nr import numpy as np import PySimpleGUI as sg - import rvc_for_realtime import sounddevice as sd import torch import torch.nn.functional as F import torchaudio.transforms as tat + import tools.rvc_for_realtime as rvc_for_realtime from i18n import I18nAuto i18n = I18nAuto() From cffdce209786d4ddc0e0e2b9bfd9dcb90aa88614 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: 
Mon, 28 Aug 2023 23:58:20 +0900 Subject: [PATCH 55/65] fix locale_diff.py --- i18n/locale_diff.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/i18n/locale_diff.py b/i18n/locale_diff.py index 479f22f..2cb2ec5 100644 --- a/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -3,10 +3,10 @@ import os from collections import OrderedDict # Define the standard file name -standard_file = "zh_CN.json" +standard_file = "i18n/locale/zh_CN.json" # Find all JSON files in the directory -dir_path = "i18n/locale" +dir_path = "i18n/locale/" languages = [ f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file ] @@ -18,7 +18,7 @@ with open(standard_file, "r", encoding="utf-8") as f: # Loop through each language file for lang_file in languages: # Load the language file - with open(lang_file, "r", encoding="utf-8") as f: + with open(dir_path + lang_file, "r", encoding="utf-8") as f: lang_data = json.load(f, object_pairs_hook=OrderedDict) # Find the difference between the language file and the standard file @@ -40,6 +40,6 @@ for lang_file in languages: ) # Save the updated language file - with open(lang_file, "w", encoding="utf-8") as f: + with open(dir_path + lang_file, "w", encoding="utf-8") as f: json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) f.write("\n") From 8eb87a4fd958812afad24568bd7b6b2b617551b2 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Mon, 28 Aug 2023 23:58:50 +0900 Subject: [PATCH 56/65] format --- infer-web.py | 24 +++++++++++++++++-- .../modules/train/extract/extract_f0_print.py | 1 + .../modules/train/extract/extract_f0_rmvpe.py | 1 + .../train/extract/extract_f0_rmvpe_dml.py | 1 + infer/modules/train/preprocess.py | 3 ++- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/infer-web.py b/infer-web.py index 9cfa723..b75b8f8 100644 --- a/infer-web.py +++ b/infer-web.py @@ -711,11 +711,31 @@ def train1key( ####### step2a:提取音高 yield get_info_str(i18n("step2:正在提取音高&正在提取特征")) - [get_info_str(_) for _ in extract_f0_feature(gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe)] + [ + get_info_str(_) + for _ in extract_f0_feature( + gpus16, np7, f0method8, if_f0_3, exp_dir1, version19, gpus_rmvpe + ) + ] ####### step3a:训练模型 yield get_info_str(i18n("step3a:正在训练模型")) - click_train(exp_dir1, sr2, if_f0_3, spk_id5, save_epoch10, total_epoch11, batch_size12, if_save_latest13, pretrained_G14, pretrained_D15, gpus16, if_cache_gpu17, if_save_every_weights18, version19) + click_train( + exp_dir1, + sr2, + if_f0_3, + spk_id5, + save_epoch10, + total_epoch11, + batch_size12, + if_save_latest13, + pretrained_G14, + pretrained_D15, + gpus16, + if_cache_gpu17, + if_save_every_weights18, + version19, + ) yield get_info_str(i18n("训练结束, 您可查看控制台训练日志或实验文件夹下的train.log")) ####### step3b:训练索引 diff --git a/infer/modules/train/extract/extract_f0_print.py b/infer/modules/train/extract/extract_f0_print.py index a3e9c9b..6949f1c 100644 --- a/infer/modules/train/extract/extract_f0_print.py +++ b/infer/modules/train/extract/extract_f0_print.py @@ -10,6 +10,7 @@ import logging import numpy as np import pyworld + from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/infer/modules/train/extract/extract_f0_rmvpe.py b/infer/modules/train/extract/extract_f0_rmvpe.py index 98a91e7..52d7492 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe.py +++ b/infer/modules/train/extract/extract_f0_rmvpe.py @@ -10,6 +10,7 @@ import logging import numpy as np 
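The reformatted train1key calls above delegate each stage to a helper that yields its own log lines. A self-contained sketch of that orchestration pattern is below; the stage bodies are stand-ins, not the real preprocess/extract/train helpers from infer-web.py.

    # Each stage is a generator of log lines; the driver drains them in order
    # and yields a rolling report, mirroring the get_info_str accumulation.
    def preprocess_stage():
        yield "step1: preprocessing data"

    def extract_stage():
        yield "step2: extracting pitch"
        yield "step2: extracting features"

    def one_click(stages):
        infos = []
        for stage in stages:
            for line in stage():
                infos.append(line)
                yield "\n".join(infos)

    if __name__ == "__main__":
        for report in one_click([preprocess_stage, extract_stage]):
            print(report)

Keeping every stage behind the same generator interface is what lets the one-click path reuse the per-tab handlers instead of duplicating their subprocess logic.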
import pyworld + from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/infer/modules/train/extract/extract_f0_rmvpe_dml.py b/infer/modules/train/extract/extract_f0_rmvpe_dml.py index 2d5c7f4..2d812ab 100644 --- a/infer/modules/train/extract/extract_f0_rmvpe_dml.py +++ b/infer/modules/train/extract/extract_f0_rmvpe_dml.py @@ -10,6 +10,7 @@ import logging import numpy as np import pyworld + from infer.lib.audio import load_audio logging.getLogger("numba").setLevel(logging.WARNING) diff --git a/infer/modules/train/preprocess.py b/infer/modules/train/preprocess.py index 7ff76ee..c57b5dc 100644 --- a/infer/modules/train/preprocess.py +++ b/infer/modules/train/preprocess.py @@ -18,9 +18,10 @@ import traceback import librosa import numpy as np +from scipy.io import wavfile + from infer.lib.audio import load_audio from infer.lib.slicer2 import Slicer -from scipy.io import wavfile mutex = multiprocessing.Lock() f = open("%s/preprocess.log" % exp_dir, "a+") From 19670917c64538e240b6ad2d1c5564d9b61acb3f Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:04:25 +0900 Subject: [PATCH 57/65] fix configs.json --- configs/v1/32k.json | 2 +- configs/v1/40k.json | 2 +- configs/v1/48k.json | 2 +- configs/v2/32k.json | 2 +- configs/v2/48k.json | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/v1/32k.json b/configs/v1/32k.json index 400b6be..d5f16d6 100644 --- a/configs/v1/32k.json +++ b/configs/v1/32k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/v1/40k.json b/configs/v1/40k.json index cb30b8b..4ffc87b 100644 --- a/configs/v1/40k.json +++ b/configs/v1/40k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/v1/48k.json b/configs/v1/48k.json index 6875991..2d0e05b 100644 --- a/configs/v1/48k.json +++ b/configs/v1/48k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 11520, "init_lr_ratio": 1, diff --git a/configs/v2/32k.json b/configs/v2/32k.json index 36adb8a..70e534f 100644 --- a/configs/v2/32k.json +++ b/configs/v2/32k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 12800, "init_lr_ratio": 1, diff --git a/configs/v2/48k.json b/configs/v2/48k.json index 73ee363..75f770c 100644 --- a/configs/v2/48k.json +++ b/configs/v2/48k.json @@ -7,7 +7,7 @@ "betas": [0.8, 0.99], "eps": 1e-9, "batch_size": 4, - "fp16_run": false, + "fp16_run": true, "lr_decay": 0.999875, "segment_size": 17280, "init_lr_ratio": 1, From ea1047ec8e72cc03b2db915f39059a0b80f0a748 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 15:41:19 +0900 Subject: [PATCH 58/65] fix genlocale --- .github/workflows/genlocale.yml | 4 ++-- i18n/locale_diff.py | 8 +++++--- i18n/{locale => }/scan_i18n.py | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) rename i18n/{locale => }/scan_i18n.py (91%) diff --git a/.github/workflows/genlocale.yml b/.github/workflows/genlocale.yml index ebed03a..d678a15 100644 --- a/.github/workflows/genlocale.yml +++ 
b/.github/workflows/genlocale.yml @@ -13,8 +13,8 @@ jobs: - name: Run locale generation run: | - python3 lib/i18n/scan_i18n.py - cd lib/i18n && python3 locale_diff.py + python3 i18n/scan_i18n.py + python3 i18n/locale_diff.py - name: Commit back if: ${{ !github.head_ref }} diff --git a/i18n/locale_diff.py b/i18n/locale_diff.py index 2cb2ec5..7c7d05c 100644 --- a/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -8,7 +8,9 @@ standard_file = "i18n/locale/zh_CN.json" # Find all JSON files in the directory dir_path = "i18n/locale/" languages = [ - f for f in os.listdir(dir_path) if f.endswith(".json") and f != standard_file + os.path.join(dir_path, f) + for f in os.listdir(dir_path) + if f.endswith(".json") and f != standard_file ] # Load the standard file @@ -18,7 +20,7 @@ with open(standard_file, "r", encoding="utf-8") as f: # Loop through each language file for lang_file in languages: # Load the language file - with open(dir_path + lang_file, "r", encoding="utf-8") as f: + with open(lang_file, "r", encoding="utf-8") as f: lang_data = json.load(f, object_pairs_hook=OrderedDict) # Find the difference between the language file and the standard file @@ -40,6 +42,6 @@ for lang_file in languages: ) # Save the updated language file - with open(dir_path + lang_file, "w", encoding="utf-8") as f: + with open(lang_file, "w", encoding="utf-8") as f: json.dump(lang_data, f, ensure_ascii=False, indent=4, sort_keys=True) f.write("\n") diff --git a/i18n/locale/scan_i18n.py b/i18n/scan_i18n.py similarity index 91% rename from i18n/locale/scan_i18n.py rename to i18n/scan_i18n.py index b5fe055..f3e52cf 100644 --- a/i18n/locale/scan_i18n.py +++ b/i18n/scan_i18n.py @@ -49,8 +49,8 @@ print() print("Total unique:", len(code_keys)) -standard_file = "zh_CN.json" -with open(f"lib/i18n/{standard_file}", "r", encoding="utf-8") as f: +standard_file = "i18n/locale/zh_CN.json" +with open(standard_file, "r", encoding="utf-8") as f: standard_data = json.load(f, object_pairs_hook=OrderedDict) standard_keys = set(standard_data.keys()) @@ -70,6 +70,6 @@ for s in strings: code_keys_dict[s] = s # write back -with open(f"lib/i18n/{standard_file}", "w", encoding="utf-8") as f: +with open(standard_file, "w", encoding="utf-8") as f: json.dump(code_keys_dict, f, ensure_ascii=False, indent=4, sort_keys=True) f.write("\n") From fd78036f640075e79143b50c28f820849e388575 Mon Sep 17 00:00:00 2001 From: Ftps Date: Tue, 29 Aug 2023 15:47:17 +0900 Subject: [PATCH 59/65] fix gui_v1 --- gui_v1.py | 2 +- tools/rvc_for_realtime.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 90b8817..5edb349 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -55,7 +55,7 @@ if __name__ == "__main__": import torchaudio.transforms as tat import tools.rvc_for_realtime as rvc_for_realtime - from i18n import I18nAuto + from i18n.i18n import I18nAuto i18n = I18nAuto() device = rvc_for_realtime.config.device diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 32316c3..734d926 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -27,7 +27,7 @@ from multiprocessing import Manager as M from configs.config import Config -Config() +config = Config() mm = M() if config.dml == True: From 89eecdceabe1654490df70518f1eb46ff6c6191c Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 16:00:24 +0900 Subject: [PATCH 60/65] fix gui_v1 --- configs/config.json | 15 +-------------- gui_v1.py | 2 +- tools/rvc_for_realtime.py | 6 +++--- 3 files changed, 5 
insertions(+), 18 deletions(-) diff --git a/configs/config.json b/configs/config.json index 6bf4f6b..62813e1 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1,14 +1 @@ -{ - "pth_path": "weights/kikiV1.pth", - "index_path": "logs/kikiV1.index", - "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", - "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)", - "threhold": -45.0, - "pitch": 0.0, - "index_rate": 1.0, - "block_time": 0.09, - "crossfade_length": 0.15, - "extra_time": 5.0, - "n_cpu": 8.0, - "f0method": "rmvpe" -} +{"pth_path": "F:/src/Retrieval-based-Voice-Conversion-WebUI/assets/weights/Mahiro.pth", "index_path": "F:/src/Retrieval-based-Voice-Conversion-WebUI/logs/Mahiro_added_IVF94_Flat_nprobe_3.index", "sg_input_device": "Microphone (USB Advanced Audio Device) (Windows DirectSound)", "sg_output_device": "Output 1/2 (Komplete Audio 6 MK2) (Windows DirectSound)", "threhold": -45.0, "pitch": 0.0, "index_rate": 1.0, "block_time": 0.09, "crossfade_length": 0.15, "extra_time": 5.0, "n_cpu": 8.0, "f0method": "rmvpe"} \ No newline at end of file diff --git a/gui_v1.py b/gui_v1.py index 5edb349..8ed0e38 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -139,7 +139,7 @@ if __name__ == "__main__": ), sg.FileBrowse( i18n("选择.pth文件"), - initial_folder=os.path.join(os.getcwd(), "weights"), + initial_folder=os.path.join(os.getcwd(), "assets/weights"), file_types=((". pth"),), ), ], diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 734d926..601efcb 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -70,7 +70,7 @@ class RVC: print("index search enabled") self.index_rate = index_rate models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["hubert_base.pt"], + ["assets/hubert/hubert_base.pt"], suffix="", ) hubert_model = models[0] @@ -224,14 +224,14 @@ class RVC: def get_f0_rmvpe(self, x, f0_up_key): if hasattr(self, "model_rmvpe") == False: - from lib.rmvpe import RMVPE + from infer.lib.rmvpe import RMVPE print("loading rmvpe model") self.model_rmvpe = RMVPE( # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 # "rmvpe.pt", is_half=False, device=self.device####dml配置 # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 - "rmvpe.pt", + "assets/rmvpe/rmvpe.pt", is_half=self.is_half, device=self.device, ####正常逻辑 ) From ccbb8e03d5bdcb8adbcae8a7e50a26a6ab564c20 Mon Sep 17 00:00:00 2001 From: Tps-F Date: Tue, 29 Aug 2023 07:01:02 +0000 Subject: [PATCH 61/65] Apply Code Formatter Change --- gui_v1.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gui_v1.py b/gui_v1.py index 8ed0e38..16ae1b3 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -139,7 +139,9 @@ if __name__ == "__main__": ), sg.FileBrowse( i18n("选择.pth文件"), - initial_folder=os.path.join(os.getcwd(), "assets/weights"), + initial_folder=os.path.join( + os.getcwd(), "assets/weights" + ), file_types=((". 
pth"),), ), ], From 7348815c67b7b903fa23198a0fa42e6cd641b303 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 16:14:11 +0900 Subject: [PATCH 62/65] fix import --- tools/export_onnx.py | 2 +- tools/onnx_inference_demo.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/export_onnx.py b/tools/export_onnx.py index a38f943..822e09e 100644 --- a/tools/export_onnx.py +++ b/tools/export_onnx.py @@ -1,5 +1,5 @@ import torch -from lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM +from infer.lib.infer_pack.models_onnx import SynthesizerTrnMsNSFsidM if __name__ == "__main__": MoeVS = True # 模型是否为MoeVoiceStudio(原MoeSS)使用 diff --git a/tools/onnx_inference_demo.py b/tools/onnx_inference_demo.py index 03bb868..bd9ef1c 100644 --- a/tools/onnx_inference_demo.py +++ b/tools/onnx_inference_demo.py @@ -1,6 +1,6 @@ import soundfile -from ..lib.infer_pack.onnx_inference import OnnxRVC +from ..infer.lib.infer_pack.onnx_inference import OnnxRVC hop_size = 512 sampling_rate = 40000 # 采样率 From ae171a117209e8dfef2b98e1b71219be399916cf Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 17:15:19 +0900 Subject: [PATCH 63/65] fix import --- tools/infer/infer-pm-index256.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/infer/infer-pm-index256.py b/tools/infer/infer-pm-index256.py index 92be320..efaaa81 100644 --- a/tools/infer/infer-pm-index256.py +++ b/tools/infer/infer-pm-index256.py @@ -22,7 +22,7 @@ from fairseq import checkpoint_utils # from models import SynthesizerTrn256#hifigan_nonsf # from lib.infer_pack.models import SynthesizerTrn256NSF as SynthesizerTrn256#hifigan_nsf -from lib.infer_pack.models import ( +from infer.lib.infer_pack.models import ( SynthesizerTrnMs256NSFsid as SynthesizerTrn256, ) # hifigan_nsf from scipy.io import wavfile From 50944921aee141442081a15fb3f5180674fa20cf Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 19:36:51 +0900 Subject: [PATCH 64/65] revert config.json --- configs/config.json | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/configs/config.json b/configs/config.json index 62813e1..6bf4f6b 100644 --- a/configs/config.json +++ b/configs/config.json @@ -1 +1,14 @@ -{"pth_path": "F:/src/Retrieval-based-Voice-Conversion-WebUI/assets/weights/Mahiro.pth", "index_path": "F:/src/Retrieval-based-Voice-Conversion-WebUI/logs/Mahiro_added_IVF94_Flat_nprobe_3.index", "sg_input_device": "Microphone (USB Advanced Audio Device) (Windows DirectSound)", "sg_output_device": "Output 1/2 (Komplete Audio 6 MK2) (Windows DirectSound)", "threhold": -45.0, "pitch": 0.0, "index_rate": 1.0, "block_time": 0.09, "crossfade_length": 0.15, "extra_time": 5.0, "n_cpu": 8.0, "f0method": "rmvpe"} \ No newline at end of file +{ + "pth_path": "weights/kikiV1.pth", + "index_path": "logs/kikiV1.index", + "sg_input_device": "VoiceMeeter Output (VB-Audio Vo (MME)", + "sg_output_device": "VoiceMeeter Aux Input (VB-Audio (MME)", + "threhold": -45.0, + "pitch": 0.0, + "index_rate": 1.0, + "block_time": 0.09, + "crossfade_length": 0.15, + "extra_time": 5.0, + "n_cpu": 8.0, + "f0method": "rmvpe" +} From d880f0d19f24c5aed0425e9d76066c6a0f90fec2 Mon Sep 17 00:00:00 2001 From: Ftps <63702646+Tps-F@users.noreply.github.com> Date: Tue, 29 Aug 2023 20:34:14 +0900 Subject: [PATCH 65/65] fix genlocale --- .github/workflows/genlocale.yml | 3 ++- i18n/locale_diff.py | 4 ++-- 2 files 
changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/genlocale.yml b/.github/workflows/genlocale.yml index d678a15..96a29e8 100644 --- a/.github/workflows/genlocale.yml +++ b/.github/workflows/genlocale.yml @@ -14,7 +14,8 @@ jobs: - name: Run locale generation run: | python3 i18n/scan_i18n.py - python3 i18n/locale_diff.py + cd i18n + python3 locale_diff.py - name: Commit back if: ${{ !github.head_ref }} diff --git a/i18n/locale_diff.py b/i18n/locale_diff.py index 7c7d05c..674f7dd 100644 --- a/i18n/locale_diff.py +++ b/i18n/locale_diff.py @@ -3,10 +3,10 @@ import os from collections import OrderedDict # Define the standard file name -standard_file = "i18n/locale/zh_CN.json" +standard_file = "locale/zh_CN.json" # Find all JSON files in the directory -dir_path = "i18n/locale/" +dir_path = "locale/" languages = [ os.path.join(dir_path, f) for f in os.listdir(dir_path)