From f431f8fb3f13aa6dfedf33383f70de35fe07dfbd Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 23:01:02 +0900 Subject: [PATCH 1/4] chore(format): run black on main (#1448) Co-authored-by: github-actions[bot] --- infer-web.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/infer-web.py b/infer-web.py index 4cdfef3..9c356c1 100644 --- a/infer-web.py +++ b/infer-web.py @@ -1,6 +1,7 @@ import os import sys from dotenv import load_dotenv + now_dir = os.getcwd() sys.path.append(now_dir) load_dotenv() @@ -32,7 +33,6 @@ import shutil import logging - logging.getLogger("numba").setLevel(logging.WARNING) logger = logging.getLogger(__name__) @@ -438,8 +438,9 @@ def change_version19(sr2, if_f0_3, version19): def change_f0(if_f0_3, sr2, version19): # f0method8,pretrained_G14,pretrained_D15 path_str = "" if version19 == "v1" else "_v2" return ( - {"visible": if_f0_3, "__type__": "update"},{"visible": if_f0_3, "__type__": "update"}, - *get_pretrained_models(path_str, "f0"if if_f0_3==True else "", sr2), + {"visible": if_f0_3, "__type__": "update"}, + {"visible": if_f0_3, "__type__": "update"}, + *get_pretrained_models(path_str, "f0" if if_f0_3 == True else "", sr2), ) @@ -780,9 +781,7 @@ with gr.Blocks(title="RVC WebUI") as app: with gr.Row(): sid0 = gr.Dropdown(label=i18n("推理音色"), choices=sorted(names)) with gr.Column(): - refresh_button = gr.Button( - i18n("刷新音色列表和索引路径"), variant="primary" - ) + refresh_button = gr.Button(i18n("刷新音色列表和索引路径"), variant="primary") clean_button = gr.Button(i18n("卸载音色省显存"), variant="primary") spk_item = gr.Slider( minimum=0, @@ -872,7 +871,8 @@ with gr.Blocks(title="RVC WebUI") as app: interactive=True, ) f0_file = gr.File( - label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"),visible=False + label=i18n("F0曲线文件, 可选, 一行一个音高, 代替默认F0及升降调"), + visible=False, ) refresh_button.click( @@ -1282,7 +1282,7 @@ with gr.Blocks(title="RVC WebUI") as app: if_f0_3.change( change_f0, [if_f0_3, sr2, version19], - [f0method8, gpus_rmvpe,pretrained_G14, pretrained_D15], + [f0method8, gpus_rmvpe, pretrained_G14, pretrained_D15], ) gpus16 = gr.Textbox( label=i18n("以-分隔输入使用的卡号, 例如 0-1-2 使用卡0和卡1和卡2"), From e7e9d5934d847f3c43ff767fe482b689d114d6fc Mon Sep 17 00:00:00 2001 From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com> Date: Thu, 14 Dec 2023 21:01:46 +0800 Subject: [PATCH 2/4] add fcpe for realtime --- tools/rvc_for_realtime.py | 859 +++++++++++++++++++------------------- 1 file changed, 438 insertions(+), 421 deletions(-) diff --git a/tools/rvc_for_realtime.py b/tools/rvc_for_realtime.py index 68358bb..8e16e87 100644 --- a/tools/rvc_for_realtime.py +++ b/tools/rvc_for_realtime.py @@ -1,421 +1,438 @@ -from io import BytesIO -import os -import pickle -import sys -import traceback -from infer.lib import jit -from infer.lib.jit.get_synthesizer import get_synthesizer -from time import time as ttime -import fairseq -import faiss -import numpy as np -import parselmouth -import pyworld -import scipy.signal as signal -import torch -import torch.nn as nn -import torch.nn.functional as F -import torchcrepe - -from infer.lib.infer_pack.models import ( - SynthesizerTrnMs256NSFsid, - SynthesizerTrnMs256NSFsid_nono, - SynthesizerTrnMs768NSFsid, - SynthesizerTrnMs768NSFsid_nono, -) - -now_dir = os.getcwd() -sys.path.append(now_dir) -from multiprocessing import Manager as M - -from configs.config import Config - -# config = Config() - -mm = M() - - -def printt(strr, *args): - if len(args) == 0: - 
print(strr) - else: - print(strr % args) - - -# config.device=torch.device("cpu")########强制cpu测试 -# config.is_half=False########强制cpu测试 -class RVC: - def __init__( - self, - key, - pth_path, - index_path, - index_rate, - n_cpu, - inp_q, - opt_q, - config: Config, - last_rvc=None, - ) -> None: - """ - 初始化 - """ - try: - if config.dml == True: - - def forward_dml(ctx, x, scale): - ctx.scale = scale - res = x.clone().detach() - return res - - fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml - # global config - self.config = config - self.inp_q = inp_q - self.opt_q = opt_q - # device="cpu"########强制cpu测试 - self.device = config.device - self.f0_up_key = key - self.time_step = 160 / 16000 * 1000 - self.f0_min = 50 - self.f0_max = 1100 - self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) - self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) - self.sr = 16000 - self.window = 160 - self.n_cpu = n_cpu - self.use_jit = self.config.use_jit - self.is_half = config.is_half - - if index_rate != 0: - self.index = faiss.read_index(index_path) - self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) - printt("Index search enabled") - self.pth_path: str = pth_path - self.index_path = index_path - self.index_rate = index_rate - - if last_rvc is None: - models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( - ["assets/hubert/hubert_base.pt"], - suffix="", - ) - hubert_model = models[0] - hubert_model = hubert_model.to(self.device) - if self.is_half: - hubert_model = hubert_model.half() - else: - hubert_model = hubert_model.float() - hubert_model.eval() - self.model = hubert_model - else: - self.model = last_rvc.model - - self.net_g: nn.Module = None - - def set_default_model(): - self.net_g, cpt = get_synthesizer(self.pth_path, self.device) - self.tgt_sr = cpt["config"][-1] - cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] - self.if_f0 = cpt.get("f0", 1) - self.version = cpt.get("version", "v1") - if self.is_half: - self.net_g = self.net_g.half() - else: - self.net_g = self.net_g.float() - - def set_jit_model(): - jit_pth_path = self.pth_path.rstrip(".pth") - jit_pth_path += ".half.jit" if self.is_half else ".jit" - reload = False - if str(self.device) == "cuda": - self.device = torch.device("cuda:0") - if os.path.exists(jit_pth_path): - cpt = jit.load(jit_pth_path) - model_device = cpt["device"] - if model_device != str(self.device): - reload = True - else: - reload = True - - if reload: - cpt = jit.synthesizer_jit_export( - self.pth_path, - "script", - None, - device=self.device, - is_half=self.is_half, - ) - - self.tgt_sr = cpt["config"][-1] - self.if_f0 = cpt.get("f0", 1) - self.version = cpt.get("version", "v1") - self.net_g = torch.jit.load( - BytesIO(cpt["model"]), map_location=self.device - ) - self.net_g.infer = self.net_g.forward - self.net_g.eval().to(self.device) - - def set_synthesizer(): - if self.use_jit and not config.dml: - if self.is_half and "cpu" in str(self.device): - printt( - "Use default Synthesizer model. 
\ - Jit is not supported on the CPU for half floating point" - ) - set_default_model() - else: - set_jit_model() - else: - set_default_model() - - if last_rvc is None or last_rvc.pth_path != self.pth_path: - set_synthesizer() - else: - self.tgt_sr = last_rvc.tgt_sr - self.if_f0 = last_rvc.if_f0 - self.version = last_rvc.version - self.is_half = last_rvc.is_half - if last_rvc.use_jit != self.use_jit: - set_synthesizer() - else: - self.net_g = last_rvc.net_g - - if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): - self.model_rmvpe = last_rvc.model_rmvpe - except: - printt(traceback.format_exc()) - - def change_key(self, new_key): - self.f0_up_key = new_key - - def change_index_rate(self, new_index_rate): - if new_index_rate != 0 and self.index_rate == 0: - self.index = faiss.read_index(self.index_path) - self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) - printt("Index search enabled") - self.index_rate = new_index_rate - - def get_f0_post(self, f0): - f0_min = self.f0_min - f0_max = self.f0_max - f0_mel_min = 1127 * np.log(1 + f0_min / 700) - f0_mel_max = 1127 * np.log(1 + f0_max / 700) - f0bak = f0.copy() - f0_mel = 1127 * np.log(1 + f0 / 700) - f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( - f0_mel_max - f0_mel_min - ) + 1 - f0_mel[f0_mel <= 1] = 1 - f0_mel[f0_mel > 255] = 255 - f0_coarse = np.rint(f0_mel).astype(np.int32) - return f0_coarse, f0bak - - def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): - n_cpu = int(n_cpu) - if method == "crepe": - return self.get_f0_crepe(x, f0_up_key) - if method == "rmvpe": - return self.get_f0_rmvpe(x, f0_up_key) - if method == "pm": - p_len = x.shape[0] // 160 + 1 - f0_min = 65 - l_pad = int(np.ceil(1.5 / f0_min * 16000)) - r_pad = l_pad + 1 - s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac( - time_step=0.01, - voicing_threshold=0.6, - pitch_floor=f0_min, - pitch_ceiling=1100, - ) - assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 - f0 = s.selected_array["frequency"] - if len(f0) < p_len: - f0 = np.pad(f0, (0, p_len - len(f0))) - f0 = f0[:p_len] - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - if n_cpu == 1: - f0, t = pyworld.harvest( - x.astype(np.double), - fs=16000, - f0_ceil=1100, - f0_floor=50, - frame_period=10, - ) - f0 = signal.medfilt(f0, 3) - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64) - length = len(x) - part_length = 160 * ((length // 160 - 1) // n_cpu + 1) - n_cpu = (length // 160 - 1) // (part_length // 160) + 1 - ts = ttime() - res_f0 = mm.dict() - for idx in range(n_cpu): - tail = part_length * (idx + 1) + 320 - if idx == 0: - self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) - else: - self.inp_q.put( - (idx, x[part_length * idx - 320 : tail], res_f0, n_cpu, ts) - ) - while 1: - res_ts = self.opt_q.get() - if res_ts == ts: - break - f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] - for idx, f0 in enumerate(f0s): - if idx == 0: - f0 = f0[:-3] - elif idx != n_cpu - 1: - f0 = f0[2:-3] - else: - f0 = f0[2:] - f0bak[ - part_length * idx // 160 : part_length * idx // 160 + f0.shape[0] - ] = f0 - f0bak = signal.medfilt(f0bak, 3) - f0bak *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0bak) - - def get_f0_crepe(self, x, f0_up_key): - if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 - return self.get_f0(x, f0_up_key, 1, "pm") - audio = torch.tensor(np.copy(x))[None].float() - # printt("using crepe,device:%s"%self.device) - f0, pd = torchcrepe.predict( - 
audio, - self.sr, - 160, - self.f0_min, - self.f0_max, - "full", - batch_size=512, - # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用 - device=self.device, - return_periodicity=True, - ) - pd = torchcrepe.filter.median(pd, 3) - f0 = torchcrepe.filter.mean(f0, 3) - f0[pd < 0.1] = 0 - f0 = f0[0].cpu().numpy() - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - - def get_f0_rmvpe(self, x, f0_up_key): - if hasattr(self, "model_rmvpe") == False: - from infer.lib.rmvpe import RMVPE - - printt("Loading rmvpe model") - self.model_rmvpe = RMVPE( - # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 - # "rmvpe.pt", is_half=False, device=self.device####dml配置 - # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 - "assets/rmvpe/rmvpe.pt", - is_half=self.is_half, - device=self.device, ####正常逻辑 - use_jit=self.config.use_jit, - ) - # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) - f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) - f0 *= pow(2, f0_up_key / 12) - return self.get_f0_post(f0) - - def infer( - self, - feats: torch.Tensor, - indata: np.ndarray, - block_frame_16k, - rate, - cache_pitch, - cache_pitchf, - f0method, - ) -> np.ndarray: - feats = feats.view(1, -1) - if self.config.is_half: - feats = feats.half() - else: - feats = feats.float() - feats = feats.to(self.device) - t1 = ttime() - with torch.no_grad(): - padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) - inputs = { - "source": feats, - "padding_mask": padding_mask, - "output_layer": 9 if self.version == "v1" else 12, - } - logits = self.model.extract_features(**inputs) - feats = ( - self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] - ) - feats = torch.cat((feats, feats[:, -1:, :]), 1) - t2 = ttime() - try: - if hasattr(self, "index") and self.index_rate != 0: - leng_replace_head = int(rate * feats[0].shape[0]) - npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") - score, ix = self.index.search(npy, k=8) - weight = np.square(1 / score) - weight /= weight.sum(axis=1, keepdims=True) - npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) - if self.config.is_half: - npy = npy.astype("float16") - feats[0][-leng_replace_head:] = ( - torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate - + (1 - self.index_rate) * feats[0][-leng_replace_head:] - ) - else: - printt("Index search FAILED or disabled") - except: - traceback.print_exc() - printt("Index search FAILED") - feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) - t3 = ttime() - if self.if_f0 == 1: - pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) - start_frame = block_frame_16k // 160 - end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame - cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) - cache_pitchf[:] = np.append( - cache_pitchf[start_frame:end_frame], pitchf[3:-1] - ) - p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) - else: - cache_pitch, cache_pitchf = None, None - p_len = min(feats.shape[1], 13000) - t4 = ttime() - feats = feats[:, :p_len, :] - if self.if_f0 == 1: - cache_pitch = cache_pitch[:p_len] - cache_pitchf = cache_pitchf[:p_len] - cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) - cache_pitchf = 
torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) - p_len = torch.LongTensor([p_len]).to(self.device) - ii = 0 # sid - sid = torch.LongTensor([ii]).to(self.device) - with torch.no_grad(): - if self.if_f0 == 1: - # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) - infered_audio = self.net_g.infer( - feats, - p_len, - cache_pitch, - cache_pitchf, - sid, - torch.FloatTensor([rate]), - )[0][0, 0].data.float() - else: - infered_audio = self.net_g.infer( - feats, p_len, sid, torch.FloatTensor([rate]) - )[0][0, 0].data.float() - t5 = ttime() - printt( - "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", - t2 - t1, - t3 - t2, - t4 - t3, - t5 - t4, - ) - return infered_audio +from io import BytesIO +import os +import pickle +import sys +import traceback +from infer.lib import jit +from infer.lib.jit.get_synthesizer import get_synthesizer +from time import time as ttime +import fairseq +import faiss +import numpy as np +import parselmouth +import pyworld +import scipy.signal as signal +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchcrepe + +from infer.lib.infer_pack.models import ( + SynthesizerTrnMs256NSFsid, + SynthesizerTrnMs256NSFsid_nono, + SynthesizerTrnMs768NSFsid, + SynthesizerTrnMs768NSFsid_nono, +) + +now_dir = os.getcwd() +sys.path.append(now_dir) +from multiprocessing import Manager as M + +from configs.config import Config + +# config = Config() + +mm = M() + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +# config.device=torch.device("cpu")########强制cpu测试 +# config.is_half=False########强制cpu测试 +class RVC: + def __init__( + self, + key, + pth_path, + index_path, + index_rate, + n_cpu, + inp_q, + opt_q, + config: Config, + last_rvc=None, + ) -> None: + """ + 初始化 + """ + try: + if config.dml == True: + def forward_dml(ctx, x, scale): + ctx.scale = scale + res = x.clone().detach() + return res + + fairseq.modules.grad_multiply.GradMultiply.forward = forward_dml + # global config + self.config = config + self.inp_q = inp_q + self.opt_q = opt_q + # device="cpu"########强制cpu测试 + self.device = config.device + self.f0_up_key = key + self.time_step = 160 / 16000 * 1000 + self.f0_min = 50 + self.f0_max = 1100 + self.f0_mel_min = 1127 * np.log(1 + self.f0_min / 700) + self.f0_mel_max = 1127 * np.log(1 + self.f0_max / 700) + self.sr = 16000 + self.window = 160 + self.n_cpu = n_cpu + self.use_jit = self.config.use_jit + self.is_half = config.is_half + + if index_rate != 0: + self.index = faiss.read_index(index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.pth_path: str = pth_path + self.index_path = index_path + self.index_rate = index_rate + + if last_rvc is None: + models, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task( + ["assets/hubert/hubert_base.pt"], + suffix="", + ) + hubert_model = models[0] + hubert_model = hubert_model.to(self.device) + if self.is_half: + hubert_model = hubert_model.half() + else: + hubert_model = hubert_model.float() + hubert_model.eval() + self.model = hubert_model + else: + self.model = last_rvc.model + + self.net_g: nn.Module = None + + def set_default_model(): + self.net_g, cpt = get_synthesizer(self.pth_path, self.device) + self.tgt_sr = cpt["config"][-1] + cpt["config"][-3] = cpt["weight"]["emb_g.weight"].shape[0] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + if self.is_half: + self.net_g = 
self.net_g.half() + else: + self.net_g = self.net_g.float() + + def set_jit_model(): + jit_pth_path = self.pth_path.rstrip(".pth") + jit_pth_path += ".half.jit" if self.is_half else ".jit" + reload = False + if str(self.device) == "cuda": + self.device = torch.device("cuda:0") + if os.path.exists(jit_pth_path): + cpt = jit.load(jit_pth_path) + model_device = cpt["device"] + if model_device != str(self.device): + reload = True + else: + reload = True + + if reload: + cpt = jit.synthesizer_jit_export( + self.pth_path, + "script", + None, + device=self.device, + is_half=self.is_half, + ) + + self.tgt_sr = cpt["config"][-1] + self.if_f0 = cpt.get("f0", 1) + self.version = cpt.get("version", "v1") + self.net_g = torch.jit.load( + BytesIO(cpt["model"]), map_location=self.device + ) + self.net_g.infer = self.net_g.forward + self.net_g.eval().to(self.device) + + def set_synthesizer(): + if self.use_jit and not config.dml: + if self.is_half and "cpu" in str(self.device): + printt( + "Use default Synthesizer model. \ + Jit is not supported on the CPU for half floating point" + ) + set_default_model() + else: + set_jit_model() + else: + set_default_model() + + if last_rvc is None or last_rvc.pth_path != self.pth_path: + set_synthesizer() + else: + self.tgt_sr = last_rvc.tgt_sr + self.if_f0 = last_rvc.if_f0 + self.version = last_rvc.version + self.is_half = last_rvc.is_half + if last_rvc.use_jit != self.use_jit: + set_synthesizer() + else: + self.net_g = last_rvc.net_g + + if last_rvc is not None and hasattr(last_rvc, "model_rmvpe"): + self.model_rmvpe = last_rvc.model_rmvpe + if last_rvc is not None and hasattr(last_rvc, "model_fcpe"): + self.model_fcpe = last_rvc.model_fcpe + except: + printt(traceback.format_exc()) + + def change_key(self, new_key): + self.f0_up_key = new_key + + def change_index_rate(self, new_index_rate): + if new_index_rate != 0 and self.index_rate == 0: + self.index = faiss.read_index(self.index_path) + self.big_npy = self.index.reconstruct_n(0, self.index.ntotal) + printt("Index search enabled") + self.index_rate = new_index_rate + + def get_f0_post(self, f0): + f0_min = self.f0_min + f0_max = self.f0_max + f0_mel_min = 1127 * np.log(1 + f0_min / 700) + f0_mel_max = 1127 * np.log(1 + f0_max / 700) + f0bak = f0.copy() + f0_mel = 1127 * np.log(1 + f0 / 700) + f0_mel[f0_mel > 0] = (f0_mel[f0_mel > 0] - f0_mel_min) * 254 / ( + f0_mel_max - f0_mel_min + ) + 1 + f0_mel[f0_mel <= 1] = 1 + f0_mel[f0_mel > 255] = 255 + f0_coarse = np.rint(f0_mel).astype(np.int32) + return f0_coarse, f0bak + + def get_f0(self, x, f0_up_key, n_cpu, method="harvest"): + n_cpu = int(n_cpu) + if method == "crepe": + return self.get_f0_crepe(x, f0_up_key) + if method == "rmvpe": + return self.get_f0_rmvpe(x, f0_up_key) + if method == "fcpe": + return self.get_f0_fcpe(x, f0_up_key) + if method == "pm": + p_len = x.shape[0] // 160 + 1 + f0_min = 65 + l_pad = int(np.ceil(1.5 / f0_min * 16000)) + r_pad = l_pad + 1 + s = parselmouth.Sound(np.pad(x, (l_pad, r_pad)), 16000).to_pitch_ac( + time_step=0.01, + voicing_threshold=0.6, + pitch_floor=f0_min, + pitch_ceiling=1100, + ) + assert np.abs(s.t1 - 1.5 / f0_min) < 0.001 + f0 = s.selected_array["frequency"] + if len(f0) < p_len: + f0 = np.pad(f0, (0, p_len - len(f0))) + f0 = f0[:p_len] + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + if n_cpu == 1: + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + f0 = signal.medfilt(f0, 3) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + 
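# [Editor's note, not part of the patch] The multi-process harvest path that
# follows splits the input into n_cpu chunks with a 320-sample (two-frame)
# overlap on each side, then drops the overlapping f0 frames when the pieces
# are stitched back together. The arithmetic, restated from the code below
# using this file's 160-sample hop (the 48000-sample length is illustrative):
#
#     length = len(x)                        # e.g. 48000 samples at 16 kHz
#     part_length = 160 * ((length // 160 - 1) // n_cpu + 1)
#     # worker 0 reads  x[: part_length + 320]                and keeps f0[:-3]
#     # worker i reads  x[part_length*i - 320 : part_length*(i+1) + 320]
#     #                                                       and keeps f0[2:-3]
#     # the last worker keeps f0[2:]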
f0bak = np.zeros(x.shape[0] // 160 + 1, dtype=np.float64) + length = len(x) + part_length = 160 * ((length // 160 - 1) // n_cpu + 1) + n_cpu = (length // 160 - 1) // (part_length // 160) + 1 + ts = ttime() + res_f0 = mm.dict() + for idx in range(n_cpu): + tail = part_length * (idx + 1) + 320 + if idx == 0: + self.inp_q.put((idx, x[:tail], res_f0, n_cpu, ts)) + else: + self.inp_q.put( + (idx, x[part_length * idx - 320: tail], res_f0, n_cpu, ts) + ) + while 1: + res_ts = self.opt_q.get() + if res_ts == ts: + break + f0s = [i[1] for i in sorted(res_f0.items(), key=lambda x: x[0])] + for idx, f0 in enumerate(f0s): + if idx == 0: + f0 = f0[:-3] + elif idx != n_cpu - 1: + f0 = f0[2:-3] + else: + f0 = f0[2:] + f0bak[ + part_length * idx // 160: part_length * idx // 160 + f0.shape[0] + ] = f0 + f0bak = signal.medfilt(f0bak, 3) + f0bak *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0bak) + + def get_f0_crepe(self, x, f0_up_key): + if "privateuseone" in str(self.device): ###不支持dml,cpu又太慢用不成,拿pm顶替 + return self.get_f0(x, f0_up_key, 1, "pm") + audio = torch.tensor(np.copy(x))[None].float() + # printt("using crepe,device:%s"%self.device) + f0, pd = torchcrepe.predict( + audio, + self.sr, + 160, + self.f0_min, + self.f0_max, + "full", + batch_size=512, + # device=self.device if self.device.type!="privateuseone" else "cpu",###crepe不用半精度全部是全精度所以不愁###cpu延迟高到没法用 + device=self.device, + return_periodicity=True, + ) + pd = torchcrepe.filter.median(pd, 3) + f0 = torchcrepe.filter.mean(f0, 3) + f0[pd < 0.1] = 0 + f0 = f0[0].cpu().numpy() + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_rmvpe(self, x, f0_up_key): + if hasattr(self, "model_rmvpe") == False: + from infer.lib.rmvpe import RMVPE + + printt("Loading rmvpe model") + self.model_rmvpe = RMVPE( + # "rmvpe.pt", is_half=self.is_half if self.device.type!="privateuseone" else False, device=self.device if self.device.type!="privateuseone"else "cpu"####dml时强制对rmvpe用cpu跑 + # "rmvpe.pt", is_half=False, device=self.device####dml配置 + # "rmvpe.pt", is_half=False, device="cpu"####锁定cpu配置 + "assets/rmvpe/rmvpe.pt", + is_half=self.is_half, + device=self.device, ####正常逻辑 + use_jit=self.config.use_jit, + ) + # self.model_rmvpe = RMVPE("aug2_58000_half.pt", is_half=self.is_half, device=self.device) + f0 = self.model_rmvpe.infer_from_audio(x, thred=0.03) + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def get_f0_fcpe(self, x, f0_up_key): + if hasattr(self, "model_fcpe") == False: + from torchfcpe import spawn_bundled_infer_model + printt("Loading fcpe model") + self.model_fcpe = spawn_bundled_infer_model(self.device) + f0 = self.model_fcpe.infer( + torch.from_numpy(x).to(self.device).unsqueeze(0).float(), + sr=16000, + decoder_mode='local_argmax', + threshold=0.006, + ).squeeze().cpu().numpy() + f0 *= pow(2, f0_up_key / 12) + return self.get_f0_post(f0) + + def infer( + self, + feats: torch.Tensor, + indata: np.ndarray, + block_frame_16k, + rate, + cache_pitch, + cache_pitchf, + f0method, + ) -> np.ndarray: + feats = feats.view(1, -1) + if self.config.is_half: + feats = feats.half() + else: + feats = feats.float() + feats = feats.to(self.device) + t1 = ttime() + with torch.no_grad(): + padding_mask = torch.BoolTensor(feats.shape).to(self.device).fill_(False) + inputs = { + "source": feats, + "padding_mask": padding_mask, + "output_layer": 9 if self.version == "v1" else 12, + } + logits = self.model.extract_features(**inputs) + feats = ( + self.model.final_proj(logits[0]) if self.version == "v1" else logits[0] + ) + feats = 
torch.cat((feats, feats[:, -1:, :]), 1) + t2 = ttime() + try: + if hasattr(self, "index") and self.index_rate != 0: + leng_replace_head = int(rate * feats[0].shape[0]) + npy = feats[0][-leng_replace_head:].cpu().numpy().astype("float32") + score, ix = self.index.search(npy, k=8) + weight = np.square(1 / score) + weight /= weight.sum(axis=1, keepdims=True) + npy = np.sum(self.big_npy[ix] * np.expand_dims(weight, axis=2), axis=1) + if self.config.is_half: + npy = npy.astype("float16") + feats[0][-leng_replace_head:] = ( + torch.from_numpy(npy).unsqueeze(0).to(self.device) * self.index_rate + + (1 - self.index_rate) * feats[0][-leng_replace_head:] + ) + else: + printt("Index search FAILED or disabled") + except: + traceback.print_exc() + printt("Index search FAILED") + feats = F.interpolate(feats.permute(0, 2, 1), scale_factor=2).permute(0, 2, 1) + t3 = ttime() + if self.if_f0 == 1: + pitch, pitchf = self.get_f0(indata, self.f0_up_key, self.n_cpu, f0method) + start_frame = block_frame_16k // 160 + end_frame = len(cache_pitch) - (pitch.shape[0] - 4) + start_frame + cache_pitch[:] = np.append(cache_pitch[start_frame:end_frame], pitch[3:-1]) + cache_pitchf[:] = np.append( + cache_pitchf[start_frame:end_frame], pitchf[3:-1] + ) + p_len = min(feats.shape[1], 13000, cache_pitch.shape[0]) + else: + cache_pitch, cache_pitchf = None, None + p_len = min(feats.shape[1], 13000) + t4 = ttime() + feats = feats[:, :p_len, :] + if self.if_f0 == 1: + cache_pitch = cache_pitch[:p_len] + cache_pitchf = cache_pitchf[:p_len] + cache_pitch = torch.LongTensor(cache_pitch).unsqueeze(0).to(self.device) + cache_pitchf = torch.FloatTensor(cache_pitchf).unsqueeze(0).to(self.device) + p_len = torch.LongTensor([p_len]).to(self.device) + ii = 0 # sid + sid = torch.LongTensor([ii]).to(self.device) + with torch.no_grad(): + if self.if_f0 == 1: + # printt(12222222222,feats.device,p_len.device,cache_pitch.device,cache_pitchf.device,sid.device,rate2) + infered_audio = self.net_g.infer( + feats, + p_len, + cache_pitch, + cache_pitchf, + sid, + torch.FloatTensor([rate]), + )[0][0, 0].data.float() + else: + infered_audio = self.net_g.infer( + feats, p_len, sid, torch.FloatTensor([rate]) + )[0][0, 0].data.float() + t5 = ttime() + printt( + "Spent time: fea = %.2fs, index = %.2fs, f0 = %.2fs, model = %.2fs", + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + ) + return infered_audio From 89746605601403334b95f487580cdbcda2efdd22 Mon Sep 17 00:00:00 2001 From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com> Date: Thu, 14 Dec 2023 21:08:36 +0800 Subject: [PATCH 3/4] add fcpe for realtime --- gui_v1.py | 1759 +++++++++++++++++++++++++++-------------------------- 1 file changed, 885 insertions(+), 874 deletions(-) diff --git a/gui_v1.py b/gui_v1.py index 3254892..7f4c640 100644 --- a/gui_v1.py +++ b/gui_v1.py @@ -1,874 +1,885 @@ -import os -import sys -from dotenv import load_dotenv - -load_dotenv() - -os.environ["OMP_NUM_THREADS"] = "4" -if sys.platform == "darwin": - os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" - -now_dir = os.getcwd() -sys.path.append(now_dir) -import multiprocessing - -stream_latency = -1 - - -def printt(strr, *args): - if len(args) == 0: - print(strr) - else: - print(strr % args) - - -class Harvest(multiprocessing.Process): - def __init__(self, inp_q, opt_q): - multiprocessing.Process.__init__(self) - self.inp_q = inp_q - self.opt_q = opt_q - - def run(self): - import numpy as np - import pyworld - - while 1: - idx, x, res_f0, n_cpu, ts = self.inp_q.get() - f0, t = pyworld.harvest( - x.astype(np.double), - 
fs=16000, - f0_ceil=1100, - f0_floor=50, - frame_period=10, - ) - res_f0[idx] = f0 - if len(res_f0.keys()) >= n_cpu: - self.opt_q.put(ts) - - -if __name__ == "__main__": - import json - import multiprocessing - import re - import threading - import time - import traceback - from multiprocessing import Queue, cpu_count - from queue import Empty - - import librosa - from tools.torchgate import TorchGate - import numpy as np - import PySimpleGUI as sg - import sounddevice as sd - import torch - import torch.nn.functional as F - import torchaudio.transforms as tat - - import tools.rvc_for_realtime as rvc_for_realtime - from i18n.i18n import I18nAuto - from configs.config import Config - - i18n = I18nAuto() - - # device = rvc_for_realtime.config.device - # device = torch.device( - # "cuda" - # if torch.cuda.is_available() - # else ("mps" if torch.backends.mps.is_available() else "cpu") - # ) - current_dir = os.getcwd() - inp_q = Queue() - opt_q = Queue() - n_cpu = min(cpu_count(), 8) - for _ in range(n_cpu): - Harvest(inp_q, opt_q).start() - - class GUIConfig: - def __init__(self) -> None: - self.pth_path: str = "" - self.index_path: str = "" - self.pitch: int = 0 - self.samplerate: int = 40000 - self.block_time: float = 1.0 # s - self.buffer_num: int = 1 - self.threhold: int = -60 - self.crossfade_time: float = 0.05 - self.extra_time: float = 2.5 - self.I_noise_reduce = False - self.O_noise_reduce = False - self.rms_mix_rate = 0.0 - self.index_rate = 0.3 - self.n_cpu = min(n_cpu, 6) - self.f0method = "harvest" - self.sg_input_device = "" - self.sg_output_device = "" - - class GUI: - def __init__(self) -> None: - self.gui_config = GUIConfig() - self.config = Config() - self.flag_vc = False - self.function = "vc" - self.delay_time = 0 - self.launcher() - - def load(self): - input_devices, output_devices, _, _ = self.get_devices() - try: - with open("configs/config.json", "r") as j: - data = json.load(j) - data["pm"] = data["f0method"] == "pm" - data["harvest"] = data["f0method"] == "harvest" - data["crepe"] = data["f0method"] == "crepe" - data["rmvpe"] = data["f0method"] == "rmvpe" - if data["sg_input_device"] not in input_devices: - data["sg_input_device"] = input_devices[sd.default.device[0]] - if data["sg_output_device"] not in output_devices: - data["sg_output_device"] = output_devices[sd.default.device[1]] - except: - with open("configs/config.json", "w") as j: - data = { - "pth_path": " ", - "index_path": " ", - "sg_input_device": input_devices[sd.default.device[0]], - "sg_output_device": output_devices[sd.default.device[1]], - "threhold": "-60", - "pitch": "0", - "index_rate": "0", - "rms_mix_rate": "0", - "block_time": "0.25", - "crossfade_length": "0.05", - "extra_time": "2.5", - "f0method": "rmvpe", - "use_jit": False, - } - data["pm"] = data["f0method"] == "pm" - data["harvest"] = data["f0method"] == "harvest" - data["crepe"] = data["f0method"] == "crepe" - data["rmvpe"] = data["f0method"] == "rmvpe" - return data - - def launcher(self): - data = self.load() - self.config.use_jit = False # data.get("use_jit", self.config.use_jit) - sg.theme("LightBlue3") - input_devices, output_devices, _, _ = self.get_devices() - layout = [ - [ - sg.Frame( - title=i18n("加载模型"), - layout=[ - [ - sg.Input( - default_text=data.get("pth_path", ""), - key="pth_path", - ), - sg.FileBrowse( - i18n("选择.pth文件"), - initial_folder=os.path.join( - os.getcwd(), "assets/weights" - ), - file_types=((". 
pth"),), - ), - ], - [ - sg.Input( - default_text=data.get("index_path", ""), - key="index_path", - ), - sg.FileBrowse( - i18n("选择.index文件"), - initial_folder=os.path.join(os.getcwd(), "logs"), - file_types=((". index"),), - ), - ], - ], - ) - ], - [ - sg.Frame( - layout=[ - [ - sg.Text(i18n("输入设备")), - sg.Combo( - input_devices, - key="sg_input_device", - default_value=data.get("sg_input_device", ""), - ), - ], - [ - sg.Text(i18n("输出设备")), - sg.Combo( - output_devices, - key="sg_output_device", - default_value=data.get("sg_output_device", ""), - ), - ], - [sg.Button(i18n("重载设备列表"), key="reload_devices")], - ], - title=i18n("音频设备(请使用同种类驱动)"), - ) - ], - [ - sg.Frame( - layout=[ - [ - sg.Text(i18n("响应阈值")), - sg.Slider( - range=(-60, 0), - key="threhold", - resolution=1, - orientation="h", - default_value=data.get("threhold", "-60"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("音调设置")), - sg.Slider( - range=(-24, 24), - key="pitch", - resolution=1, - orientation="h", - default_value=data.get("pitch", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("Index Rate")), - sg.Slider( - range=(0.0, 1.0), - key="index_rate", - resolution=0.01, - orientation="h", - default_value=data.get("index_rate", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("响度因子")), - sg.Slider( - range=(0.0, 1.0), - key="rms_mix_rate", - resolution=0.01, - orientation="h", - default_value=data.get("rms_mix_rate", "0"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("音高算法")), - sg.Radio( - "pm", - "f0method", - key="pm", - default=data.get("pm", "") == True, - enable_events=True, - ), - sg.Radio( - "harvest", - "f0method", - key="harvest", - default=data.get("harvest", "") == True, - enable_events=True, - ), - sg.Radio( - "crepe", - "f0method", - key="crepe", - default=data.get("crepe", "") == True, - enable_events=True, - ), - sg.Radio( - "rmvpe", - "f0method", - key="rmvpe", - default=data.get("rmvpe", "") == True, - enable_events=True, - ), - ], - ], - title=i18n("常规设置"), - ), - sg.Frame( - layout=[ - [ - sg.Text(i18n("采样长度")), - sg.Slider( - range=(0.05, 2.4), - key="block_time", - resolution=0.01, - orientation="h", - default_value=data.get("block_time", "0.25"), - enable_events=True, - ), - ], - # [ - # sg.Text("设备延迟"), - # sg.Slider( - # range=(0, 1), - # key="device_latency", - # resolution=0.001, - # orientation="h", - # default_value=data.get("device_latency", "0.1"), - # enable_events=True, - # ), - # ], - [ - sg.Text(i18n("harvest进程数")), - sg.Slider( - range=(1, n_cpu), - key="n_cpu", - resolution=1, - orientation="h", - default_value=data.get( - "n_cpu", min(self.gui_config.n_cpu, n_cpu) - ), - enable_events=True, - ), - ], - [ - sg.Text(i18n("淡入淡出长度")), - sg.Slider( - range=(0.01, 0.15), - key="crossfade_length", - resolution=0.01, - orientation="h", - default_value=data.get("crossfade_length", "0.05"), - enable_events=True, - ), - ], - [ - sg.Text(i18n("额外推理时长")), - sg.Slider( - range=(0.05, 5.00), - key="extra_time", - resolution=0.01, - orientation="h", - default_value=data.get("extra_time", "2.5"), - enable_events=True, - ), - ], - [ - sg.Checkbox( - i18n("输入降噪"), - key="I_noise_reduce", - enable_events=True, - ), - sg.Checkbox( - i18n("输出降噪"), - key="O_noise_reduce", - enable_events=True, - ), - # sg.Checkbox( - # "JIT加速", - # default=self.config.use_jit, - # key="use_jit", - # enable_events=False, - # ), - ], - # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], - ], - title=i18n("性能设置"), - ), - ], - [ - sg.Button(i18n("开始音频转换"), key="start_vc"), - sg.Button(i18n("停止音频转换"), 
key="stop_vc"), - sg.Radio( - i18n("输入监听"), - "function", - key="im", - default=False, - enable_events=True, - ), - sg.Radio( - i18n("输出变声"), - "function", - key="vc", - default=True, - enable_events=True, - ), - sg.Text(i18n("算法延迟(ms):")), - sg.Text("0", key="delay_time"), - sg.Text(i18n("推理时间(ms):")), - sg.Text("0", key="infer_time"), - ], - ] - self.window = sg.Window("RVC - GUI", layout=layout, finalize=True) - self.event_handler() - - def event_handler(self): - while True: - event, values = self.window.read() - if event == sg.WINDOW_CLOSED: - self.flag_vc = False - exit() - if event == "reload_devices": - prev_input = self.window["sg_input_device"].get() - prev_output = self.window["sg_output_device"].get() - input_devices, output_devices, _, _ = self.get_devices(update=True) - if prev_input not in input_devices: - self.gui_config.sg_input_device = input_devices[0] - else: - self.gui_config.sg_input_device = prev_input - self.window["sg_input_device"].Update(values=input_devices) - self.window["sg_input_device"].Update( - value=self.gui_config.sg_input_device - ) - if prev_output not in output_devices: - self.gui_config.sg_output_device = output_devices[0] - else: - self.gui_config.sg_output_device = prev_output - self.window["sg_output_device"].Update(values=output_devices) - self.window["sg_output_device"].Update( - value=self.gui_config.sg_output_device - ) - if event == "start_vc" and self.flag_vc == False: - if self.set_values(values) == True: - printt("cuda_is_available: %s", torch.cuda.is_available()) - self.start_vc() - settings = { - "pth_path": values["pth_path"], - "index_path": values["index_path"], - "sg_input_device": values["sg_input_device"], - "sg_output_device": values["sg_output_device"], - "threhold": values["threhold"], - "pitch": values["pitch"], - "rms_mix_rate": values["rms_mix_rate"], - "index_rate": values["index_rate"], - # "device_latency": values["device_latency"], - "block_time": values["block_time"], - "crossfade_length": values["crossfade_length"], - "extra_time": values["extra_time"], - "n_cpu": values["n_cpu"], - # "use_jit": values["use_jit"], - "use_jit": False, - "f0method": ["pm", "harvest", "crepe", "rmvpe"][ - [ - values["pm"], - values["harvest"], - values["crepe"], - values["rmvpe"], - ].index(True) - ], - } - with open("configs/config.json", "w") as j: - json.dump(settings, j) - global stream_latency - while stream_latency < 0: - time.sleep(0.01) - self.delay_time = ( - stream_latency - + values["block_time"] - + values["crossfade_length"] - + 0.01 - ) - if values["I_noise_reduce"]: - self.delay_time += values["crossfade_length"] - self.window["delay_time"].update(int(self.delay_time * 1000)) - if event == "stop_vc" and self.flag_vc == True: - self.flag_vc = False - stream_latency = -1 - # Parameter hot update - if event == "threhold": - self.gui_config.threhold = values["threhold"] - elif event == "pitch": - self.gui_config.pitch = values["pitch"] - if hasattr(self, "rvc"): - self.rvc.change_key(values["pitch"]) - elif event == "index_rate": - self.gui_config.index_rate = values["index_rate"] - if hasattr(self, "rvc"): - self.rvc.change_index_rate(values["index_rate"]) - elif event == "rms_mix_rate": - self.gui_config.rms_mix_rate = values["rms_mix_rate"] - elif event in ["pm", "harvest", "crepe", "rmvpe"]: - self.gui_config.f0method = event - elif event == "I_noise_reduce": - self.gui_config.I_noise_reduce = values["I_noise_reduce"] - if stream_latency > 0: - self.delay_time += ( - 1 if values["I_noise_reduce"] else -1 - ) * 
values["crossfade_length"] - self.window["delay_time"].update(int(self.delay_time * 1000)) - elif event == "O_noise_reduce": - self.gui_config.O_noise_reduce = values["O_noise_reduce"] - elif event in ["vc", "im"]: - self.function = event - elif event != "start_vc" and self.flag_vc == True: - # Other parameters do not support hot update - self.flag_vc = False - stream_latency = -1 - - def set_values(self, values): - if len(values["pth_path"].strip()) == 0: - sg.popup(i18n("请选择pth文件")) - return False - if len(values["index_path"].strip()) == 0: - sg.popup(i18n("请选择index文件")) - return False - pattern = re.compile("[^\x00-\x7F]+") - if pattern.findall(values["pth_path"]): - sg.popup(i18n("pth文件路径不可包含中文")) - return False - if pattern.findall(values["index_path"]): - sg.popup(i18n("index文件路径不可包含中文")) - return False - self.set_devices(values["sg_input_device"], values["sg_output_device"]) - self.config.use_jit = False # values["use_jit"] - # self.device_latency = values["device_latency"] - self.gui_config.pth_path = values["pth_path"] - self.gui_config.index_path = values["index_path"] - self.gui_config.threhold = values["threhold"] - self.gui_config.pitch = values["pitch"] - self.gui_config.block_time = values["block_time"] - self.gui_config.crossfade_time = values["crossfade_length"] - self.gui_config.extra_time = values["extra_time"] - self.gui_config.I_noise_reduce = values["I_noise_reduce"] - self.gui_config.O_noise_reduce = values["O_noise_reduce"] - self.gui_config.rms_mix_rate = values["rms_mix_rate"] - self.gui_config.index_rate = values["index_rate"] - self.gui_config.n_cpu = values["n_cpu"] - self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe"][ - [ - values["pm"], - values["harvest"], - values["crepe"], - values["rmvpe"], - ].index(True) - ] - return True - - def start_vc(self): - torch.cuda.empty_cache() - self.flag_vc = True - self.rvc = rvc_for_realtime.RVC( - self.gui_config.pitch, - self.gui_config.pth_path, - self.gui_config.index_path, - self.gui_config.index_rate, - self.gui_config.n_cpu, - inp_q, - opt_q, - self.config, - self.rvc if hasattr(self, "rvc") else None, - ) - self.gui_config.samplerate = self.rvc.tgt_sr - self.zc = self.rvc.tgt_sr // 100 - self.block_frame = ( - int( - np.round( - self.gui_config.block_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.block_frame_16k = 160 * self.block_frame // self.zc - self.crossfade_frame = ( - int( - np.round( - self.gui_config.crossfade_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.sola_search_frame = self.zc - self.extra_frame = ( - int( - np.round( - self.gui_config.extra_time - * self.gui_config.samplerate - / self.zc - ) - ) - * self.zc - ) - self.input_wav: torch.Tensor = torch.zeros( - self.extra_frame - + self.crossfade_frame - + self.sola_search_frame - + self.block_frame, - device=self.config.device, - dtype=torch.float32, - ) - self.input_wav_res: torch.Tensor = torch.zeros( - 160 * self.input_wav.shape[0] // self.zc, - device=self.config.device, - dtype=torch.float32, - ) - self.pitch: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="int32", - ) - self.pitchf: np.ndarray = np.zeros( - self.input_wav.shape[0] // self.zc, - dtype="float64", - ) - self.sola_buffer: torch.Tensor = torch.zeros( - self.crossfade_frame, device=self.config.device, dtype=torch.float32 - ) - self.nr_buffer: torch.Tensor = self.sola_buffer.clone() - self.output_buffer: torch.Tensor = self.input_wav.clone() - self.res_buffer: torch.Tensor = 
torch.zeros( - 2 * self.zc, device=self.config.device, dtype=torch.float32 - ) - self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0] - self.fade_in_window: torch.Tensor = ( - torch.sin( - 0.5 - * np.pi - * torch.linspace( - 0.0, - 1.0, - steps=self.crossfade_frame, - device=self.config.device, - dtype=torch.float32, - ) - ) - ** 2 - ) - self.fade_out_window: torch.Tensor = 1 - self.fade_in_window - self.resampler = tat.Resample( - orig_freq=self.gui_config.samplerate, - new_freq=16000, - dtype=torch.float32, - ).to(self.config.device) - self.tg = TorchGate( - sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9 - ).to(self.config.device) - thread_vc = threading.Thread(target=self.soundinput) - thread_vc.start() - - def soundinput(self): - """ - 接受音频输入 - """ - channels = 1 if sys.platform == "darwin" else 2 - with sd.Stream( - channels=channels, - callback=self.audio_callback, - blocksize=self.block_frame, - samplerate=self.gui_config.samplerate, - dtype="float32", - ) as stream: - global stream_latency - stream_latency = stream.latency[-1] - while self.flag_vc: - time.sleep(self.gui_config.block_time) - printt("Audio block passed.") - printt("ENDing VC") - - def audio_callback( - self, indata: np.ndarray, outdata: np.ndarray, frames, times, status - ): - """ - 音频处理 - """ - start_time = time.perf_counter() - indata = librosa.to_mono(indata.T) - if self.gui_config.threhold > -60: - rms = librosa.feature.rms( - y=indata, frame_length=4 * self.zc, hop_length=self.zc - ) - db_threhold = ( - librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold - ) - for i in range(db_threhold.shape[0]): - if db_threhold[i]: - indata[i * self.zc : (i + 1) * self.zc] = 0 - self.input_wav[: -self.block_frame] = self.input_wav[ - self.block_frame : - ].clone() - self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to( - self.config.device - ) - self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[ - self.block_frame_16k : - ].clone() - # input noise reduction and resampling - if self.gui_config.I_noise_reduce and self.function == "vc": - input_wav = self.input_wav[ - -self.crossfade_frame - self.block_frame - 2 * self.zc : - ] - input_wav = self.tg( - input_wav.unsqueeze(0), self.input_wav.unsqueeze(0) - )[0, 2 * self.zc :] - input_wav[: self.crossfade_frame] *= self.fade_in_window - input_wav[: self.crossfade_frame] += ( - self.nr_buffer * self.fade_out_window - ) - self.nr_buffer[:] = input_wav[-self.crossfade_frame :] - input_wav = torch.cat( - (self.res_buffer[:], input_wav[: self.block_frame]) - ) - self.res_buffer[:] = input_wav[-2 * self.zc :] - self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( - input_wav - )[160:] - else: - self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler( - self.input_wav[-self.block_frame - 2 * self.zc :] - )[160:] - # infer - if self.function == "vc": - f0_extractor_frame = self.block_frame_16k + 800 - if self.gui_config.f0method == "rmvpe": - f0_extractor_frame = ( - 5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160 - ) - infer_wav = self.rvc.infer( - self.input_wav_res, - self.input_wav_res[-f0_extractor_frame:].cpu().numpy(), - self.block_frame_16k, - self.valid_rate, - self.pitch, - self.pitchf, - self.gui_config.f0method, - ) - infer_wav = infer_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ] - else: - infer_wav = self.input_wav[ - -self.crossfade_frame - self.sola_search_frame - self.block_frame : - ].clone() - # output noise reduction - if 
(self.gui_config.O_noise_reduce and self.function == "vc") or ( - self.gui_config.I_noise_reduce and self.function == "im" - ): - self.output_buffer[: -self.block_frame] = self.output_buffer[ - self.block_frame : - ].clone() - self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :] - infer_wav = self.tg( - infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0) - ).squeeze(0) - # volume envelop mixing - if self.gui_config.rms_mix_rate < 1 and self.function == "vc": - rms1 = librosa.feature.rms( - y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :] - .cpu() - .numpy(), - frame_length=640, - hop_length=160, - ) - rms1 = torch.from_numpy(rms1).to(self.config.device) - rms1 = F.interpolate( - rms1.unsqueeze(0), - size=infer_wav.shape[0] + 1, - mode="linear", - align_corners=True, - )[0, 0, :-1] - rms2 = librosa.feature.rms( - y=infer_wav[:].cpu().numpy(), - frame_length=4 * self.zc, - hop_length=self.zc, - ) - rms2 = torch.from_numpy(rms2).to(self.config.device) - rms2 = F.interpolate( - rms2.unsqueeze(0), - size=infer_wav.shape[0] + 1, - mode="linear", - align_corners=True, - )[0, 0, :-1] - rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3) - infer_wav *= torch.pow( - rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate) - ) - # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC - conv_input = infer_wav[ - None, None, : self.crossfade_frame + self.sola_search_frame - ] - cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :]) - cor_den = torch.sqrt( - F.conv1d( - conv_input**2, - torch.ones(1, 1, self.crossfade_frame, device=self.config.device), - ) - + 1e-8 - ) - if sys.platform == "darwin": - _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0]) - sola_offset = sola_offset.item() - else: - sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0]) - printt("sola_offset = %d", int(sola_offset)) - infer_wav = infer_wav[ - sola_offset : sola_offset + self.block_frame + self.crossfade_frame - ] - infer_wav[: self.crossfade_frame] *= self.fade_in_window - infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window - self.sola_buffer[:] = infer_wav[-self.crossfade_frame :] - if sys.platform == "darwin": - outdata[:] = ( - infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis] - ) - else: - outdata[:] = ( - infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy() - ) - total_time = time.perf_counter() - start_time - self.window["infer_time"].update(int(total_time * 1000)) - printt("Infer time: %.2f", total_time) - - def get_devices(self, update: bool = True): - """获取设备列表""" - if update: - sd._terminate() - sd._initialize() - devices = sd.query_devices() - hostapis = sd.query_hostapis() - for hostapi in hostapis: - for device_idx in hostapi["devices"]: - devices[device_idx]["hostapi_name"] = hostapi["name"] - input_devices = [ - f"{d['name']} ({d['hostapi_name']})" - for d in devices - if d["max_input_channels"] > 0 - ] - output_devices = [ - f"{d['name']} ({d['hostapi_name']})" - for d in devices - if d["max_output_channels"] > 0 - ] - input_devices_indices = [ - d["index"] if "index" in d else d["name"] - for d in devices - if d["max_input_channels"] > 0 - ] - output_devices_indices = [ - d["index"] if "index" in d else d["name"] - for d in devices - if d["max_output_channels"] > 0 - ] - return ( - input_devices, - output_devices, - input_devices_indices, - output_devices_indices, - ) - - def set_devices(self, input_device, output_device): - """设置输出设备""" - ( - input_devices, - output_devices, - 
input_device_indices, - output_device_indices, - ) = self.get_devices() - sd.default.device[0] = input_device_indices[ - input_devices.index(input_device) - ] - sd.default.device[1] = output_device_indices[ - output_devices.index(output_device) - ] - printt("Input device: %s:%s", str(sd.default.device[0]), input_device) - printt("Output device: %s:%s", str(sd.default.device[1]), output_device) - - gui = GUI() +import os +import sys +from dotenv import load_dotenv + +load_dotenv() + +os.environ["OMP_NUM_THREADS"] = "4" +if sys.platform == "darwin": + os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +now_dir = os.getcwd() +sys.path.append(now_dir) +import multiprocessing + +stream_latency = -1 + + +def printt(strr, *args): + if len(args) == 0: + print(strr) + else: + print(strr % args) + + +class Harvest(multiprocessing.Process): + def __init__(self, inp_q, opt_q): + multiprocessing.Process.__init__(self) + self.inp_q = inp_q + self.opt_q = opt_q + + def run(self): + import numpy as np + import pyworld + + while 1: + idx, x, res_f0, n_cpu, ts = self.inp_q.get() + f0, t = pyworld.harvest( + x.astype(np.double), + fs=16000, + f0_ceil=1100, + f0_floor=50, + frame_period=10, + ) + res_f0[idx] = f0 + if len(res_f0.keys()) >= n_cpu: + self.opt_q.put(ts) + + +if __name__ == "__main__": + import json + import multiprocessing + import re + import threading + import time + import traceback + from multiprocessing import Queue, cpu_count + from queue import Empty + + import librosa + from tools.torchgate import TorchGate + import numpy as np + import PySimpleGUI as sg + import sounddevice as sd + import torch + import torch.nn.functional as F + import torchaudio.transforms as tat + + import tools.rvc_for_realtime as rvc_for_realtime + from i18n.i18n import I18nAuto + from configs.config import Config + + i18n = I18nAuto() + + # device = rvc_for_realtime.config.device + # device = torch.device( + # "cuda" + # if torch.cuda.is_available() + # else ("mps" if torch.backends.mps.is_available() else "cpu") + # ) + current_dir = os.getcwd() + inp_q = Queue() + opt_q = Queue() + n_cpu = min(cpu_count(), 8) + for _ in range(n_cpu): + Harvest(inp_q, opt_q).start() + + class GUIConfig: + def __init__(self) -> None: + self.pth_path: str = "" + self.index_path: str = "" + self.pitch: int = 0 + self.samplerate: int = 40000 + self.block_time: float = 1.0 # s + self.buffer_num: int = 1 + self.threhold: int = -60 + self.crossfade_time: float = 0.05 + self.extra_time: float = 2.5 + self.I_noise_reduce = False + self.O_noise_reduce = False + self.rms_mix_rate = 0.0 + self.index_rate = 0.3 + self.n_cpu = min(n_cpu, 6) + self.f0method = "harvest" + self.sg_input_device = "" + self.sg_output_device = "" + + class GUI: + def __init__(self) -> None: + self.gui_config = GUIConfig() + self.config = Config() + self.flag_vc = False + self.function = "vc" + self.delay_time = 0 + self.launcher() + + def load(self): + input_devices, output_devices, _, _ = self.get_devices() + try: + with open("configs/config.json", "r") as j: + data = json.load(j) + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + if data["sg_input_device"] not in input_devices: + data["sg_input_device"] = input_devices[sd.default.device[0]] + if data["sg_output_device"] not in output_devices: + data["sg_output_device"] = output_devices[sd.default.device[1]] + except: + with 
open("configs/config.json", "w") as j: + data = { + "pth_path": " ", + "index_path": " ", + "sg_input_device": input_devices[sd.default.device[0]], + "sg_output_device": output_devices[sd.default.device[1]], + "threhold": "-60", + "pitch": "0", + "index_rate": "0", + "rms_mix_rate": "0", + "block_time": "0.25", + "crossfade_length": "0.05", + "extra_time": "2.5", + "f0method": "rmvpe", + "use_jit": False, + } + data["pm"] = data["f0method"] == "pm" + data["harvest"] = data["f0method"] == "harvest" + data["crepe"] = data["f0method"] == "crepe" + data["rmvpe"] = data["f0method"] == "rmvpe" + data["fcpe"] = data["f0method"] == "fcpe" + return data + + def launcher(self): + data = self.load() + self.config.use_jit = False # data.get("use_jit", self.config.use_jit) + sg.theme("LightBlue3") + input_devices, output_devices, _, _ = self.get_devices() + layout = [ + [ + sg.Frame( + title=i18n("加载模型"), + layout=[ + [ + sg.Input( + default_text=data.get("pth_path", ""), + key="pth_path", + ), + sg.FileBrowse( + i18n("选择.pth文件"), + initial_folder=os.path.join( + os.getcwd(), "assets/weights" + ), + file_types=((". pth"),), + ), + ], + [ + sg.Input( + default_text=data.get("index_path", ""), + key="index_path", + ), + sg.FileBrowse( + i18n("选择.index文件"), + initial_folder=os.path.join(os.getcwd(), "logs"), + file_types=((". index"),), + ), + ], + ], + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("输入设备")), + sg.Combo( + input_devices, + key="sg_input_device", + default_value=data.get("sg_input_device", ""), + ), + ], + [ + sg.Text(i18n("输出设备")), + sg.Combo( + output_devices, + key="sg_output_device", + default_value=data.get("sg_output_device", ""), + ), + ], + [sg.Button(i18n("重载设备列表"), key="reload_devices")], + ], + title=i18n("音频设备(请使用同种类驱动)"), + ) + ], + [ + sg.Frame( + layout=[ + [ + sg.Text(i18n("响应阈值")), + sg.Slider( + range=(-60, 0), + key="threhold", + resolution=1, + orientation="h", + default_value=data.get("threhold", "-60"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音调设置")), + sg.Slider( + range=(-24, 24), + key="pitch", + resolution=1, + orientation="h", + default_value=data.get("pitch", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("Index Rate")), + sg.Slider( + range=(0.0, 1.0), + key="index_rate", + resolution=0.01, + orientation="h", + default_value=data.get("index_rate", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("响度因子")), + sg.Slider( + range=(0.0, 1.0), + key="rms_mix_rate", + resolution=0.01, + orientation="h", + default_value=data.get("rms_mix_rate", "0"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("音高算法")), + sg.Radio( + "pm", + "f0method", + key="pm", + default=data.get("pm", "") == True, + enable_events=True, + ), + sg.Radio( + "harvest", + "f0method", + key="harvest", + default=data.get("harvest", "") == True, + enable_events=True, + ), + sg.Radio( + "crepe", + "f0method", + key="crepe", + default=data.get("crepe", "") == True, + enable_events=True, + ), + sg.Radio( + "rmvpe", + "f0method", + key="rmvpe", + default=data.get("rmvpe", "") == True, + enable_events=True, + ), + sg.Radio( + "fcpe", + "f0method", + key="fcpe", + default=data.get("fcpe", "") == True, + enable_events=True, + ), + ], + ], + title=i18n("常规设置"), + ), + sg.Frame( + layout=[ + [ + sg.Text(i18n("采样长度")), + sg.Slider( + range=(0.05, 2.4), + key="block_time", + resolution=0.01, + orientation="h", + default_value=data.get("block_time", "0.25"), + enable_events=True, + ), + ], + # [ + # sg.Text("设备延迟"), + # sg.Slider( + # range=(0, 1), + # key="device_latency", 
+ # resolution=0.001, + # orientation="h", + # default_value=data.get("device_latency", "0.1"), + # enable_events=True, + # ), + # ], + [ + sg.Text(i18n("harvest进程数")), + sg.Slider( + range=(1, n_cpu), + key="n_cpu", + resolution=1, + orientation="h", + default_value=data.get( + "n_cpu", min(self.gui_config.n_cpu, n_cpu) + ), + enable_events=True, + ), + ], + [ + sg.Text(i18n("淡入淡出长度")), + sg.Slider( + range=(0.01, 0.15), + key="crossfade_length", + resolution=0.01, + orientation="h", + default_value=data.get("crossfade_length", "0.05"), + enable_events=True, + ), + ], + [ + sg.Text(i18n("额外推理时长")), + sg.Slider( + range=(0.05, 5.00), + key="extra_time", + resolution=0.01, + orientation="h", + default_value=data.get("extra_time", "2.5"), + enable_events=True, + ), + ], + [ + sg.Checkbox( + i18n("输入降噪"), + key="I_noise_reduce", + enable_events=True, + ), + sg.Checkbox( + i18n("输出降噪"), + key="O_noise_reduce", + enable_events=True, + ), + # sg.Checkbox( + # "JIT加速", + # default=self.config.use_jit, + # key="use_jit", + # enable_events=False, + # ), + ], + # [sg.Text("注:首次使用JIT加速时,会出现卡顿,\n 并伴随一些噪音,但这是正常现象!")], + ], + title=i18n("性能设置"), + ), + ], + [ + sg.Button(i18n("开始音频转换"), key="start_vc"), + sg.Button(i18n("停止音频转换"), key="stop_vc"), + sg.Radio( + i18n("输入监听"), + "function", + key="im", + default=False, + enable_events=True, + ), + sg.Radio( + i18n("输出变声"), + "function", + key="vc", + default=True, + enable_events=True, + ), + sg.Text(i18n("算法延迟(ms):")), + sg.Text("0", key="delay_time"), + sg.Text(i18n("推理时间(ms):")), + sg.Text("0", key="infer_time"), + ], + ] + self.window = sg.Window("RVC - GUI", layout=layout, finalize=True) + self.event_handler() + + def event_handler(self): + while True: + event, values = self.window.read() + if event == sg.WINDOW_CLOSED: + self.flag_vc = False + exit() + if event == "reload_devices": + prev_input = self.window["sg_input_device"].get() + prev_output = self.window["sg_output_device"].get() + input_devices, output_devices, _, _ = self.get_devices(update=True) + if prev_input not in input_devices: + self.gui_config.sg_input_device = input_devices[0] + else: + self.gui_config.sg_input_device = prev_input + self.window["sg_input_device"].Update(values=input_devices) + self.window["sg_input_device"].Update( + value=self.gui_config.sg_input_device + ) + if prev_output not in output_devices: + self.gui_config.sg_output_device = output_devices[0] + else: + self.gui_config.sg_output_device = prev_output + self.window["sg_output_device"].Update(values=output_devices) + self.window["sg_output_device"].Update( + value=self.gui_config.sg_output_device + ) + if event == "start_vc" and self.flag_vc == False: + if self.set_values(values) == True: + printt("cuda_is_available: %s", torch.cuda.is_available()) + self.start_vc() + settings = { + "pth_path": values["pth_path"], + "index_path": values["index_path"], + "sg_input_device": values["sg_input_device"], + "sg_output_device": values["sg_output_device"], + "threhold": values["threhold"], + "pitch": values["pitch"], + "rms_mix_rate": values["rms_mix_rate"], + "index_rate": values["index_rate"], + # "device_latency": values["device_latency"], + "block_time": values["block_time"], + "crossfade_length": values["crossfade_length"], + "extra_time": values["extra_time"], + "n_cpu": values["n_cpu"], + # "use_jit": values["use_jit"], + "use_jit": False, + "f0method": ["pm", "harvest", "crepe", "rmvpe", "fcpe"][ + [ + values["pm"], + values["harvest"], + values["crepe"], + values["rmvpe"], + values["fcpe"], + ].index(True) + ], 
+                    }
+                    with open("configs/config.json", "w") as j:
+                        json.dump(settings, j)
+                    global stream_latency
+                    while stream_latency < 0:
+                        time.sleep(0.01)
+                    self.delay_time = (
+                        stream_latency
+                        + values["block_time"]
+                        + values["crossfade_length"]
+                        + 0.01
+                    )
+                    if values["I_noise_reduce"]:
+                        self.delay_time += values["crossfade_length"]
+                    self.window["delay_time"].update(int(self.delay_time * 1000))
+            if event == "stop_vc" and self.flag_vc:
+                self.flag_vc = False
+                stream_latency = -1
+            # Parameter hot update
+            if event == "threhold":
+                self.gui_config.threhold = values["threhold"]
+            elif event == "pitch":
+                self.gui_config.pitch = values["pitch"]
+                if hasattr(self, "rvc"):
+                    self.rvc.change_key(values["pitch"])
+            elif event == "index_rate":
+                self.gui_config.index_rate = values["index_rate"]
+                if hasattr(self, "rvc"):
+                    self.rvc.change_index_rate(values["index_rate"])
+            elif event == "rms_mix_rate":
+                self.gui_config.rms_mix_rate = values["rms_mix_rate"]
+            elif event in ["pm", "harvest", "crepe", "rmvpe", "fcpe"]:
+                self.gui_config.f0method = event
+            elif event == "I_noise_reduce":
+                self.gui_config.I_noise_reduce = values["I_noise_reduce"]
+                if stream_latency > 0:
+                    self.delay_time += (
+                        1 if values["I_noise_reduce"] else -1
+                    ) * values["crossfade_length"]
+                    self.window["delay_time"].update(int(self.delay_time * 1000))
+            elif event == "O_noise_reduce":
+                self.gui_config.O_noise_reduce = values["O_noise_reduce"]
+            elif event in ["vc", "im"]:
+                self.function = event
+            elif event != "start_vc" and self.flag_vc:
+                # Other parameters do not support hot update
+                self.flag_vc = False
+                stream_latency = -1
+
+    def set_values(self, values):
+        if len(values["pth_path"].strip()) == 0:
+            sg.popup(i18n("请选择pth文件"))
+            return False
+        if len(values["index_path"].strip()) == 0:
+            sg.popup(i18n("请选择index文件"))
+            return False
+        pattern = re.compile("[^\x00-\x7F]+")
+        if pattern.findall(values["pth_path"]):
+            sg.popup(i18n("pth文件路径不可包含中文"))
+            return False
+        if pattern.findall(values["index_path"]):
+            sg.popup(i18n("index文件路径不可包含中文"))
+            return False
+        self.set_devices(values["sg_input_device"], values["sg_output_device"])
+        self.config.use_jit = False  # values["use_jit"]
+        # self.device_latency = values["device_latency"]
+        self.gui_config.pth_path = values["pth_path"]
+        self.gui_config.index_path = values["index_path"]
+        self.gui_config.threhold = values["threhold"]
+        self.gui_config.pitch = values["pitch"]
+        self.gui_config.block_time = values["block_time"]
+        self.gui_config.crossfade_time = values["crossfade_length"]
+        self.gui_config.extra_time = values["extra_time"]
+        self.gui_config.I_noise_reduce = values["I_noise_reduce"]
+        self.gui_config.O_noise_reduce = values["O_noise_reduce"]
+        self.gui_config.rms_mix_rate = values["rms_mix_rate"]
+        self.gui_config.index_rate = values["index_rate"]
+        self.gui_config.n_cpu = values["n_cpu"]
+        self.gui_config.f0method = ["pm", "harvest", "crepe", "rmvpe", "fcpe"][
+            [
+                values["pm"],
+                values["harvest"],
+                values["crepe"],
+                values["rmvpe"],
+                values["fcpe"],
+            ].index(True)
+        ]
+        return True
+
+    def start_vc(self):
+        torch.cuda.empty_cache()
+        self.flag_vc = True
+        self.rvc = rvc_for_realtime.RVC(
+            self.gui_config.pitch,
+            self.gui_config.pth_path,
+            self.gui_config.index_path,
+            self.gui_config.index_rate,
+            self.gui_config.n_cpu,
+            inp_q,
+            opt_q,
+            self.config,
+            self.rvc if hasattr(self, "rvc") else None,
+        )
+        self.gui_config.samplerate = self.rvc.tgt_sr
+        self.zc = self.rvc.tgt_sr // 100
+        self.block_frame = (
+            int(
+                np.round(
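+                    # round block_time (seconds) to a whole number of
+                    # zc-sample frames; zc = tgt_sr // 100, i.e. 10 ms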
+                    self.gui_config.block_time
+                    * self.gui_config.samplerate
+                    / self.zc
+                )
+            )
+            * self.zc
+        )
+        self.block_frame_16k = 160 * self.block_frame // self.zc
+        self.crossfade_frame = (
+            int(
+                np.round(
+                    self.gui_config.crossfade_time
+                    * self.gui_config.samplerate
+                    / self.zc
+                )
+            )
+            * self.zc
+        )
+        self.sola_search_frame = self.zc
+        self.extra_frame = (
+            int(
+                np.round(
+                    self.gui_config.extra_time
+                    * self.gui_config.samplerate
+                    / self.zc
+                )
+            )
+            * self.zc
+        )
+        self.input_wav: torch.Tensor = torch.zeros(
+            self.extra_frame
+            + self.crossfade_frame
+            + self.sola_search_frame
+            + self.block_frame,
+            device=self.config.device,
+            dtype=torch.float32,
+        )
+        self.input_wav_res: torch.Tensor = torch.zeros(
+            160 * self.input_wav.shape[0] // self.zc,
+            device=self.config.device,
+            dtype=torch.float32,
+        )
+        self.pitch: np.ndarray = np.zeros(
+            self.input_wav.shape[0] // self.zc,
+            dtype="int32",
+        )
+        self.pitchf: np.ndarray = np.zeros(
+            self.input_wav.shape[0] // self.zc,
+            dtype="float64",
+        )
+        self.sola_buffer: torch.Tensor = torch.zeros(
+            self.crossfade_frame, device=self.config.device, dtype=torch.float32
+        )
+        self.nr_buffer: torch.Tensor = self.sola_buffer.clone()
+        self.output_buffer: torch.Tensor = self.input_wav.clone()
+        self.res_buffer: torch.Tensor = torch.zeros(
+            2 * self.zc, device=self.config.device, dtype=torch.float32
+        )
+        self.valid_rate = 1 - (self.extra_frame - 1) / self.input_wav.shape[0]
+        self.fade_in_window: torch.Tensor = (
+            torch.sin(
+                0.5
+                * np.pi
+                * torch.linspace(
+                    0.0,
+                    1.0,
+                    steps=self.crossfade_frame,
+                    device=self.config.device,
+                    dtype=torch.float32,
+                )
+            )
+            ** 2
+        )
+        self.fade_out_window: torch.Tensor = 1 - self.fade_in_window
+        self.resampler = tat.Resample(
+            orig_freq=self.gui_config.samplerate,
+            new_freq=16000,
+            dtype=torch.float32,
+        ).to(self.config.device)
+        self.tg = TorchGate(
+            sr=self.gui_config.samplerate, n_fft=4 * self.zc, prop_decrease=0.9
+        ).to(self.config.device)
+        thread_vc = threading.Thread(target=self.soundinput)
+        thread_vc.start()
+
+    def soundinput(self):
+        """
+        Accept audio input.
+        """
+        channels = 1 if sys.platform == "darwin" else 2
+        with sd.Stream(
+            channels=channels,
+            callback=self.audio_callback,
+            blocksize=self.block_frame,
+            samplerate=self.gui_config.samplerate,
+            dtype="float32",
+        ) as stream:
+            global stream_latency
+            stream_latency = stream.latency[-1]
+            while self.flag_vc:
+                time.sleep(self.gui_config.block_time)
+                printt("Audio block passed.")
+        printt("Ending VC")
+
+    def audio_callback(
+        self, indata: np.ndarray, outdata: np.ndarray, frames, times, status
+    ):
+        """
+        Audio processing.
+        """
+        start_time = time.perf_counter()
+        indata = librosa.to_mono(indata.T)
+        if self.gui_config.threhold > -60:
+            rms = librosa.feature.rms(
+                y=indata, frame_length=4 * self.zc, hop_length=self.zc
+            )
+            db_threhold = (
+                librosa.amplitude_to_db(rms, ref=1.0)[0] < self.gui_config.threhold
+            )
+            for i in range(db_threhold.shape[0]):
+                if db_threhold[i]:
+                    indata[i * self.zc : (i + 1) * self.zc] = 0
+        self.input_wav[: -self.block_frame] = self.input_wav[
+            self.block_frame :
+        ].clone()
+        self.input_wav[-self.block_frame :] = torch.from_numpy(indata).to(
+            self.config.device
+        )
+        self.input_wav_res[: -self.block_frame_16k] = self.input_wav_res[
+            self.block_frame_16k :
+        ].clone()
+        # input noise reduction and resampling
+        if self.gui_config.I_noise_reduce and self.function == "vc":
+            input_wav = self.input_wav[
+                -self.crossfade_frame - self.block_frame - 2 * self.zc :
+            ]
+            input_wav = self.tg(
+                input_wav.unsqueeze(0), self.input_wav.unsqueeze(0)
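+                # the second argument, the full rolling input buffer, serves
+                # as the reference signal for the spectral gate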
+            )[0, 2 * self.zc :]
+            input_wav[: self.crossfade_frame] *= self.fade_in_window
+            input_wav[: self.crossfade_frame] += (
+                self.nr_buffer * self.fade_out_window
+            )
+            self.nr_buffer[:] = input_wav[-self.crossfade_frame :]
+            input_wav = torch.cat(
+                (self.res_buffer[:], input_wav[: self.block_frame])
+            )
+            self.res_buffer[:] = input_wav[-2 * self.zc :]
+            self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
+                input_wav
+            )[160:]
+        else:
+            self.input_wav_res[-self.block_frame_16k - 160 :] = self.resampler(
+                self.input_wav[-self.block_frame - 2 * self.zc :]
+            )[160:]
+        # infer
+        if self.function == "vc":
+            f0_extractor_frame = self.block_frame_16k + 800
+            if self.gui_config.f0method == "rmvpe":
+                f0_extractor_frame = (
+                    5120 * ((f0_extractor_frame - 1) // 5120 + 1) - 160
+                )
+            infer_wav = self.rvc.infer(
+                self.input_wav_res,
+                self.input_wav_res[-f0_extractor_frame:].cpu().numpy(),
+                self.block_frame_16k,
+                self.valid_rate,
+                self.pitch,
+                self.pitchf,
+                self.gui_config.f0method,
+            )
+            infer_wav = infer_wav[
+                -self.crossfade_frame - self.sola_search_frame - self.block_frame :
+            ]
+        else:
+            infer_wav = self.input_wav[
+                -self.crossfade_frame - self.sola_search_frame - self.block_frame :
+            ].clone()
+        # output noise reduction
+        if (self.gui_config.O_noise_reduce and self.function == "vc") or (
+            self.gui_config.I_noise_reduce and self.function == "im"
+        ):
+            self.output_buffer[: -self.block_frame] = self.output_buffer[
+                self.block_frame :
+            ].clone()
+            self.output_buffer[-self.block_frame :] = infer_wav[-self.block_frame :]
+            infer_wav = self.tg(
+                infer_wav.unsqueeze(0), self.output_buffer.unsqueeze(0)
+            ).squeeze(0)
+        # volume envelope mixing
+        if self.gui_config.rms_mix_rate < 1 and self.function == "vc":
+            rms1 = librosa.feature.rms(
+                y=self.input_wav_res[-160 * infer_wav.shape[0] // self.zc :]
+                .cpu()
+                .numpy(),
+                frame_length=640,
+                hop_length=160,
+            )
+            rms1 = torch.from_numpy(rms1).to(self.config.device)
+            rms1 = F.interpolate(
+                rms1.unsqueeze(0),
+                size=infer_wav.shape[0] + 1,
+                mode="linear",
+                align_corners=True,
+            )[0, 0, :-1]
+            rms2 = librosa.feature.rms(
+                y=infer_wav[:].cpu().numpy(),
+                frame_length=4 * self.zc,
+                hop_length=self.zc,
+            )
+            rms2 = torch.from_numpy(rms2).to(self.config.device)
+            rms2 = F.interpolate(
+                rms2.unsqueeze(0),
+                size=infer_wav.shape[0] + 1,
+                mode="linear",
+                align_corners=True,
+            )[0, 0, :-1]
+            rms2 = torch.max(rms2, torch.zeros_like(rms2) + 1e-3)
+            infer_wav *= torch.pow(
+                rms1 / rms2, torch.tensor(1 - self.gui_config.rms_mix_rate)
+            )
+        # SOLA algorithm from https://github.com/yxlllc/DDSP-SVC
+        conv_input = infer_wav[
+            None, None, : self.crossfade_frame + self.sola_search_frame
+        ]
+        cor_nom = F.conv1d(conv_input, self.sola_buffer[None, None, :])
+        cor_den = torch.sqrt(
+            F.conv1d(
+                conv_input**2,
+                torch.ones(1, 1, self.crossfade_frame, device=self.config.device),
+            )
+            + 1e-8
+        )
+        if sys.platform == "darwin":
+            # torch.max(..., dim=0) returns (values, indices); the bare
+            # one-argument form cannot be unpacked into two values
+            _, sola_offset = torch.max(cor_nom[0, 0] / cor_den[0, 0], dim=0)
+            sola_offset = sola_offset.item()
+        else:
+            sola_offset = torch.argmax(cor_nom[0, 0] / cor_den[0, 0])
+        printt("sola_offset = %d", int(sola_offset))
+        infer_wav = infer_wav[
+            sola_offset : sola_offset + self.block_frame + self.crossfade_frame
+        ]
+        infer_wav[: self.crossfade_frame] *= self.fade_in_window
+        infer_wav[: self.crossfade_frame] += self.sola_buffer * self.fade_out_window
+        self.sola_buffer[:] = infer_wav[-self.crossfade_frame :]
+        if sys.platform == "darwin":
+            outdata[:] = (
+                infer_wav[: -self.crossfade_frame].cpu().numpy()[:, np.newaxis]
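+                # the darwin stream is opened with a single channel, so add
+                # a channel axis instead of duplicating to stereo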
+            )
+        else:
+            outdata[:] = (
+                infer_wav[: -self.crossfade_frame].repeat(2, 1).t().cpu().numpy()
+            )
+        total_time = time.perf_counter() - start_time
+        self.window["infer_time"].update(int(total_time * 1000))
+        printt("Infer time: %.2f", total_time)
+
+    def get_devices(self, update: bool = True):
+        """Get the lists of available input and output devices."""
+        if update:
+            sd._terminate()
+            sd._initialize()
+        devices = sd.query_devices()
+        hostapis = sd.query_hostapis()
+        for hostapi in hostapis:
+            for device_idx in hostapi["devices"]:
+                devices[device_idx]["hostapi_name"] = hostapi["name"]
+        input_devices = [
+            f"{d['name']} ({d['hostapi_name']})"
+            for d in devices
+            if d["max_input_channels"] > 0
+        ]
+        output_devices = [
+            f"{d['name']} ({d['hostapi_name']})"
+            for d in devices
+            if d["max_output_channels"] > 0
+        ]
+        input_devices_indices = [
+            d["index"] if "index" in d else d["name"]
+            for d in devices
+            if d["max_input_channels"] > 0
+        ]
+        output_devices_indices = [
+            d["index"] if "index" in d else d["name"]
+            for d in devices
+            if d["max_output_channels"] > 0
+        ]
+        return (
+            input_devices,
+            output_devices,
+            input_devices_indices,
+            output_devices_indices,
+        )
+
+    def set_devices(self, input_device, output_device):
+        """Set the default input and output devices."""
+        (
+            input_devices,
+            output_devices,
+            input_device_indices,
+            output_device_indices,
+        ) = self.get_devices()
+        sd.default.device[0] = input_device_indices[
+            input_devices.index(input_device)
+        ]
+        sd.default.device[1] = output_device_indices[
+            output_devices.index(output_device)
+        ]
+        printt("Input device: %s:%s", str(sd.default.device[0]), input_device)
+        printt("Output device: %s:%s", str(sd.default.device[1]), output_device)
+
+    gui = GUI()

From e6cda00fcfe326d8cd85e4daf27cb582507b36b8 Mon Sep 17 00:00:00 2001
From: CN_ChiTu <36254426+CNChTu@users.noreply.github.com>
Date: Thu, 14 Dec 2023 21:09:41 +0800
Subject: [PATCH 4/4] add fcpe for realtime

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index f0e7181..cb6e93e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -42,6 +42,7 @@ onnxruntime; sys_platform == 'darwin'
 onnxruntime-gpu; sys_platform != 'darwin'
 torchcrepe==0.0.20
 fastapi==0.88
+torchfcpe
 ffmpy==0.3.1
 python-dotenv>=1.0.0
 av
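
Note on the new requirement: torchfcpe is added unpinned. The bundled-model
API that the realtime patches rely on can be exercised on its own as a quick
smoke test; a minimal sketch (the file name "speech.wav" and the 16 kHz mono
load are illustrative assumptions, not part of these patches):

    import torch
    import librosa
    from torchfcpe import spawn_bundled_infer_model

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # 16 kHz mono audio shaped (batch, samples), mirroring the realtime code
    audio, _ = librosa.load("speech.wav", sr=16000, mono=True)  # hypothetical file
    audio = torch.from_numpy(audio).float().unsqueeze(0).to(device)

    model = spawn_bundled_infer_model(device)  # loads the bundled FCPE model
    f0 = model.infer(
        audio,
        sr=16000,
        decoder_mode="local_argmax",  # decoding mode used by tools/rvc_for_realtime.py
        threshold=0.006,  # voicing threshold used by tools/rvc_for_realtime.py
    )
    print(f0)  # pitch contour in Hz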