offline-source-separation-uvr.py
3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/env python3
# Copyright (c) 2025 Xiaomi Corporation
"""
This file shows how to use UVR for source separation.
Please first download a UVR model from
https://github.com/k2-fsa/sherpa-onnx/releases/tag/source-separation-models
The following is an example:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/UVR_MDXNET_9482.onnx
Please also download a test file
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/source-separation-models/qi-feng-le-zh.wav
The test wav file is 16-bit encoded with 2 channels. If you have other
formats, e.g., .mp4 or .mp3, please first use ffmpeg to convert it.
For instance
ffmpeg -i your.mp4 -vn -acodec pcm_s16le -ar 44100 -ac 2 out.wav
Then you can use out.wav as input for this example.
"""
import time
from pathlib import Path
import numpy as np
import sherpa_onnx
import soundfile as sf
def create_offline_source_separation():
    """Build an OfflineSourceSeparation object backed by the UVR model.

    Expects ./UVR_MDXNET_9482.onnx to exist (see the module docstring for
    the download link).

    Returns:
      A configured ``sherpa_onnx.OfflineSourceSeparation`` instance.

    Raises:
      ValueError: if the model file is missing or the config is invalid.
    """
    # Please read the help message at the beginning of this file
    # to download model files
    model_path = "./UVR_MDXNET_9482.onnx"
    if not Path(model_path).is_file():
        raise ValueError(f"{model_path} does not exist.")

    # Build the config bottom-up: UVR model -> model config -> top-level config.
    uvr_config = sherpa_onnx.OfflineSourceSeparationUvrModelConfig(
        model=model_path,
    )
    model_config = sherpa_onnx.OfflineSourceSeparationModelConfig(
        uvr=uvr_config,
        num_threads=1,
        debug=False,
        provider="cpu",
    )
    config = sherpa_onnx.OfflineSourceSeparationConfig(model=model_config)

    if not config.validate():
        raise ValueError("Please check your config.")

    return sherpa_onnx.OfflineSourceSeparation(config)
def load_audio():
    """Load the test wav file as (num_channels, num_samples) float32 samples.

    Expects ./qi-feng-le-zh.wav to exist (see the module docstring for the
    download link).

    Returns:
      A tuple ``(samples, sample_rate)`` where ``samples`` is a np.float32
      array of shape (num_channels, num_samples).

    Raises:
      ValueError: if the wav file is missing or the loaded data does not
        have the expected layout/dtype.
    """
    # Please read the help message at the beginning of this file to download
    # the following wav_file
    wav_file = "./qi-feng-le-zh.wav"
    if not Path(wav_file).is_file():
        raise ValueError(f"{wav_file} does not exist")

    # always_2d=True guarantees shape (num_samples, num_channels) even for mono
    samples, sample_rate = sf.read(wav_file, dtype="float32", always_2d=True)
    samples = np.transpose(samples)
    # now samples is of shape (num_channels, num_samples)

    # Use explicit raises instead of assert: asserts are stripped when
    # Python runs with -O, which would silently skip these sanity checks.
    if samples.shape[1] <= samples.shape[0]:
        raise ValueError(
            f"You should use (num_channels, num_samples). {samples.shape}"
        )
    if samples.dtype != np.float32:
        raise ValueError(f"Expect np.float32 as dtype. Given: {samples.dtype}")

    return samples, sample_rate
def main():
    """Run UVR source separation on the test wav and save vocals/non-vocals."""
    separator = create_offline_source_separation()
    audio, sr = load_audio()
    # The C++ backend expects a C-contiguous buffer.
    audio = np.ascontiguousarray(audio)

    print("Started. Please wait")
    t0 = time.time()
    result = separator.process(sample_rate=sr, samples=audio)
    t1 = time.time()

    print("output.sample_rate", result.sample_rate)
    assert len(result.stems) == 2, len(result.stems)

    # Each stem is (num_channels, num_samples); transpose to
    # (num_samples, num_channels) for soundfile.
    vocals = np.transpose(result.stems[0].data)
    non_vocals = np.transpose(result.stems[1].data)

    sf.write("./uvr-vocals.wav", vocals, samplerate=result.sample_rate)
    sf.write("./uvr-non-vocals.wav", non_vocals, samplerate=result.sample_rate)
    print("Saved to ./uvr-vocals.wav and ./uvr-non-vocals.wav")

    elapsed_seconds = t1 - t0
    audio_duration = audio.shape[1] / sr
    real_time_factor = elapsed_seconds / audio_duration
    print(f"Elapsed seconds: {elapsed_seconds:.3f}")
    print(f"Audio duration in seconds: {audio_duration:.3f}")
    print(f"RTF: {elapsed_seconds:.3f}/{audio_duration:.3f} = {real_time_factor:.3f}")


if __name__ == "__main__":
    main()