gtf35
Committed by GitHub

Replace torchaudio with soundfile in python-api-examples (#765)

@@ -65,7 +65,7 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import sherpa_onnx
-import torchaudio
+import soundfile as sf
 
 try:
     import sounddevice as sd
@@ -357,8 +357,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
 
 
 def load_audio(filename: str) -> Tuple[np.ndarray, int]:
-    samples, sample_rate = torchaudio.load(filename)
-    return samples[0].contiguous().numpy(), sample_rate
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
 
 
 def compute_speaker_embedding(
@@ -60,7 +60,7 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import sherpa_onnx
-import torchaudio
+import soundfile as sf
 
 try:
     import sounddevice as sd
@@ -160,8 +160,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
 
 
 def load_audio(filename: str) -> Tuple[np.ndarray, int]:
-    samples, sample_rate = torchaudio.load(filename)
-    return samples[0].contiguous().numpy(), sample_rate
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
 
 
 def compute_speaker_embedding(
@@ -52,7 +52,7 @@ from typing import Dict, List, Tuple
 
 import numpy as np
 import sherpa_onnx
-import torchaudio
+import soundfile as sf
 
 try:
     import sounddevice as sd
@@ -145,8 +145,14 @@ def load_speaker_file(args) -> Dict[str, List[str]]:
 
 
 def load_audio(filename: str) -> Tuple[np.ndarray, int]:
-    samples, sample_rate = torchaudio.load(filename)
-    return samples[0].contiguous().numpy(), sample_rate
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
 
 
 def compute_speaker_embedding(