Toggle navigation
Toggle navigation
此项目
正在载入...
Sign in
xuning
/
sherpaonnx
转到一个项目
Toggle navigation
项目
群组
代码片段
帮助
Toggle navigation pinning
Project
Activity
Repository
Pipelines
Graphs
Issues
0
Merge Requests
0
Wiki
Network
Create a new issue
Builds
Commits
Authored by
Fangjun Kuang
2024-08-15 14:54:43 +0800
Browse Files
Options
Browse Files
Download
Email Patches
Plain Diff
Committed by
GitHub
2024-08-15 14:54:43 +0800
Commit
ca729faebf8243ba1ead2ffecba0587a80380dd8
ca729fae
1 parent
62c4d4ab
Support reading multi-channel wave files with 8/16/32-bit encoded samples (#1258)
显示空白字符变更
内嵌
并排对比
正在显示
5 个修改的文件
包含
137 行增加
和
31 行删除
.github/scripts/test-offline-ctc.sh
.github/workflows/linux.yaml
sherpa-onnx/csrc/offline-tts-frontend.h
sherpa-onnx/csrc/wave-reader.cc
sherpa-onnx/jni/offline-recognizer.cc
.github/scripts/test-offline-ctc.sh
查看文件 @
ca729fa
...
...
@@ -38,14 +38,28 @@ done
# test wav reader for non-standard wav files
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/naudio.wav
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/junk-padding.wav
waves
=(
naudio.wav
junk-padding.wav
int8-1-channel-zh.wav
int8-2-channel-zh.wav
int8-4-channel-zh.wav
int16-1-channel-zh.wav
int16-2-channel-zh.wav
int32-1-channel-zh.wav
int32-2-channel-zh.wav
float32-1-channel-zh.wav
float32-2-channel-zh.wav
)
for
w
in
${
waves
[@]
}
;
do
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/
$w
time
$EXE
\
time
$EXE
\
--tokens
=
$repo
/tokens.txt
\
--sense-voice-model
=
$repo
/model.int8.onnx
\
./naudio.wav
\
./junk-padding.wav
$w
rm -v
$w
done
rm -rf
$repo
...
...
.github/workflows/linux.yaml
查看文件 @
ca729fa
...
...
@@ -143,35 +143,34 @@ jobs:
name
:
release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
path
:
install/*
-
name
:
Test o
nline punctuation
-
name
:
Test o
ffline CTC
shell
:
bash
run
:
|
du -h -d1 .
export PATH=$PWD/build/bin:$PATH
export EXE=sherpa-onnx-o
nline-punctuation
export EXE=sherpa-onnx-o
ffline
.github/scripts/test-o
nline-punctuation
.sh
.github/scripts/test-o
ffline-ctc
.sh
du -h -d1 .
-
name
:
Test o
ffline transducer
-
name
:
Test o
nline punctuation
shell
:
bash
run
:
|
du -h -d1 .
export PATH=$PWD/build/bin:$PATH
export EXE=sherpa-onnx-o
ffline
export EXE=sherpa-onnx-o
nline-punctuation
.github/scripts/test-o
ffline-transducer
.sh
.github/scripts/test-o
nline-punctuation
.sh
du -h -d1 .
-
name
:
Test offline CTC
-
name
:
Test offline transducer
shell
:
bash
run
:
|
du -h -d1 .
export PATH=$PWD/build/bin:$PATH
export EXE=sherpa-onnx-offline
.github/scripts/test-offline-
ctc
.sh
.github/scripts/test-offline-
transducer
.sh
du -h -d1 .
-
name
:
Test online transducer
...
...
sherpa-onnx/csrc/offline-tts-frontend.h
查看文件 @
ca729fa
...
...
@@ -6,6 +6,7 @@
#define SHERPA_ONNX_CSRC_OFFLINE_TTS_FRONTEND_H_
#include <cstdint>
#include <string>
#include <utility>
#include <vector>
#include "sherpa-onnx/csrc/macros.h"
...
...
sherpa-onnx/csrc/wave-reader.cc
查看文件 @
ca729fa
...
...
@@ -50,6 +50,16 @@ struct WaveHeader {
};
static_assert
(
sizeof
(
WaveHeader
)
==
44
);
/*
sox int16-1-channel-zh.wav -b 8 int8-1-channel-zh.wav
sox int16-1-channel-zh.wav -c 2 int16-2-channel-zh.wav
We use Audacity to generate int32-1-channel-zh.wav and float32-1-channel-zh.wav
because sox uses WAVE_FORMAT_EXTENSIBLE, which is not easy to support
in sherpa-onnx.
*/
// Read a wave file. If it has multiple channels, only the first channel is used.
// Return its samples normalized to the range [-1, 1).
std
::
vector
<
float
>
ReadWaveImpl
(
std
::
istream
&
is
,
int32_t
*
sampling_rate
,
...
...
@@ -114,9 +124,18 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
is
.
read
(
reinterpret_cast
<
char
*>
(
&
header
.
audio_format
),
sizeof
(
header
.
audio_format
));
if
(
header
.
audio_format
!=
1
)
{
// 1 for PCM
if
(
header
.
audio_format
!=
1
&&
header
.
audio_format
!=
3
)
{
// 1 for integer PCM
// 3 for floating point PCM
// see https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
// and https://github.com/microsoft/DirectXTK/wiki/Wave-Formats
SHERPA_ONNX_LOGE
(
"Expected audio_format 1. Given: %d
\n
"
,
header
.
audio_format
);
if
(
header
.
audio_format
==
static_cast
<
int16_t
>
(
0xfffe
))
{
SHERPA_ONNX_LOGE
(
"We don't support WAVE_FORMAT_EXTENSIBLE files."
);
}
*
is_ok
=
false
;
return
{};
}
...
...
@@ -125,10 +144,9 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
sizeof
(
header
.
num_channels
));
if
(
header
.
num_channels
!=
1
)
{
// we support only single channel for now
SHERPA_ONNX_LOGE
(
"Expected single channel. Given: %d
\n
"
,
SHERPA_ONNX_LOGE
(
"Warning: %d channels are found. We only use the first channel.
\n
"
,
header
.
num_channels
);
*
is_ok
=
false
;
return
{};
}
is
.
read
(
reinterpret_cast
<
char
*>
(
&
header
.
sample_rate
),
...
...
@@ -161,8 +179,9 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
return
{};
}
if
(
header
.
bits_per_sample
!=
16
)
{
// we support only 16 bits per sample
SHERPA_ONNX_LOGE
(
"Expected bits_per_sample 16. Given: %d
\n
"
,
if
(
header
.
bits_per_sample
!=
8
&&
header
.
bits_per_sample
!=
16
&&
header
.
bits_per_sample
!=
32
)
{
SHERPA_ONNX_LOGE
(
"Expected bits_per_sample 8, 16 or 32. Given: %d
\n
"
,
header
.
bits_per_sample
);
*
is_ok
=
false
;
return
{};
...
...
@@ -199,19 +218,93 @@ std::vector<float> ReadWaveImpl(std::istream &is, int32_t *sampling_rate,
*
sampling_rate
=
header
.
sample_rate
;
std
::
vector
<
float
>
ans
;
if
(
header
.
bits_per_sample
==
16
&&
header
.
audio_format
==
1
)
{
// header.subchunk2_size contains the number of bytes in the data.
// Each 16-bit sample occupies 2 bytes, so dividing by 2 gives the total
// number of samples over all (interleaved) channels.
std
::
vector
<
int16_t
>
samples
(
header
.
subchunk2_size
/
2
);
SHERPA_ONNX_LOGE
(
"%d samples, bytes: %d"
,
(
int
)
samples
.
size
(),
header
.
subchunk2_size
);
is
.
read
(
reinterpret_cast
<
char
*>
(
samples
.
data
()),
header
.
subchunk2_size
);
if
(
!
is
)
{
SHERPA_ONNX_LOGE
(
"Failed to read %d bytes"
,
header
.
subchunk2_size
);
*
is_ok
=
false
;
return
{};
}
ans
.
resize
(
samples
.
size
()
/
header
.
num_channels
);
// samples are interleaved
for
(
int32_t
i
=
0
;
i
!=
static_cast
<
int32_t
>
(
ans
.
size
());
++
i
)
{
ans
[
i
]
=
samples
[
i
*
header
.
num_channels
]
/
32768.
;
}
}
else
if
(
header
.
bits_per_sample
==
8
&&
header
.
audio_format
==
1
)
{
// For 8-bit encoded samples, the number of samples (over all channels)
// equals the number of bytes.
//
// Note that 8-bit encoded samples are unsigned!
std
::
vector
<
uint8_t
>
samples
(
header
.
subchunk2_size
);
is
.
read
(
reinterpret_cast
<
char
*>
(
samples
.
data
()),
header
.
subchunk2_size
);
if
(
!
is
)
{
SHERPA_ONNX_LOGE
(
"Failed to read %d bytes"
,
header
.
subchunk2_size
);
*
is_ok
=
false
;
return
{};
}
ans
.
resize
(
samples
.
size
()
/
header
.
num_channels
);
for
(
int32_t
i
=
0
;
i
!=
static_cast
<
int32_t
>
(
ans
.
size
());
++
i
)
{
// Note(fangjun): We want to normalize each sample into the range [-1, 1).
// Since each original sample is an unsigned 8-bit value in the range
// [0, 255], dividing it by 128 converts it to the range [0, 2);
// so after subtracting 1, we get the range [-1, 1).
//
ans
[
i
]
=
samples
[
i
*
header
.
num_channels
]
/
128.
-
1
;
}
}
else
if
(
header
.
bits_per_sample
==
32
&&
header
.
audio_format
==
1
)
{
// 32 here is for int32
//
// header.subchunk2_size contains the number of bytes in the data.
// Each int32 sample occupies 4 bytes, so dividing by 4 gives the total
// number of samples over all (interleaved) channels.
std
::
vector
<
int32_t
>
samples
(
header
.
subchunk2_size
/
4
);
is
.
read
(
reinterpret_cast
<
char
*>
(
samples
.
data
()),
header
.
subchunk2_size
);
if
(
!
is
)
{
SHERPA_ONNX_LOGE
(
"Failed to read %d bytes"
,
header
.
subchunk2_size
);
*
is_ok
=
false
;
return
{};
}
ans
.
resize
(
samples
.
size
()
/
header
.
num_channels
);
for
(
int32_t
i
=
0
;
i
!=
static_cast
<
int32_t
>
(
ans
.
size
());
++
i
)
{
ans
[
i
]
=
static_cast
<
float
>
(
samples
[
i
*
header
.
num_channels
])
/
(
1
<<
31
);
}
}
else
if
(
header
.
bits_per_sample
==
32
&&
header
.
audio_format
==
3
)
{
// 32 here is for float32
//
// header.subchunk2_size contains the number of bytes in the data.
// Each float32 sample occupies 4 bytes, so dividing by 4 gives the total
// number of samples over all (interleaved) channels.
std
::
vector
<
float
>
samples
(
header
.
subchunk2_size
/
4
);
is
.
read
(
reinterpret_cast
<
char
*>
(
samples
.
data
()),
header
.
subchunk2_size
);
if
(
!
is
)
{
SHERPA_ONNX_LOGE
(
"Failed to read %d bytes"
,
header
.
subchunk2_size
);
*
is_ok
=
false
;
return
{};
}
std
::
vector
<
float
>
ans
(
samples
.
size
()
);
ans
.
resize
(
samples
.
size
()
/
header
.
num_channels
);
for
(
int32_t
i
=
0
;
i
!=
static_cast
<
int32_t
>
(
ans
.
size
());
++
i
)
{
ans
[
i
]
=
samples
[
i
]
/
32768.
;
ans
[
i
]
=
samples
[
i
*
header
.
num_channels
];
}
}
else
{
SHERPA_ONNX_LOGE
(
"Unsupported %d bits per sample and audio format: %d. Supported values "
"are: 8, 16, 32."
,
header
.
bits_per_sample
,
header
.
audio_format
);
*
is_ok
=
false
;
return
{};
}
*
is_ok
=
true
;
...
...
sherpa-onnx/jni/offline-recognizer.cc
查看文件 @
ca729fa
...
...
@@ -264,13 +264,9 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_newFromFile(JNIEnv *env,
return
(
jlong
)
model
;
}
SHERPA_ONNX_EXTERN_C
JNIEXPORT
void
JNICALL
Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig
(
JNIEnv
*
env
,
jobject
/*obj*/
,
jlong
ptr
,
jobject
_config
)
{
JNIEXPORT
void
JNICALL
Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_setConfig
(
JNIEnv
*
env
,
jobject
/*obj*/
,
jlong
ptr
,
jobject
_config
)
{
auto
config
=
sherpa_onnx
::
GetOfflineConfig
(
env
,
_config
);
SHERPA_ONNX_LOGE
(
"config:
\n
%s"
,
config
.
ToString
().
c_str
());
...
...
@@ -350,9 +346,12 @@ Java_com_k2fsa_sherpa_onnx_OfflineRecognizer_getResult(JNIEnv *env,
// [3]: lang, jstring
// [4]: emotion, jstring
// [5]: event, jstring
env
->
SetObjectArrayElement
(
obj_arr
,
3
,
env
->
NewStringUTF
(
result
.
lang
.
c_str
()));
env
->
SetObjectArrayElement
(
obj_arr
,
4
,
env
->
NewStringUTF
(
result
.
emotion
.
c_str
()));
env
->
SetObjectArrayElement
(
obj_arr
,
5
,
env
->
NewStringUTF
(
result
.
event
.
c_str
()));
env
->
SetObjectArrayElement
(
obj_arr
,
3
,
env
->
NewStringUTF
(
result
.
lang
.
c_str
()));
env
->
SetObjectArrayElement
(
obj_arr
,
4
,
env
->
NewStringUTF
(
result
.
emotion
.
c_str
()));
env
->
SetObjectArrayElement
(
obj_arr
,
5
,
env
->
NewStringUTF
(
result
.
event
.
c_str
()));
return
obj_arr
;
}
...
...
请
注册
或
登录
后发表评论