Committed by GitHub
Add C# API for Moonshine models. (#1483)
* Also, return timestamps for non-streaming ASR.
正在显示 6 个修改的文件，包含 143 行增加和 8 行删除
| @@ -9,6 +9,9 @@ rm -fv *.wav | @@ -9,6 +9,9 @@ rm -fv *.wav | ||
| 9 | rm -rfv sherpa-onnx-pyannote-* | 9 | rm -rfv sherpa-onnx-pyannote-* |
| 10 | 10 | ||
| 11 | cd ../offline-decode-files | 11 | cd ../offline-decode-files |
| 12 | +./run-moonshine.sh | ||
| 13 | +rm -rf sherpa-onnx-* | ||
| 14 | + | ||
| 12 | ./run-sense-voice-ctc.sh | 15 | ./run-sense-voice-ctc.sh |
| 13 | rm -rf sherpa-onnx-* | 16 | rm -rf sherpa-onnx-* |
| 14 | 17 |
| @@ -17,7 +17,7 @@ class OfflineDecodeFiles | @@ -17,7 +17,7 @@ class OfflineDecodeFiles | ||
| 17 | { | 17 | { |
| 18 | 18 | ||
| 19 | [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")] | 19 | [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")] |
| 20 | - public int SampleRate { get; set; } = 16000; | 20 | + public int SampleRate { get; set; } = 16000; |
| 21 | 21 | ||
| 22 | [Option("feat-dim", Required = false, Default = 80, HelpText = "Dimension of the features used to train the model")] | 22 | [Option("feat-dim", Required = false, Default = 80, HelpText = "Dimension of the features used to train the model")] |
| 23 | public int FeatureDim { get; set; } = 80; | 23 | public int FeatureDim { get; set; } = 80; |
| @@ -31,7 +31,7 @@ class OfflineDecodeFiles | @@ -31,7 +31,7 @@ class OfflineDecodeFiles | ||
| 31 | [Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")] | 31 | [Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")] |
| 32 | public string Decoder { get; set; } = ""; | 32 | public string Decoder { get; set; } = ""; |
| 33 | 33 | ||
| 34 | - [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] | 34 | + [Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")] |
| 35 | public string Joiner { get; set; } = ""; | 35 | public string Joiner { get; set; } = ""; |
| 36 | 36 | ||
| 37 | [Option("model-type", Required = false, Default = "", HelpText = "model type")] | 37 | [Option("model-type", Required = false, Default = "", HelpText = "model type")] |
| @@ -44,10 +44,22 @@ class OfflineDecodeFiles | @@ -44,10 +44,22 @@ class OfflineDecodeFiles | ||
| 44 | public string WhisperDecoder { get; set; } = ""; | 44 | public string WhisperDecoder { get; set; } = ""; |
| 45 | 45 | ||
| 46 | [Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")] | 46 | [Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")] |
| 47 | - public string WhisperLanguage{ get; set; } = ""; | 47 | + public string WhisperLanguage { get; set; } = ""; |
| 48 | 48 | ||
| 49 | [Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")] | 49 | [Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")] |
| 50 | - public string WhisperTask{ get; set; } = "transcribe"; | 50 | + public string WhisperTask { get; set; } = "transcribe"; |
| 51 | + | ||
| 52 | + [Option("moonshine-preprocessor", Required = false, Default = "", HelpText = "Path to preprocess.onnx. Used only for Moonshine models")] | ||
| 53 | + public string MoonshinePreprocessor { get; set; } = ""; | ||
| 54 | + | ||
| 55 | + [Option("moonshine-encoder", Required = false, Default = "", HelpText = "Path to encode.onnx. Used only for Moonshine models")] | ||
| 56 | + public string MoonshineEncoder { get; set; } = ""; | ||
| 57 | + | ||
| 58 | + [Option("moonshine-uncached-decoder", Required = false, Default = "", HelpText = "Path to uncached_decode.onnx. Used only for Moonshine models")] | ||
| 59 | + public string MoonshineUncachedDecoder { get; set; } = ""; | ||
| 60 | + | ||
| 61 | + [Option("moonshine-cached-decoder", Required = false, Default = "", HelpText = "Path to cached_decode.onnx. Used only for Moonshine models")] | ||
| 62 | + public string MoonshineCachedDecoder { get; set; } = ""; | ||
| 51 | 63 | ||
| 52 | [Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")] | 64 | [Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")] |
| 53 | public string TdnnModel { get; set; } = ""; | 65 | public string TdnnModel { get; set; } = ""; |
| @@ -90,7 +102,7 @@ It specifies number of active paths to keep during the search")] | @@ -90,7 +102,7 @@ It specifies number of active paths to keep during the search")] | ||
| 90 | public float HotwordsScore { get; set; } = 1.5F; | 102 | public float HotwordsScore { get; set; } = 1.5F; |
| 91 | 103 | ||
| 92 | [Option("files", Required = true, HelpText = "Audio files for decoding")] | 104 | [Option("files", Required = true, HelpText = "Audio files for decoding")] |
| 93 | - public IEnumerable<string> Files { get; set; } = new string[] {}; | 105 | + public IEnumerable<string> Files { get; set; } = new string[] { }; |
| 94 | } | 106 | } |
| 95 | 107 | ||
| 96 | static void Main(string[] args) | 108 | static void Main(string[] args) |
| @@ -236,6 +248,13 @@ to download pre-trained Tdnn models. | @@ -236,6 +248,13 @@ to download pre-trained Tdnn models. | ||
| 236 | config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel; | 248 | config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel; |
| 237 | config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn; | 249 | config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn; |
| 238 | } | 250 | } |
| 251 | + else if (!String.IsNullOrEmpty(options.MoonshinePreprocessor)) | ||
| 252 | + { | ||
| 253 | + config.ModelConfig.Moonshine.Preprocessor = options.MoonshinePreprocessor; | ||
| 254 | + config.ModelConfig.Moonshine.Encoder = options.MoonshineEncoder; | ||
| 255 | + config.ModelConfig.Moonshine.UncachedDecoder = options.MoonshineUncachedDecoder; | ||
| 256 | + config.ModelConfig.Moonshine.CachedDecoder = options.MoonshineCachedDecoder; | ||
| 257 | + } | ||
| 239 | else | 258 | else |
| 240 | { | 259 | { |
| 241 | Console.WriteLine("Please provide a model"); | 260 | Console.WriteLine("Please provide a model"); |
| @@ -273,10 +292,21 @@ to download pre-trained Tdnn models. | @@ -273,10 +292,21 @@ to download pre-trained Tdnn models. | ||
| 273 | // display results | 292 | // display results |
| 274 | for (int i = 0; i != files.Length; ++i) | 293 | for (int i = 0; i != files.Length; ++i) |
| 275 | { | 294 | { |
| 276 | - var text = streams[i].Result.Text; | 295 | + var r = streams[i].Result; |
| 277 | Console.WriteLine("--------------------"); | 296 | Console.WriteLine("--------------------"); |
| 278 | Console.WriteLine(files[i]); | 297 | Console.WriteLine(files[i]); |
| 279 | - Console.WriteLine(text); | 298 | + Console.WriteLine("Text: {0}", r.Text); |
| 299 | + Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens)); | ||
| 300 | + if (r.Timestamps != null && r.Timestamps.Length > 0) { | ||
| 301 | + Console.Write("Timestamps: ["); | ||
| 302 | + var sep = ""; | ||
| 303 | + for (int k = 0; k != r.Timestamps.Length; ++k) | ||
| 304 | + { | ||
| 305 | + Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00")); | ||
| 306 | + sep = ", "; | ||
| 307 | + } | ||
| 308 | + Console.WriteLine("]"); | ||
| 309 | + } | ||
| 280 | } | 310 | } |
| 281 | Console.WriteLine("--------------------"); | 311 | Console.WriteLine("--------------------"); |
| 282 | } | 312 | } |
| 1 | +#!/usr/bin/env bash | ||
| 2 | + | ||
| 3 | +set -ex | ||
| 4 | + | ||
| 5 | +if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then | ||
| 6 | + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 7 | + tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 8 | + rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2 | ||
| 9 | +fi | ||
| 10 | + | ||
| 11 | +dotnet run \ | ||
| 12 | + --num-threads=2 \ | ||
| 13 | + --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \ | ||
| 14 | + --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \ | ||
| 15 | + --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \ | ||
| 16 | + --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \ | ||
| 17 | + --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \ | ||
| 18 | + --files ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav |
| @@ -24,6 +24,7 @@ namespace SherpaOnnx | @@ -24,6 +24,7 @@ namespace SherpaOnnx | ||
| 24 | BpeVocab = ""; | 24 | BpeVocab = ""; |
| 25 | TeleSpeechCtc = ""; | 25 | TeleSpeechCtc = ""; |
| 26 | SenseVoice = new OfflineSenseVoiceModelConfig(); | 26 | SenseVoice = new OfflineSenseVoiceModelConfig(); |
| 27 | + Moonshine = new OfflineMoonshineModelConfig(); | ||
| 27 | } | 28 | } |
| 28 | public OfflineTransducerModelConfig Transducer; | 29 | public OfflineTransducerModelConfig Transducer; |
| 29 | public OfflineParaformerModelConfig Paraformer; | 30 | public OfflineParaformerModelConfig Paraformer; |
| @@ -54,5 +55,6 @@ namespace SherpaOnnx | @@ -54,5 +55,6 @@ namespace SherpaOnnx | ||
| 54 | public string TeleSpeechCtc; | 55 | public string TeleSpeechCtc; |
| 55 | 56 | ||
| 56 | public OfflineSenseVoiceModelConfig SenseVoice; | 57 | public OfflineSenseVoiceModelConfig SenseVoice; |
| 58 | + public OfflineMoonshineModelConfig Moonshine; | ||
| 57 | } | 59 | } |
| 58 | } | 60 | } |
| 1 | +/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang) | ||
| 2 | + | ||
| 3 | +using System.Runtime.InteropServices; | ||
| 4 | + | ||
| 5 | +namespace SherpaOnnx | ||
| 6 | +{ | ||
| 7 | + [StructLayout(LayoutKind.Sequential)] | ||
| 8 | + public struct OfflineMoonshineModelConfig | ||
| 9 | + { | ||
| 10 | + public OfflineMoonshineModelConfig() | ||
| 11 | + { | ||
| 12 | + Preprocessor = ""; | ||
| 13 | + Encoder = ""; | ||
| 14 | + UncachedDecoder = ""; | ||
| 15 | + CachedDecoder = ""; | ||
| 16 | + } | ||
| 17 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 18 | + public string Preprocessor; | ||
| 19 | + | ||
| 20 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 21 | + public string Encoder; | ||
| 22 | + | ||
| 23 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 24 | + public string UncachedDecoder; | ||
| 25 | + | ||
| 26 | + [MarshalAs(UnmanagedType.LPStr)] | ||
| 27 | + public string CachedDecoder; | ||
| 28 | + } | ||
| 29 | +} |
| @@ -31,17 +31,70 @@ namespace SherpaOnnx | @@ -31,17 +31,70 @@ namespace SherpaOnnx | ||
| 31 | byte[] stringBuffer = new byte[length]; | 31 | byte[] stringBuffer = new byte[length]; |
| 32 | Marshal.Copy(impl.Text, stringBuffer, 0, length); | 32 | Marshal.Copy(impl.Text, stringBuffer, 0, length); |
| 33 | _text = Encoding.UTF8.GetString(stringBuffer); | 33 | _text = Encoding.UTF8.GetString(stringBuffer); |
| 34 | + | ||
| 35 | + _tokens = new String[impl.Count]; | ||
| 36 | + | ||
| 37 | + unsafe | ||
| 38 | + { | ||
| 39 | + byte* buf = (byte*)impl.Tokens; | ||
| 40 | + for (int i = 0; i < impl.Count; i++) | ||
| 41 | + { | ||
| 42 | + length = 0; | ||
| 43 | + byte* start = buf; | ||
| 44 | + while (*buf != 0) | ||
| 45 | + { | ||
| 46 | + ++buf; | ||
| 47 | + length += 1; | ||
| 48 | + } | ||
| 49 | + ++buf; | ||
| 50 | + | ||
| 51 | + stringBuffer = new byte[length]; | ||
| 52 | + fixed (byte* pTarget = stringBuffer) | ||
| 53 | + { | ||
| 54 | + for (int k = 0; k < length; k++) | ||
| 55 | + { | ||
| 56 | + pTarget[k] = start[k]; | ||
| 57 | + } | ||
| 58 | + } | ||
| 59 | + | ||
| 60 | + _tokens[i] = Encoding.UTF8.GetString(stringBuffer); | ||
| 61 | + } | ||
| 62 | + } | ||
| 63 | + | ||
| 64 | + unsafe | ||
| 65 | + { | ||
| 66 | + if (impl.Timestamps != IntPtr.Zero) | ||
| 67 | + { | ||
| 68 | + float *t = (float*)impl.Timestamps; | ||
| 69 | + _timestamps = new float[impl.Count]; | ||
| 70 | + fixed (float* f = _timestamps) | ||
| 71 | + { | ||
| 72 | + for (int k = 0; k < impl.Count; k++) | ||
| 73 | + { | ||
| 74 | + f[k] = t[k]; | ||
| 75 | + } | ||
| 76 | + } | ||
| 77 | + } | ||
| 78 | + } | ||
| 79 | + | ||
| 34 | } | 80 | } |
| 35 | 81 | ||
| 36 | [StructLayout(LayoutKind.Sequential)] | 82 | [StructLayout(LayoutKind.Sequential)] |
| 37 | struct Impl | 83 | struct Impl |
| 38 | { | 84 | { |
| 39 | public IntPtr Text; | 85 | public IntPtr Text; |
| 86 | + public IntPtr Timestamps; | ||
| 87 | + public int Count; | ||
| 88 | + public IntPtr Tokens; | ||
| 40 | } | 89 | } |
| 41 | 90 | ||
| 42 | private String _text; | 91 | private String _text; |
| 43 | public String Text => _text; | 92 | public String Text => _text; |
| 44 | - } | ||
| 45 | 93 | ||
| 94 | + private String[] _tokens; | ||
| 95 | + public String[] Tokens => _tokens; | ||
| 46 | 96 | ||
| 97 | + private float[] _timestamps; | ||
| 98 | + public float[] Timestamps => _timestamps; | ||
| 99 | + } | ||
| 47 | } | 100 | } |
-
请注册或登录后发表评论