Fangjun Kuang (committed via GitHub)

Add C# API for Moonshine models. (#1483)

* Also, return timestamps for non-streaming ASR.
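
Taken together, the changes below add Moonshine options to the offline-decode-files example, a new run-moonshine.sh test script, an OfflineMoonshineModelConfig struct wired into OfflineModelConfig, and Tokens/Timestamps fields on OfflineRecognizerResult. As a rough sketch of what that enables from user code (not part of this commit; it leans on the existing OfflineRecognizer/OfflineStream C# wrappers, and LoadSamples() is a hypothetical stand-in for reading a 16 kHz mono wave file into a float[]):

// Sketch only: decode one file with the new Moonshine config and print the
// per-token timestamps returned for non-streaming ASR.
using System;
using SherpaOnnx;

class MoonshineSketch
{
  static void Main()
  {
    var config = new OfflineRecognizerConfig();
    config.ModelConfig.Moonshine.Preprocessor = "./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx";
    config.ModelConfig.Moonshine.Encoder = "./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx";
    config.ModelConfig.Moonshine.UncachedDecoder = "./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx";
    config.ModelConfig.Moonshine.CachedDecoder = "./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx";
    config.ModelConfig.Tokens = "./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt";

    var recognizer = new OfflineRecognizer(config);
    var stream = recognizer.CreateStream();

    float[] samples = LoadSamples();        // hypothetical helper
    stream.AcceptWaveform(16000, samples);
    recognizer.Decode(stream);

    var r = stream.Result;
    Console.WriteLine(r.Text);
    if (r.Timestamps != null && r.Timestamps.Length == r.Tokens.Length)
    {
      for (int i = 0; i < r.Tokens.Length; ++i)
      {
        // per-token timestamp in seconds
        Console.WriteLine("{0}\t{1:0.00}", r.Tokens[i], r.Timestamps[i]);
      }
    }
  }

  static float[] LoadSamples() => new float[16000]; // placeholder: 1 s of silence
}
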
@@ -9,6 +9,9 @@ rm -fv *.wav
 rm -rfv sherpa-onnx-pyannote-*

 cd ../offline-decode-files
+./run-moonshine.sh
+rm -rf sherpa-onnx-*
+
 ./run-sense-voice-ctc.sh
 rm -rf sherpa-onnx-*

@@ -17,7 +17,7 @@ class OfflineDecodeFiles
 {

   [Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
-  public int SampleRate { get; set; } = 16000;
+  public int SampleRate { get; set; } = 16000;

   [Option("feat-dim", Required = false, Default = 80, HelpText = "Dimension of the features used to train the model")]
   public int FeatureDim { get; set; } = 80;
@@ -31,7 +31,7 @@ class OfflineDecodeFiles
   [Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")]
   public string Decoder { get; set; } = "";

-  [Option(Required = false, Default = "",HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
+  [Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
   public string Joiner { get; set; } = "";

   [Option("model-type", Required = false, Default = "", HelpText = "model type")]
@@ -44,10 +44,22 @@ class OfflineDecodeFiles
   public string WhisperDecoder { get; set; } = "";

   [Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")]
-  public string WhisperLanguage{ get; set; } = "";
+  public string WhisperLanguage { get; set; } = "";

   [Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")]
-  public string WhisperTask{ get; set; } = "transcribe";
+  public string WhisperTask { get; set; } = "transcribe";
+
+  [Option("moonshine-preprocessor", Required = false, Default = "", HelpText = "Path to preprocess.onnx. Used only for Moonshine models")]
+  public string MoonshinePreprocessor { get; set; } = "";
+
+  [Option("moonshine-encoder", Required = false, Default = "", HelpText = "Path to encode.onnx. Used only for Moonshine models")]
+  public string MoonshineEncoder { get; set; } = "";
+
+  [Option("moonshine-uncached-decoder", Required = false, Default = "", HelpText = "Path to uncached_decode.onnx. Used only for Moonshine models")]
+  public string MoonshineUncachedDecoder { get; set; } = "";
+
+  [Option("moonshine-cached-decoder", Required = false, Default = "", HelpText = "Path to cached_decode.onnx. Used only for Moonshine models")]
+  public string MoonshineCachedDecoder { get; set; } = "";

   [Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")]
   public string TdnnModel { get; set; } = "";
@@ -90,7 +102,7 @@ It specifies number of active paths to keep during the search")]
   public float HotwordsScore { get; set; } = 1.5F;

   [Option("files", Required = true, HelpText = "Audio files for decoding")]
-  public IEnumerable<string> Files { get; set; } = new string[] {};
+  public IEnumerable<string> Files { get; set; } = new string[] { };
 }

 static void Main(string[] args)
@@ -236,6 +248,13 @@ to download pre-trained Tdnn models.
   config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel;
   config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn;
 }
+else if (!String.IsNullOrEmpty(options.MoonshinePreprocessor))
+{
+  config.ModelConfig.Moonshine.Preprocessor = options.MoonshinePreprocessor;
+  config.ModelConfig.Moonshine.Encoder = options.MoonshineEncoder;
+  config.ModelConfig.Moonshine.UncachedDecoder = options.MoonshineUncachedDecoder;
+  config.ModelConfig.Moonshine.CachedDecoder = options.MoonshineCachedDecoder;
+}
 else
 {
   Console.WriteLine("Please provide a model");
@@ -273,10 +292,21 @@ to download pre-trained Tdnn models.
 // display results
 for (int i = 0; i != files.Length; ++i)
 {
-  var text = streams[i].Result.Text;
+  var r = streams[i].Result;
   Console.WriteLine("--------------------");
   Console.WriteLine(files[i]);
-  Console.WriteLine(text);
+  Console.WriteLine("Text: {0}", r.Text);
+  Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
+  if (r.Timestamps != null && r.Timestamps.Length > 0) {
+    Console.Write("Timestamps: [");
+    var sep = "";
+    for (int k = 0; k != r.Timestamps.Length; ++k)
+    {
+      Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
+      sep = ", ";
+    }
+    Console.WriteLine("]");
+  }
 }
 Console.WriteLine("--------------------");
+#!/usr/bin/env bash
+
+set -ex
+
+if [ ! -f ./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt ]; then
+  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  tar xvf sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+  rm sherpa-onnx-moonshine-tiny-en-int8.tar.bz2
+fi
+
+dotnet run \
+  --num-threads=2 \
+  --moonshine-preprocessor=./sherpa-onnx-moonshine-tiny-en-int8/preprocess.onnx \
+  --moonshine-encoder=./sherpa-onnx-moonshine-tiny-en-int8/encode.int8.onnx \
+  --moonshine-uncached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/uncached_decode.int8.onnx \
+  --moonshine-cached-decoder=./sherpa-onnx-moonshine-tiny-en-int8/cached_decode.int8.onnx \
+  --tokens=./sherpa-onnx-moonshine-tiny-en-int8/tokens.txt \
+  --files ./sherpa-onnx-moonshine-tiny-en-int8/test_wavs/0.wav
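
Usage note: the CI hunk at the top invokes this script right after "cd ../offline-decode-files", so it is meant to be run from that dotnet example directory (the parent path is not shown in this diff). On the first run it downloads and unpacks the int8 Moonshine tiny English model; subsequent runs reuse the extracted files because of the tokens.txt check.
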
@@ -24,6 +24,7 @@ namespace SherpaOnnx
       BpeVocab = "";
       TeleSpeechCtc = "";
       SenseVoice = new OfflineSenseVoiceModelConfig();
+      Moonshine = new OfflineMoonshineModelConfig();
     }
     public OfflineTransducerModelConfig Transducer;
     public OfflineParaformerModelConfig Paraformer;
@@ -54,5 +55,6 @@ namespace SherpaOnnx
     public string TeleSpeechCtc;

     public OfflineSenseVoiceModelConfig SenseVoice;
+    public OfflineMoonshineModelConfig Moonshine;
   }
 }
+/// Copyright (c) 2024 Xiaomi Corporation (authors: Fangjun Kuang)
+
+using System.Runtime.InteropServices;
+
+namespace SherpaOnnx
+{
+  [StructLayout(LayoutKind.Sequential)]
+  public struct OfflineMoonshineModelConfig
+  {
+    public OfflineMoonshineModelConfig()
+    {
+      Preprocessor = "";
+      Encoder = "";
+      UncachedDecoder = "";
+      CachedDecoder = "";
+    }
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Preprocessor;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string Encoder;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string UncachedDecoder;
+
+    [MarshalAs(UnmanagedType.LPStr)]
+    public string CachedDecoder;
+  }
+}
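
A small aside on the explicit parameterless constructor (illustration only, not from the diff): with [MarshalAs(UnmanagedType.LPStr)] string fields, a default-initialized struct carries null strings, which would be marshaled to the native layer as NULL pointers, while the constructor keeps them as empty strings. That is also why the OfflineModelConfig constructor above assigns Moonshine = new OfflineMoonshineModelConfig().

// Sketch: new OfflineMoonshineModelConfig() runs the declared constructor
// (C# 10 parameterless struct constructors); `default` bypasses it.
var viaNew = new OfflineMoonshineModelConfig();     // viaNew.Preprocessor == ""
OfflineMoonshineModelConfig viaDefault = default;   // viaDefault.Preprocessor == null
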
@@ -31,17 +31,70 @@ namespace SherpaOnnx
       byte[] stringBuffer = new byte[length];
       Marshal.Copy(impl.Text, stringBuffer, 0, length);
       _text = Encoding.UTF8.GetString(stringBuffer);
+
+      _tokens = new String[impl.Count];
+
+      unsafe
+      {
+        byte* buf = (byte*)impl.Tokens;
+        for (int i = 0; i < impl.Count; i++)
+        {
+          length = 0;
+          byte* start = buf;
+          while (*buf != 0)
+          {
+            ++buf;
+            length += 1;
+          }
+          ++buf;
+
+          stringBuffer = new byte[length];
+          fixed (byte* pTarget = stringBuffer)
+          {
+            for (int k = 0; k < length; k++)
+            {
+              pTarget[k] = start[k];
+            }
+          }
+
+          _tokens[i] = Encoding.UTF8.GetString(stringBuffer);
+        }
+      }
+
+      unsafe
+      {
+        if (impl.Timestamps != IntPtr.Zero)
+        {
+          float *t = (float*)impl.Timestamps;
+          _timestamps = new float[impl.Count];
+          fixed (float* f = _timestamps)
+          {
+            for (int k = 0; k < impl.Count; k++)
+            {
+              f[k] = t[k];
+            }
+          }
+        }
+      }
+
     }

     [StructLayout(LayoutKind.Sequential)]
     struct Impl
     {
       public IntPtr Text;
+      public IntPtr Timestamps;
+      public int Count;
+      public IntPtr Tokens;
     }

     private String _text;
     public String Text => _text;
-  }

+    private String[] _tokens;
+    public String[] Tokens => _tokens;

+    private float[] _timestamps;
+    public float[] Timestamps => _timestamps;
+  }
 }
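
For context on the unsafe loop above (managed-only illustration, not from the commit): the native result packs the tokens into one buffer of NUL-terminated UTF-8 strings, with Count giving how many follow, and Timestamps points at Count floats (or is null when the model produces none). The same token splitting can be written without pointers; the packed buffer here is made up:

using System;
using System.Collections.Generic;
using System.Text;

class TokenBufferDemo
{
  static void Main()
  {
    // Stand-in for impl.Tokens: three tokens packed back to back, each
    // terminated by a 0 byte, exactly what the pointer walk above expects.
    byte[] packed = Encoding.UTF8.GetBytes("he\0llo\0 world\0");
    var tokens = new List<string>();
    int start = 0;
    for (int i = 0; i < packed.Length; ++i)
    {
      if (packed[i] == 0)
      {
        tokens.Add(Encoding.UTF8.GetString(packed, start, i - start));
        start = i + 1;
      }
    }
    Console.WriteLine(string.Join("|", tokens)); // prints: he|llo| world
  }
}
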