Michael Lamothe
Committed by GitHub

Upgraded to .NET 8 and made code style a little more internally consistent. (#1680)

Showing 29 changed files with 335 additions and 385 deletions
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
</PropertyGroup>
<ItemGroup>
... ...
... ... @@ -4,171 +4,166 @@ using System.IO;
using System.Runtime.InteropServices;
namespace SherpaOnnx
{
namespace SherpaOnnx;
// Header of a canonical WAVE (RIFF) file. The field layout must match the
// on-disk byte layout exactly, since WaveReader blits raw bytes into this
// struct via Marshal.PtrToStructure; hence LayoutKind.Sequential.
[StructLayout(LayoutKind.Sequential)]
public struct WaveHeader
{
    public int ChunkID;
    public int ChunkSize;
    public int Format;
    public int SubChunk1ID;
    public int SubChunk1Size;
    public short AudioFormat;
    public short NumChannels;
    public int SampleRate;
    public int ByteRate;
    public short BlockAlign;
    public short BitsPerSample;
    public int SubChunk2ID;
    public int SubChunk2Size;

    // Returns true if this header describes a WAVE file we support:
    // PCM (AudioFormat == 1), single channel, 16 bits per sample.
    // On failure it prints the offending field to the console and returns false.
    public bool Validate()
    {
        // F F I R ("RIFF" read as a little-endian int32)
        if (ChunkID != 0x46464952)
        {
            Console.WriteLine($"Invalid chunk ID: 0x{ChunkID:X}. Expect 0x46464952");
            return false;
        }

        // E V A W ("WAVE")
        if (Format != 0x45564157)
        {
            Console.WriteLine($"Invalid format: 0x{Format:X}. Expect 0x45564157");
            return false;
        }

        // t m f ("fmt ")
        if (SubChunk1ID != 0x20746d66)
        {
            Console.WriteLine($"Invalid SubChunk1ID: 0x{SubChunk1ID:X}. Expect 0x20746d66");
            return false;
        }

        // 16 is the fmt-chunk size for plain PCM
        if (SubChunk1Size != 16)
        {
            Console.WriteLine($"Invalid SubChunk1Size: {SubChunk1Size}. Expect 16");
            return false;
        }

        if (AudioFormat != 1)
        {
            Console.WriteLine($"Invalid AudioFormat: {AudioFormat}. Expect 1");
            return false;
        }

        if (NumChannels != 1)
        {
            Console.WriteLine($"Invalid NumChannels: {NumChannels}. Expect 1");
            return false;
        }

        if (ByteRate != (SampleRate * NumChannels * BitsPerSample / 8))
        {
            Console.WriteLine($"Invalid byte rate: {ByteRate}.");
            return false;
        }

        if (BlockAlign != (NumChannels * BitsPerSample / 8))
        {
            // Bug fix: the original message printed ByteRate here.
            Console.WriteLine($"Invalid block align: {BlockAlign}.");
            return false;
        }

        if (BitsPerSample != 16)
        { // we support only 16 bits per sample
            Console.WriteLine($"Invalid bits per sample: {BitsPerSample}. Expect 16");
            return false;
        }

        return true;
    }
}
// Reads a WAVE file from disk into normalized float samples.
// It supports only 16-bit, single channel WAVE format.
// The sample rate can be any value.
public class WaveReader
{
    // Opens and fully decodes the given WAVE file.
    // Throws ApplicationException if the file does not exist or its header
    // fails WaveHeader.Validate().
    public WaveReader(string fileName)
    {
        if (!File.Exists(fileName))
        {
            throw new ApplicationException($"{fileName} does not exist!");
        }

        using var stream = File.Open(fileName, FileMode.Open);
        using var reader = new BinaryReader(stream);

        _header = ReadHeader(reader);

        if (!_header.Validate())
        {
            // Bug fix: the original wrote $"... ${fileName}", which printed a
            // literal '$' before the file name.
            throw new ApplicationException($"Invalid wave file {fileName}");
        }

        SkipMetaData(reader);

        // now read samples
        // _header.SubChunk2Size contains number of bytes in total.
        // we assume each sample is of type int16
        var buffer = reader.ReadBytes(_header.SubChunk2Size);
        var samples_int16 = new short[_header.SubChunk2Size / 2];
        Buffer.BlockCopy(buffer, 0, samples_int16, 0, buffer.Length);

        _samples = new float[samples_int16.Length];
        for (var i = 0; i < samples_int16.Length; ++i)
        {
            // Normalize int16 range [-32768, 32767] to [-1, 1)
            _samples[i] = samples_int16[i] / 32768.0F;
        }
    }

    // Blits sizeof(WaveHeader) raw bytes from the stream into a WaveHeader.
    private static WaveHeader ReadHeader(BinaryReader reader)
    {
        var bytes = reader.ReadBytes(Marshal.SizeOf(typeof(WaveHeader)));

        // Pin the byte array so its address is stable for PtrToStructure;
        // try/finally guarantees the handle is freed even if marshalling throws.
        GCHandle handle = GCHandle.Alloc(bytes, GCHandleType.Pinned);
        try
        {
            return (WaveHeader)Marshal.PtrToStructure(handle.AddrOfPinnedObject(), typeof(WaveHeader))!;
        }
        finally
        {
            handle.Free();
        }
    }

    // Skips any non-"data" chunks (e.g. LIST metadata) until the "data" chunk
    // is found, then updates _header.SubChunk2ID/SubChunk2Size accordingly.
    private void SkipMetaData(BinaryReader reader)
    {
        var bs = reader.BaseStream;

        var subChunk2ID = _header.SubChunk2ID;
        var subChunk2Size = _header.SubChunk2Size;

        // a t a d ("data" read as a little-endian int32)
        while (bs.Position != bs.Length && subChunk2ID != 0x61746164)
        {
            bs.Seek(subChunk2Size, SeekOrigin.Current);
            subChunk2ID = reader.ReadInt32();
            subChunk2Size = reader.ReadInt32();
        }

        _header.SubChunk2ID = subChunk2ID;
        _header.SubChunk2Size = subChunk2Size;
    }

    private WaveHeader _header;

    // Samples are normalized to the range [-1, 1]
    private float[] _samples;

    public int SampleRate => _header.SampleRate;

    public float[] Samples => _samples;

    // Simple smoke test: decode the file and print its sample count and rate.
    public static void Test(string fileName)
    {
        var reader = new WaveReader(fileName);
        Console.WriteLine($"samples length: {reader.Samples.Length}");
        Console.WriteLine($"samples rate: {reader.SampleRate}");
    }
}
... ...
... ... @@ -13,8 +13,6 @@
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
class KeywordSpotterDemo
{
... ... @@ -38,11 +36,11 @@ class KeywordSpotterDemo
var filename = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav";
WaveReader waveReader = new WaveReader(filename);
var waveReader = new WaveReader(filename);
Console.WriteLine("----------Use pre-defined keywords----------");
OnlineStream s = kws.CreateStream();
var s = kws.CreateStream();
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
... ... @@ -53,7 +51,7 @@ class KeywordSpotterDemo
{
kws.Decode(s);
var result = kws.GetResult(s);
if (result.Keyword != "")
if (result.Keyword != string.Empty)
{
Console.WriteLine("Detected: {0}", result.Keyword);
}
... ... @@ -70,7 +68,7 @@ class KeywordSpotterDemo
{
kws.Decode(s);
var result = kws.GetResult(s);
if (result.Keyword != "")
if (result.Keyword != string.Empty)
{
Console.WriteLine("Detected: {0}", result.Keyword);
}
... ... @@ -89,7 +87,7 @@ class KeywordSpotterDemo
{
kws.Decode(s);
var result = kws.GetResult(s);
if (result.Keyword != "")
if (result.Keyword != string.Empty)
{
Console.WriteLine("Detected: {0}", result.Keyword);
}
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>keyword_spotting_from_files</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -12,12 +12,9 @@
//
// dotnet run
using PortAudioSharp;
using SherpaOnnx;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System;
using PortAudioSharp;
class KeywordSpotterDemo
{
... ... @@ -41,11 +38,11 @@ class KeywordSpotterDemo
var filename = "./sherpa-onnx-kws-zipformer-wenetspeech-3.3M-2024-01-01/test_wavs/3.wav";
WaveReader waveReader = new WaveReader(filename);
var waveReader = new WaveReader(filename);
Console.WriteLine("----------Use pre-defined keywords----------");
OnlineStream s = kws.CreateStream();
var s = kws.CreateStream();
Console.WriteLine(PortAudio.VersionInfo.versionText);
PortAudio.Initialize();
... ... @@ -54,7 +51,7 @@ class KeywordSpotterDemo
for (int i = 0; i != PortAudio.DeviceCount; ++i)
{
Console.WriteLine($" Device {i}");
DeviceInfo deviceInfo = PortAudio.GetDeviceInfo(i);
var deviceInfo = PortAudio.GetDeviceInfo(i);
Console.WriteLine($" Name: {deviceInfo.name}");
Console.WriteLine($" Max input channels: {deviceInfo.maxInputChannels}");
Console.WriteLine($" Default sample rate: {deviceInfo.defaultSampleRate}");
... ... @@ -66,12 +63,12 @@ class KeywordSpotterDemo
Environment.Exit(1);
}
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
var info = PortAudio.GetDeviceInfo(deviceIndex);
Console.WriteLine();
Console.WriteLine($"Use default device {deviceIndex} ({info.name})");
StreamParameters param = new StreamParameters();
var param = new StreamParameters();
param.device = deviceIndex;
param.channelCount = 1;
param.sampleFormat = SampleFormat.Float32;
... ... @@ -79,21 +76,21 @@ class KeywordSpotterDemo
param.hostApiSpecificStreamInfo = IntPtr.Zero;
PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
UInt32 frameCount,
uint frameCount,
ref StreamCallbackTimeInfo timeInfo,
StreamCallbackFlags statusFlags,
IntPtr userData
) =>
{
float[] samples = new float[frameCount];
Marshal.Copy(input, samples, 0, (Int32)frameCount);
var samples = new float[frameCount];
Marshal.Copy(input, samples, 0, (int)frameCount);
s.AcceptWaveform(config.FeatConfig.SampleRate, samples);
return StreamCallbackResult.Continue;
};
PortAudioSharp.Stream stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: config.FeatConfig.SampleRate,
var stream = new PortAudioSharp.Stream(inParams: param, outParams: null, sampleRate: config.FeatConfig.SampleRate,
framesPerBuffer: 0,
streamFlags: StreamFlags.ClipOff,
callback: callback,
... ... @@ -113,15 +110,13 @@ class KeywordSpotterDemo
}
var result = kws.GetResult(s);
if (result.Keyword != "")
if (result.Keyword != string.Empty)
{
Console.WriteLine("Detected: {0}", result.Keyword);
}
Thread.Sleep(200); // ms
}
PortAudio.Terminate();
}
}
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>keyword_spotting_from_microphone</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -5,17 +5,14 @@
// Please refer to
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html
// to download non-streaming models
using CommandLine.Text;
using CommandLine;
using CommandLine.Text;
using SherpaOnnx;
using System.Collections.Generic;
using System;
class OfflineDecodeFiles
{
class Options
{
[Option("sample-rate", Required = false, Default = 16000, HelpText = "Sample rate of the data used to train the model")]
public int SampleRate { get; set; } = 16000;
... ... @@ -23,58 +20,58 @@ class OfflineDecodeFiles
public int FeatureDim { get; set; } = 80;
[Option(Required = false, HelpText = "Path to tokens.txt")]
public string Tokens { get; set; } = "";
public string Tokens { get; set; } = string.Empty;
[Option(Required = false, Default = "", HelpText = "Path to transducer encoder.onnx. Used only for transducer models")]
public string Encoder { get; set; } = "";
public string Encoder { get; set; } = string.Empty;
[Option(Required = false, Default = "", HelpText = "Path to transducer decoder.onnx. Used only for transducer models")]
public string Decoder { get; set; } = "";
public string Decoder { get; set; } = string.Empty;
[Option(Required = false, Default = "", HelpText = "Path to transducer joiner.onnx. Used only for transducer models")]
public string Joiner { get; set; } = "";
public string Joiner { get; set; } = string.Empty;
[Option("model-type", Required = false, Default = "", HelpText = "model type")]
public string ModelType { get; set; } = "";
public string ModelType { get; set; } = string.Empty;
[Option("whisper-encoder", Required = false, Default = "", HelpText = "Path to whisper encoder.onnx. Used only for whisper models")]
public string WhisperEncoder { get; set; } = "";
public string WhisperEncoder { get; set; } = string.Empty;
[Option("whisper-decoder", Required = false, Default = "", HelpText = "Path to whisper decoder.onnx. Used only for whisper models")]
public string WhisperDecoder { get; set; } = "";
public string WhisperDecoder { get; set; } = string.Empty;
[Option("whisper-language", Required = false, Default = "", HelpText = "Language of the input file. Can be empty")]
public string WhisperLanguage { get; set; } = "";
public string WhisperLanguage { get; set; } = string.Empty;
[Option("whisper-task", Required = false, Default = "transcribe", HelpText = "transcribe or translate")]
public string WhisperTask { get; set; } = "transcribe";
[Option("moonshine-preprocessor", Required = false, Default = "", HelpText = "Path to preprocess.onnx. Used only for Moonshine models")]
public string MoonshinePreprocessor { get; set; } = "";
public string MoonshinePreprocessor { get; set; } = string.Empty;
[Option("moonshine-encoder", Required = false, Default = "", HelpText = "Path to encode.onnx. Used only for Moonshine models")]
public string MoonshineEncoder { get; set; } = "";
public string MoonshineEncoder { get; set; } = string.Empty;
[Option("moonshine-uncached-decoder", Required = false, Default = "", HelpText = "Path to uncached_decode.onnx. Used only for Moonshine models")]
public string MoonshineUncachedDecoder { get; set; } = "";
public string MoonshineUncachedDecoder { get; set; } = string.Empty;
[Option("moonshine-cached-decoder", Required = false, Default = "", HelpText = "Path to cached_decode.onnx. Used only for Moonshine models")]
public string MoonshineCachedDecoder { get; set; } = "";
public string MoonshineCachedDecoder { get; set; } = string.Empty;
[Option("tdnn-model", Required = false, Default = "", HelpText = "Path to tdnn yesno model")]
public string TdnnModel { get; set; } = "";
public string TdnnModel { get; set; } = string.Empty;
[Option(Required = false, HelpText = "Path to model.onnx. Used only for paraformer models")]
public string Paraformer { get; set; } = "";
public string Paraformer { get; set; } = string.Empty;
[Option("nemo-ctc", Required = false, HelpText = "Path to model.onnx. Used only for NeMo CTC models")]
public string NeMoCtc { get; set; } = "";
public string NeMoCtc { get; set; } = string.Empty;
[Option("telespeech-ctc", Required = false, HelpText = "Path to model.onnx. Used only for TeleSpeech CTC models")]
public string TeleSpeechCtc { get; set; } = "";
public string TeleSpeechCtc { get; set; } = string.Empty;
[Option("sense-voice-model", Required = false, HelpText = "Path to model.onnx. Used only for SenseVoice CTC models")]
public string SenseVoiceModel { get; set; } = "";
public string SenseVoiceModel { get; set; } = string.Empty;
[Option("sense-voice-use-itn", Required = false, HelpText = "1 to use inverse text normalization for sense voice.")]
public int SenseVoiceUseItn { get; set; } = 1;
... ... @@ -88,7 +85,7 @@ class OfflineDecodeFiles
[Option("rule-fsts", Required = false, Default = "",
HelpText = "If not empty, path to rule fst for inverse text normalization")]
public string RuleFsts { get; set; } = "";
public string RuleFsts { get; set; } = string.Empty;
[Option("max-active-paths", Required = false, Default = 4,
HelpText = @"Used only when --decoding--method is modified_beam_search.
... ... @@ -96,7 +93,7 @@ It specifies number of active paths to keep during the search")]
public int MaxActivePaths { get; set; } = 4;
[Option("hotwords-file", Required = false, Default = "", HelpText = "Path to hotwords.txt")]
public string HotwordsFile { get; set; } = "";
public string HotwordsFile { get; set; } = string.Empty;
[Option("hotwords-score", Required = false, Default = 1.5F, HelpText = "hotwords score")]
public float HotwordsScore { get; set; } = 1.5F;
... ... @@ -117,7 +114,7 @@ It specifies number of active paths to keep during the search")]
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
{
string usage = @"
var usage = @"
# Zipformer
dotnet run \
... ... @@ -213,42 +210,42 @@ to download pre-trained Tdnn models.
config.ModelConfig.Tokens = options.Tokens;
if (!String.IsNullOrEmpty(options.Encoder))
if (!string.IsNullOrEmpty(options.Encoder))
{
// this is a transducer model
config.ModelConfig.Transducer.Encoder = options.Encoder;
config.ModelConfig.Transducer.Decoder = options.Decoder;
config.ModelConfig.Transducer.Joiner = options.Joiner;
}
else if (!String.IsNullOrEmpty(options.Paraformer))
else if (!string.IsNullOrEmpty(options.Paraformer))
{
config.ModelConfig.Paraformer.Model = options.Paraformer;
}
else if (!String.IsNullOrEmpty(options.NeMoCtc))
else if (!string.IsNullOrEmpty(options.NeMoCtc))
{
config.ModelConfig.NeMoCtc.Model = options.NeMoCtc;
}
else if (!String.IsNullOrEmpty(options.TeleSpeechCtc))
else if (!string.IsNullOrEmpty(options.TeleSpeechCtc))
{
config.ModelConfig.TeleSpeechCtc = options.TeleSpeechCtc;
}
else if (!String.IsNullOrEmpty(options.WhisperEncoder))
else if (!string.IsNullOrEmpty(options.WhisperEncoder))
{
config.ModelConfig.Whisper.Encoder = options.WhisperEncoder;
config.ModelConfig.Whisper.Decoder = options.WhisperDecoder;
config.ModelConfig.Whisper.Language = options.WhisperLanguage;
config.ModelConfig.Whisper.Task = options.WhisperTask;
}
else if (!String.IsNullOrEmpty(options.TdnnModel))
else if (!string.IsNullOrEmpty(options.TdnnModel))
{
config.ModelConfig.Tdnn.Model = options.TdnnModel;
}
else if (!String.IsNullOrEmpty(options.SenseVoiceModel))
else if (!string.IsNullOrEmpty(options.SenseVoiceModel))
{
config.ModelConfig.SenseVoice.Model = options.SenseVoiceModel;
config.ModelConfig.SenseVoice.UseInverseTextNormalization = options.SenseVoiceUseItn;
}
else if (!String.IsNullOrEmpty(options.MoonshinePreprocessor))
else if (!string.IsNullOrEmpty(options.MoonshinePreprocessor))
{
config.ModelConfig.Moonshine.Preprocessor = options.MoonshinePreprocessor;
config.ModelConfig.Moonshine.Encoder = options.MoonshineEncoder;
... ... @@ -270,17 +267,17 @@ to download pre-trained Tdnn models.
config.ModelConfig.Debug = 0;
OfflineRecognizer recognizer = new OfflineRecognizer(config);
var recognizer = new OfflineRecognizer(config);
string[] files = options.Files.ToArray();
var files = options.Files.ToArray();
// We create a separate stream for each file
List<OfflineStream> streams = new List<OfflineStream>();
var streams = new List<OfflineStream>();
streams.EnsureCapacity(files.Length);
for (int i = 0; i != files.Length; ++i)
{
OfflineStream s = recognizer.CreateStream();
var s = recognizer.CreateStream();
WaveReader waveReader = new WaveReader(files[i]);
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
... ... @@ -299,7 +296,7 @@ to download pre-trained Tdnn models.
Console.WriteLine("Tokens: [{0}]", string.Join(", ", r.Tokens));
if (r.Timestamps != null && r.Timestamps.Length > 0) {
Console.Write("Timestamps: [");
var sep = "";
var sep = string.Empty;
for (int k = 0; k != r.Timestamps.Length; ++k)
{
Console.Write("{0}{1}", sep, r.Timestamps[k].ToString("0.00"));
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>offline_decode_files</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -12,8 +12,6 @@
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
class OfflinePunctuationDemo
{
... ... @@ -25,14 +23,14 @@ class OfflinePunctuationDemo
config.Model.NumThreads = 1;
var punct = new OfflinePunctuation(config);
string[] textList = new string[] {
var textList = new string[] {
"这是一个测试你好吗How are you我很好thank you are you ok谢谢你",
"我们都是木头人不会说话不会动",
"The African blogosphere is rapidly expanding bringing more voices online in the form of commentaries opinions analyses rants and poetry",
};
Console.WriteLine("---------");
foreach (string text in textList)
foreach (var text in textList)
{
string textWithPunct = punct.AddPunct(text);
Console.WriteLine("Input text: {0}", text);
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>offline_punctuation</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -34,7 +34,6 @@ Step 4. Run it
*/
using SherpaOnnx;
using System;
class OfflineSpeakerDiarizationDemo
{
... ... @@ -54,7 +53,7 @@ class OfflineSpeakerDiarizationDemo
var sd = new OfflineSpeakerDiarization(config);
var testWaveFile = "./0-four-speakers-zh.wav";
WaveReader waveReader = new WaveReader(testWaveFile);
var waveReader = new WaveReader(testWaveFile);
if (sd.SampleRate != waveReader.SampleRate)
{
Console.WriteLine($"Expected sample rate: {sd.SampleRate}. Given: {waveReader.SampleRate}");
... ... @@ -65,19 +64,19 @@ class OfflineSpeakerDiarizationDemo
// var segments = sd.Process(waveReader.Samples); // this one is also ok
var MyProgressCallback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) =>
var progressCallback = (int numProcessedChunks, int numTotalChunks, IntPtr arg) =>
{
float progress = 100.0F * numProcessedChunks / numTotalChunks;
Console.WriteLine("Progress {0}%", String.Format("{0:0.00}", progress));
var progress = 100.0F * numProcessedChunks / numTotalChunks;
Console.WriteLine("Progress {0}%", string.Format("{0:0.00}", progress));
return 0;
};
var callback = new OfflineSpeakerDiarizationProgressCallback(MyProgressCallback);
var callback = new OfflineSpeakerDiarizationProgressCallback(progressCallback);
var segments = sd.ProcessWithCallback(waveReader.Samples, callback, IntPtr.Zero);
foreach (var s in segments)
{
Console.WriteLine("{0} -- {1} speaker_{2}", String.Format("{0:0.00}", s.Start), String.Format("{0:0.00}", s.End), s.Speaker);
Console.WriteLine("{0} -- {1} speaker_{2}", string.Format("{0:0.00}", s.Start), string.Format("{0:0.00}", s.End), s.Speaker);
}
}
}
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>offline_speaker_diarization</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -10,15 +10,12 @@
// Note that you need a speaker to run this file since it will play
// the generated audio as it is generating.
using CommandLine.Text;
using CommandLine;
using CommandLine.Text;
using PortAudioSharp;
using SherpaOnnx;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Threading;
using System;
class OfflineTtsPlayDemo
{
... ... @@ -26,13 +23,13 @@ class OfflineTtsPlayDemo
{
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
public string RuleFsts { get; set; }
public string? RuleFsts { get; set; }
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string DictDir { get; set; }
public string? DictDir { get; set; }
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string DataDir { get; set; }
public string? DataDir { get; set; }
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; }
... ... @@ -44,10 +41,10 @@ class OfflineTtsPlayDemo
public float NoiseScaleW { get; set; }
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string Lexicon { get; set; }
public string? Lexicon { get; set; }
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
public string Tokens { get; set; }
public string? Tokens { get; set; }
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
public int MaxNumSentences { get; set; }
... ... @@ -56,16 +53,16 @@ class OfflineTtsPlayDemo
public int Debug { get; set; }
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
public string Model { get; set; }
public string? Model { get; set; }
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
public int SpeakerId { get; set; }
[Option("text", Required = true, HelpText = "Text to synthesize")]
public string Text { get; set; }
public string? Text { get; set; }
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
public string OutputFilename { get; set; }
public string? OutputFilename { get; set; }
}
static void Main(string[] args)
... ... @@ -124,10 +121,9 @@ to download more models.
Console.WriteLine(helpText);
}
private static void Run(Options options)
{
OfflineTtsConfig config = new OfflineTtsConfig();
var config = new OfflineTtsConfig();
config.Model.Vits.Model = options.Model;
config.Model.Vits.Lexicon = options.Lexicon;
config.Model.Vits.Tokens = options.Tokens;
... ... @@ -142,10 +138,9 @@ to download more models.
config.RuleFsts = options.RuleFsts;
config.MaxNumSentences = options.MaxNumSentences;
OfflineTts tts = new OfflineTts(config);
float speed = 1.0f / options.LengthScale;
int sid = options.SpeakerId;
var tts = new OfflineTts(config);
var speed = 1.0f / options.LengthScale;
var sid = options.SpeakerId;
Console.WriteLine(PortAudio.VersionInfo.versionText);
PortAudio.Initialize();
... ... @@ -166,11 +161,11 @@ to download more models.
Environment.Exit(1);
}
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
var info = PortAudio.GetDeviceInfo(deviceIndex);
Console.WriteLine();
Console.WriteLine($"Use output default device {deviceIndex} ({info.name})");
StreamParameters param = new StreamParameters();
var param = new StreamParameters();
param.device = deviceIndex;
param.channelCount = 1;
param.sampleFormat = SampleFormat.Float32;
... ... @@ -178,7 +173,7 @@ to download more models.
param.hostApiSpecificStreamInfo = IntPtr.Zero;
// https://learn.microsoft.com/en-us/dotnet/standard/collections/thread-safe/blockingcollection-overview
BlockingCollection<float[]> dataItems = new BlockingCollection<float[]>();
var dataItems = new BlockingCollection<float[]>();
var MyCallback = (IntPtr samples, int n) =>
{
... ... @@ -193,9 +188,9 @@ to download more models.
return 1;
};
bool playFinished = false;
var playFinished = false;
float[] lastSampleArray = null;
float[]? lastSampleArray = null;
int lastIndex = 0; // not played
PortAudioSharp.Stream.Callback playCallback = (IntPtr input, IntPtr output,
... ... @@ -270,10 +265,10 @@ to download more models.
stream.Start();
OfflineTtsCallback callback = new OfflineTtsCallback(MyCallback);
var callback = new OfflineTtsCallback(MyCallback);
OfflineTtsGeneratedAudio audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
bool ok = audio.SaveToWaveFile(options.OutputFilename);
var audio = tts.GenerateWithCallback(options.Text, speed, sid, callback);
var ok = audio.SaveToWaveFile(options.OutputFilename);
if (ok)
{
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>offline_tts_play</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -6,28 +6,25 @@
// and
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
// to download pre-trained models
using CommandLine.Text;
using CommandLine;
using CommandLine.Text;
using SherpaOnnx;
using System.Collections.Generic;
using System;
class OfflineTtsDemo
{
class Options
{
[Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")]
public string RuleFsts { get; set; } = "";
public string RuleFsts { get; set; } = string.Empty;
[Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")]
public string RuleFars { get; set; } = "";
public string RuleFars { get; set; } = string.Empty;
[Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")]
public string DictDir { get; set; } = "";
public string DictDir { get; set; } = string.Empty;
[Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")]
public string DataDir { get; set; } = "";
public string DataDir { get; set; } = string.Empty;
[Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")]
public float LengthScale { get; set; } = 1;
... ... @@ -39,10 +36,10 @@ class OfflineTtsDemo
public float NoiseScaleW { get; set; } = 0.8F;
[Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")]
public string Lexicon { get; set; } = "";
public string Lexicon { get; set; } = string.Empty;
[Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")]
public string Tokens { get; set; } = "";
public string Tokens { get; set; } = string.Empty;
[Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")]
public int MaxNumSentences { get; set; } = 1;
... ... @@ -51,13 +48,13 @@ class OfflineTtsDemo
public int Debug { get; set; } = 0;
[Option("vits-model", Required = true, HelpText = "Path to VITS model")]
public string Model { get; set; } = "";
public string Model { get; set; } = string.Empty;
[Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")]
public int SpeakerId { get; set; } = 0;
[Option("text", Required = true, HelpText = "Text to synthesize")]
public string Text { get; set; } = "";
public string Text { get; set; } = string.Empty;
[Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")]
public string OutputFilename { get; set; } = "./generated.wav";
... ... @@ -65,7 +62,7 @@ class OfflineTtsDemo
static void Main(string[] args)
{
var parser = new CommandLine.Parser(with => with.HelpWriter = null);
var parser = new Parser(with => with.HelpWriter = null);
var parserResult = parser.ParseArguments<Options>(args);
parserResult
... ... @@ -75,7 +72,7 @@ class OfflineTtsDemo
private static void DisplayHelp<T>(ParserResult<T> result, IEnumerable<Error> errs)
{
string usage = @"
var usage = @"
# vits-aishell3
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
... ... @@ -122,7 +119,7 @@ to download more models.
private static void Run(Options options)
{
OfflineTtsConfig config = new OfflineTtsConfig();
var config = new OfflineTtsConfig();
config.Model.Vits.Model = options.Model;
config.Model.Vits.Lexicon = options.Lexicon;
config.Model.Vits.Tokens = options.Tokens;
... ... @@ -138,11 +135,11 @@ to download more models.
config.RuleFars = options.RuleFars;
config.MaxNumSentences = options.MaxNumSentences;
OfflineTts tts = new OfflineTts(config);
float speed = 1.0f / options.LengthScale;
int sid = options.SpeakerId;
OfflineTtsGeneratedAudio audio = tts.Generate(options.Text, speed, sid);
bool ok = audio.SaveToWaveFile(options.OutputFilename);
var tts = new OfflineTts(config);
var speed = 1.0f / options.LengthScale;
var sid = options.SpeakerId;
var audio = tts.Generate(options.Text, speed, sid);
var ok = audio.SaveToWaveFile(options.OutputFilename);
if (ok)
{
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>offline_tts</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -6,40 +6,37 @@
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
// to download streaming models
using CommandLine.Text;
using CommandLine;
using CommandLine.Text;
using SherpaOnnx;
using System.Collections.Generic;
using System.Linq;
using System;
class OnlineDecodeFiles
{
class Options
{
[Option(Required = true, HelpText = "Path to tokens.txt")]
public string Tokens { get; set; } = "";
public string Tokens { get; set; } = string.Empty;
[Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
public string Provider { get; set; } = "";
public string Provider { get; set; } = string.Empty;
[Option(Required = false, HelpText = "Path to transducer encoder.onnx")]
public string Encoder { get; set; } = "";
public string Encoder { get; set; } = string.Empty;
[Option(Required = false, HelpText = "Path to transducer decoder.onnx")]
public string Decoder { get; set; } = "";
public string Decoder { get; set; } = string.Empty;
[Option(Required = false, HelpText = "Path to transducer joiner.onnx")]
public string Joiner { get; set; } = "";
public string Joiner { get; set; } = string.Empty;
[Option("paraformer-encoder", Required = false, HelpText = "Path to paraformer encoder.onnx")]
public string ParaformerEncoder { get; set; } = "";
public string ParaformerEncoder { get; set; } = string.Empty;
[Option("paraformer-decoder", Required = false, HelpText = "Path to paraformer decoder.onnx")]
public string ParaformerDecoder { get; set; } = "";
public string ParaformerDecoder { get; set; } = string.Empty;
[Option("zipformer2-ctc", Required = false, HelpText = "Path to zipformer2 CTC onnx model")]
public string Zipformer2Ctc { get; set; } = "";
public string Zipformer2Ctc { get; set; } = string.Empty;
[Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
public int NumThreads { get; set; } = 1;
... ... @@ -80,15 +77,14 @@ larger than this value. Used only when --enable-endpoint is true.")]
public float Rule3MinUtteranceLength { get; set; } = 20.0F;
[Option("hotwords-file", Required = false, Default = "", HelpText = "Path to hotwords.txt")]
public string HotwordsFile { get; set; } = "";
public string HotwordsFile { get; set; } = string.Empty;
[Option("hotwords-score", Required = false, Default = 1.5F, HelpText = "hotwords score")]
public float HotwordsScore { get; set; } = 1.5F;
[Option("rule-fsts", Required = false, Default = "",
HelpText = "If not empty, path to rule fst for inverse text normalization")]
public string RuleFsts { get; set; } = "";
public string RuleFsts { get; set; } = string.Empty;
[Option("files", Required = true, HelpText = "Audio files for decoding")]
public IEnumerable<string> Files { get; set; } = new string[] {};
... ... @@ -162,7 +158,7 @@ to download pre-trained streaming models.
private static void Run(Options options)
{
OnlineRecognizerConfig config = new OnlineRecognizerConfig();
var config = new OnlineRecognizerConfig();
config.FeatConfig.SampleRate = options.SampleRate;
// All models from icefall using feature dim 80.
... ... @@ -194,22 +190,22 @@ to download pre-trained streaming models.
config.HotwordsScore = options.HotwordsScore;
config.RuleFsts = options.RuleFsts;
OnlineRecognizer recognizer = new OnlineRecognizer(config);
var recognizer = new OnlineRecognizer(config);
string[] files = options.Files.ToArray();
var files = options.Files.ToArray();
// We create a separate stream for each file
List<OnlineStream> streams = new List<OnlineStream>();
var streams = new List<OnlineStream>();
streams.EnsureCapacity(files.Length);
for (int i = 0; i != files.Length; ++i)
{
OnlineStream s = recognizer.CreateStream();
var s = recognizer.CreateStream();
WaveReader waveReader = new WaveReader(files[i]);
var waveReader = new WaveReader(files[i]);
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
s.AcceptWaveform(waveReader.SampleRate, tailPadding);
s.InputFinished();
... ... @@ -230,7 +226,7 @@ to download pre-trained streaming models.
// display results
for (int i = 0; i != files.Length; ++i)
{
OnlineRecognizerResult r = recognizer.GetResult(streams[i]);
var r = recognizer.GetResult(streams[i]);
var text = r.Text;
var tokens = r.Tokens;
Console.WriteLine("--------------------");
... ... @@ -238,7 +234,7 @@ to download pre-trained streaming models.
Console.WriteLine("text: {0}", text);
Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens));
Console.Write("timestamps: [");
r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", "));
r.Timestamps.ToList().ForEach(i => Console.Write(string.Format("{0:0.00}", i) + ", "));
Console.WriteLine("]");
}
Console.WriteLine("--------------------");
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>online_decode_files</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -29,9 +29,7 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-files
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "keyword-spotting-from-microphone", "keyword-spotting-from-microphone\keyword-spotting-from-microphone.csproj", "{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "TTS", "TTS\TTS.csproj", "{DACE4A18-4FC8-4437-92BF-5A90BA81286C}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "offline-speaker-diarization", "offline-speaker-diarization\offline-speaker-diarization.csproj", "{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
... ... @@ -91,10 +89,6 @@ Global
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Release|Any CPU.ActiveCfg = Release|Any CPU
{AEE0ED2B-C86F-4952-863C-EAD3219CB4EC}.Release|Any CPU.Build.0 = Release|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Debug|Any CPU.Build.0 = Debug|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.ActiveCfg = Release|Any CPU
{DACE4A18-4FC8-4437-92BF-5A90BA81286C}.Release|Any CPU.Build.0 = Release|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Debug|Any CPU.Build.0 = Debug|Any CPU
{D3A1FF28-A77D-429D-AEAC-2BA77CA682BC}.Release|Any CPU.ActiveCfg = Release|Any CPU
... ...
... ... @@ -16,20 +16,18 @@
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
class SpeakerIdentificationDemo
{
public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, String filename)
public static float[] ComputeEmbedding(SpeakerEmbeddingExtractor extractor, string filename)
{
WaveReader reader = new WaveReader(filename);
var reader = new WaveReader(filename);
OnlineStream stream = extractor.CreateStream();
var stream = extractor.CreateStream();
stream.AcceptWaveform(reader.SampleRate, reader.Samples);
stream.InputFinished();
float[] embedding = extractor.Compute(stream);
var embedding = extractor.Compute(stream);
return embedding;
}
... ... @@ -43,25 +41,25 @@ class SpeakerIdentificationDemo
var manager = new SpeakerEmbeddingManager(extractor.Dim);
string[] spk1Files =
var spk1Files =
new string[] {
"./sr-data/enroll/fangjun-sr-1.wav",
"./sr-data/enroll/fangjun-sr-2.wav",
"./sr-data/enroll/fangjun-sr-3.wav",
};
float[][] spk1Vec = new float[spk1Files.Length][];
var spk1Vec = new float[spk1Files.Length][];
for (int i = 0; i < spk1Files.Length; ++i)
{
spk1Vec[i] = ComputeEmbedding(extractor, spk1Files[i]);
}
string[] spk2Files =
var spk2Files =
new string[] {
"./sr-data/enroll/leijun-sr-1.wav", "./sr-data/enroll/leijun-sr-2.wav",
};
float[][] spk2Vec = new float[spk2Files.Length][];
var spk2Vec = new float[spk2Files.Length][];
for (int i = 0; i < spk2Files.Length; ++i)
{
... ... @@ -100,14 +98,14 @@ class SpeakerIdentificationDemo
Console.WriteLine("---All speakers---");
string[] allSpeakers = manager.GetAllSpeakers();
var allSpeakers = manager.GetAllSpeakers();
foreach (var s in allSpeakers)
{
Console.WriteLine(s);
}
Console.WriteLine("------------");
string[] testFiles =
var testFiles =
new string[] {
"./sr-data/test/fangjun-test-sr-1.wav",
"./sr-data/test/leijun-test-sr-1.wav",
... ... @@ -117,9 +115,9 @@ class SpeakerIdentificationDemo
float threshold = 0.6f;
foreach (var file in testFiles)
{
float[] embedding = ComputeEmbedding(extractor, file);
var embedding = ComputeEmbedding(extractor, file);
String name = manager.Search(embedding, threshold);
var name = manager.Search(embedding, threshold);
if (name == "")
{
name = "<Unknown>";
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>speaker_identification</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -6,47 +6,43 @@
// https://k2-fsa.github.io/sherpa/onnx/pretrained_models/online-transducer/zipformer-transducer-models.html
// to download streaming models
using CommandLine.Text;
using CommandLine;
using CommandLine.Text;
using PortAudioSharp;
using System.Threading;
using SherpaOnnx;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System;
class SpeechRecognitionFromMicrophone
{
class Options
{
[Option(Required = true, HelpText = "Path to tokens.txt")]
public string Tokens { get; set; }
public string? Tokens { get; set; }
[Option(Required = false, Default = "cpu", HelpText = "Provider, e.g., cpu, coreml")]
public string Provider { get; set; }
public string? Provider { get; set; }
[Option(Required = false, HelpText = "Path to transducer encoder.onnx")]
public string Encoder { get; set; }
public string? Encoder { get; set; }
[Option(Required = false, HelpText = "Path to transducer decoder.onnx")]
public string Decoder { get; set; }
public string? Decoder { get; set; }
[Option(Required = false, HelpText = "Path to transducer joiner.onnx")]
public string Joiner { get; set; }
public string? Joiner { get; set; }
[Option("paraformer-encoder", Required = false, HelpText = "Path to paraformer encoder.onnx")]
public string ParaformerEncoder { get; set; }
public string? ParaformerEncoder { get; set; }
[Option("paraformer-decoder", Required = false, HelpText = "Path to paraformer decoder.onnx")]
public string ParaformerDecoder { get; set; }
public string? ParaformerDecoder { get; set; }
[Option("num-threads", Required = false, Default = 1, HelpText = "Number of threads for computation")]
public int NumThreads { get; set; }
[Option("decoding-method", Required = false, Default = "greedy_search",
HelpText = "Valid decoding methods are: greedy_search, modified_beam_search")]
public string DecodingMethod { get; set; }
public string? DecodingMethod { get; set; }
[Option(Required = false, Default = false, HelpText = "True to show model info during loading")]
public bool Debug { get; set; }
... ... @@ -126,7 +122,7 @@ to download pre-trained streaming models.
private static void Run(Options options)
{
OnlineRecognizerConfig config = new OnlineRecognizerConfig();
var config = new OnlineRecognizerConfig();
config.FeatConfig.SampleRate = options.SampleRate;
// All models from icefall using feature dim 80.
... ... @@ -153,9 +149,9 @@ to download pre-trained streaming models.
config.Rule2MinTrailingSilence = options.Rule2MinTrailingSilence;
config.Rule3MinUtteranceLength = options.Rule3MinUtteranceLength;
OnlineRecognizer recognizer = new OnlineRecognizer(config);
var recognizer = new OnlineRecognizer(config);
OnlineStream s = recognizer.CreateStream();
var s = recognizer.CreateStream();
Console.WriteLine(PortAudio.VersionInfo.versionText);
PortAudio.Initialize();
... ... @@ -176,12 +172,12 @@ to download pre-trained streaming models.
Environment.Exit(1);
}
DeviceInfo info = PortAudio.GetDeviceInfo(deviceIndex);
var info = PortAudio.GetDeviceInfo(deviceIndex);
Console.WriteLine();
Console.WriteLine($"Use default device {deviceIndex} ({info.name})");
StreamParameters param = new StreamParameters();
var param = new StreamParameters();
param.device = deviceIndex;
param.channelCount = 1;
param.sampleFormat = SampleFormat.Float32;
... ... @@ -189,14 +185,14 @@ to download pre-trained streaming models.
param.hostApiSpecificStreamInfo = IntPtr.Zero;
PortAudioSharp.Stream.Callback callback = (IntPtr input, IntPtr output,
UInt32 frameCount,
uint frameCount,
ref StreamCallbackTimeInfo timeInfo,
StreamCallbackFlags statusFlags,
IntPtr userData
) =>
{
float[] samples = new float[frameCount];
Marshal.Copy(input, samples, 0, (Int32)frameCount);
var samples = new float[frameCount];
Marshal.Copy(input, samples, 0, (int)frameCount);
s.AcceptWaveform(options.SampleRate, samples);
... ... @@ -215,7 +211,7 @@ to download pre-trained streaming models.
stream.Start();
String lastText = "";
var lastText = string.Empty;
int segmentIndex = 0;
while (true)
... ... @@ -245,9 +241,5 @@ to download pre-trained streaming models.
Thread.Sleep(200); // ms
}
PortAudio.Terminate();
}
}
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>speech_recognition_from_microphone</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -15,12 +15,9 @@
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
class SpokenLanguageIdentificationDemo
{
static void Main(string[] args)
{
var config = new SpokenLanguageIdentificationConfig();
... ... @@ -30,7 +27,7 @@ class SpokenLanguageIdentificationDemo
var slid = new SpokenLanguageIdentification(config);
var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";
WaveReader waveReader = new WaveReader(filename);
var waveReader = new WaveReader(filename);
var s = slid.CreateStream();
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>spoken_language_identification</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -13,12 +13,9 @@
// dotnet run
using SherpaOnnx;
using System.Collections.Generic;
using System;
class StreamingHlgDecodingDemo
{
static void Main(string[] args)
{
var config = new OnlineRecognizerConfig();
... ... @@ -32,15 +29,15 @@ class StreamingHlgDecodingDemo
config.ModelConfig.Debug = 0;
config.CtcFstDecoderConfig.Graph = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/HLG.fst";
OnlineRecognizer recognizer = new OnlineRecognizer(config);
var recognizer = new OnlineRecognizer(config);
var filename = "./sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18/test_wavs/8k.wav";
WaveReader waveReader = new WaveReader(filename);
OnlineStream s = recognizer.CreateStream();
var waveReader = new WaveReader(filename);
var s = recognizer.CreateStream();
s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
float[] tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
var tailPadding = new float[(int)(waveReader.SampleRate * 0.3)];
s.AcceptWaveform(waveReader.SampleRate, tailPadding);
s.InputFinished();
... ... @@ -49,7 +46,7 @@ class StreamingHlgDecodingDemo
recognizer.Decode(s);
}
OnlineRecognizerResult r = recognizer.GetResult(s);
var r = recognizer.GetResult(s);
var text = r.Text;
var tokens = r.Tokens;
Console.WriteLine("--------------------");
... ... @@ -57,10 +54,8 @@ class StreamingHlgDecodingDemo
Console.WriteLine("text: {0}", text);
Console.WriteLine("tokens: [{0}]", string.Join(", ", tokens));
Console.Write("timestamps: [");
r.Timestamps.ToList().ForEach(i => Console.Write(String.Format("{0:0.00}", i) + ", "));
r.Timestamps.ToList().ForEach(i => Console.Write(string.Format("{0:0.00}", i) + ", "));
Console.WriteLine("]");
Console.WriteLine("--------------------");
}
}
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>streaming_hlg_decoding</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...
... ... @@ -3,8 +3,6 @@
// This file shows how to use a silero_vad model with a non-streaming Paraformer
// for speech recognition.
using SherpaOnnx;
using System.Collections.Generic;
using System;
class VadNonStreamingAsrParaformer
{
... ... @@ -12,45 +10,49 @@ class VadNonStreamingAsrParaformer
{
// please download model files from
// https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
OfflineRecognizerConfig config = new OfflineRecognizerConfig();
var config = new OfflineRecognizerConfig();
config.ModelConfig.Paraformer.Model = "./sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx";
config.ModelConfig.Tokens = "./sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt";
config.ModelConfig.Debug = 0;
OfflineRecognizer recognizer = new OfflineRecognizer(config);
var recognizer = new OfflineRecognizer(config);
VadModelConfig vadModelConfig = new VadModelConfig();
var vadModelConfig = new VadModelConfig();
vadModelConfig.SileroVad.Model = "./silero_vad.onnx";
vadModelConfig.Debug = 0;
VoiceActivityDetector vad = new VoiceActivityDetector(vadModelConfig, 60);
var vad = new VoiceActivityDetector(vadModelConfig, 60);
string testWaveFilename = "./lei-jun-test.wav";
WaveReader reader = new WaveReader(testWaveFilename);
var testWaveFilename = "./lei-jun-test.wav";
var reader = new WaveReader(testWaveFilename);
int numSamples = reader.Samples.Length;
int windowSize = vadModelConfig.SileroVad.WindowSize;
int sampleRate = vadModelConfig.SampleRate;
int numIter = numSamples / windowSize;
for (int i = 0; i != numIter; ++i) {
for (int i = 0; i != numIter; ++i)
{
int start = i * windowSize;
float[] samples = new float[windowSize];
var samples = new float[windowSize];
Array.Copy(reader.Samples, start, samples, 0, windowSize);
vad.AcceptWaveform(samples);
if (vad.IsSpeechDetected()) {
while (!vad.IsEmpty()) {
if (vad.IsSpeechDetected())
{
while (!vad.IsEmpty())
{
SpeechSegment segment = vad.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
var startTime = segment.Start / (float)sampleRate;
var duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = recognizer.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
recognizer.Decode(stream);
String text = stream.Result.Text;
var text = stream.Result.Text;
if (!String.IsNullOrEmpty(text)) {
Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
String.Format("{0:0.00}", startTime+duration), text);
if (!string.IsNullOrEmpty(text))
{
Console.WriteLine("{0}--{1}: {2}", string.Format("{0:0.00}", startTime),
string.Format("{0:0.00}", startTime + duration), text);
}
vad.Pop();
... ... @@ -60,19 +62,21 @@ class VadNonStreamingAsrParaformer
vad.Flush();
while (!vad.IsEmpty()) {
SpeechSegment segment = vad.Front();
while (!vad.IsEmpty())
{
var segment = vad.Front();
float startTime = segment.Start / (float)sampleRate;
float duration = segment.Samples.Length / (float)sampleRate;
OfflineStream stream = recognizer.CreateStream();
var stream = recognizer.CreateStream();
stream.AcceptWaveform(sampleRate, segment.Samples);
recognizer.Decode(stream);
String text = stream.Result.Text;
var text = stream.Result.Text;
if (!String.IsNullOrEmpty(text)) {
Console.WriteLine("{0}--{1}: {2}", String.Format("{0:0.00}", startTime),
String.Format("{0:0.00}", startTime+duration), text);
if (!string.IsNullOrEmpty(text))
{
Console.WriteLine("{0}--{1}: {2}", string.Format("{0:0.00}", startTime),
string.Format("{0:0.00}", startTime + duration), text);
}
vad.Pop();
... ...
... ... @@ -2,7 +2,7 @@
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net6.0</TargetFramework>
<TargetFramework>net8.0</TargetFramework>
<RootNamespace>vad_non_streaming_asr_paraformer</RootNamespace>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
... ...