Fangjun Kuang
Committed by GitHub

Add C# API for spoken language identification (#697)

#!/usr/bin/env bash
# Run every dotnet example as a CI smoke test:
# spoken language identification, online/offline decoding, and offline TTS.
# Generated TTS wav files are collected into ./tts for artifact upload.

# BUG FIX: without -e a failing example script is silently ignored and CI
# reports success; -x echoes each command for easier log debugging.
set -ex

cd dotnet-examples/

cd spoken-language-identification
./run.sh

cd ../online-decode-files
./run-zipformer2-ctc.sh
./run-transducer.sh
./run-paraformer.sh

cd ../offline-decode-files
./run-nemo-ctc.sh
./run-paraformer.sh
./run-zipformer.sh
./run-hotwords.sh
./run-whisper.sh
./run-tdnn-yesno.sh

cd ../offline-tts
./run-aishell3.sh
./run-piper.sh
ls -lh

cd ../..

# Collect generated TTS output for the upload-artifact step.
mkdir tts

cp dotnet-examples/offline-tts/*.wav ./tts
@@ -40,33 +40,10 @@ jobs:
40 - name: Check dotnet 40 - name: Check dotnet
41 run: dotnet --info 41 run: dotnet --info
42 42
43 - - name: Decode a file 43 + - name: Run tests
44 shell: bash 44 shell: bash
45 run: | 45 run: |
46 - cd dotnet-examples/  
47 -  
48 - cd online-decode-files  
49 - ./run-transducer.sh  
50 - ./run-paraformer.sh  
51 -  
52 - cd ../offline-decode-files  
53 - ./run-nemo-ctc.sh  
54 - ./run-paraformer.sh  
55 - ./run-zipformer.sh  
56 - ./run-hotwords.sh  
57 - ./run-whisper.sh  
58 - ./run-tdnn-yesno.sh  
59 -  
60 - cd ../offline-tts  
61 - ./run-aishell3.sh  
62 - ./run-piper.sh  
63 - ls -lh  
64 -  
65 - cd ../..  
66 -  
67 - mkdir tts  
68 -  
69 - cp dotnet-examples/offline-tts/*.wav ./tts 46 + .github/scripts/test-dot-net.sh
70 47
71 - uses: actions/upload-artifact@v4 48 - uses: actions/upload-artifact@v4
72 with: 49 with:
@@ -177,39 +177,16 @@ jobs:
177 cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/ 177 cp -v scripts/dotnet/examples/offline-decode-files.csproj dotnet-examples/offline-decode-files/
178 cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/ 178 cp -v scripts/dotnet/examples/online-decode-files.csproj dotnet-examples/online-decode-files/
179 cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/ 179 cp -v scripts/dotnet/examples/speech-recognition-from-microphone.csproj dotnet-examples/speech-recognition-from-microphone/
  180 + cp -v scripts/dotnet/examples/spoken-language-identification.csproj dotnet-examples/spoken-language-identification/
180 181
181 ls -lh /tmp 182 ls -lh /tmp
182 183
183 - - name: Decode a file 184 + - name: Run tests
184 shell: bash 185 shell: bash
185 run: | 186 run: |
186 - cd dotnet-examples/ 187 + .github/scripts/test-dot-net.sh
187 188
188 - cd online-decode-files  
189 - ./run-zipformer2-ctc.sh  
190 - ./run-transducer.sh  
191 - ./run-paraformer.sh  
192 -  
193 - cd ../offline-decode-files  
194 - ./run-nemo-ctc.sh  
195 - ./run-paraformer.sh  
196 - ./run-zipformer.sh  
197 - ./run-hotwords.sh  
198 - ./run-whisper.sh  
199 - ./run-tdnn-yesno.sh  
200 -  
201 - cd ../offline-tts  
202 - ./run-aishell3.sh  
203 - ./run-piper.sh  
204 - ls -lh  
205 -  
206 - cd ../..  
207 -  
208 - mkdir tts  
209 -  
210 - cp dotnet-examples/offline-tts/*.wav ./tts  
211 -  
212 - - uses: actions/upload-artifact@v3 189 + - uses: actions/upload-artifact@v4
213 with: 190 with:
214 name: dot-net-tts-generated-test-files-${{ matrix.os }} 191 name: dot-net-tts-generated-test-files-${{ matrix.os }}
215 path: tts 192 path: tts
@@ -13,6 +13,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts", "offline-tts\
13 EndProject 13 EndProject
14 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}" 14 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "offline-tts-play", "offline-tts-play\offline-tts-play.csproj", "{40781464-5948-462B-BA4B-98932711513F}"
15 EndProject 15 EndProject
  16 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "spoken-language-identification", "spoken-language-identification\spoken-language-identification.csproj", "{3D7CF3D6-AC45-4D50-9619-5687B1443E94}"
  17 +EndProject
16 Global 18 Global
17 GlobalSection(SolutionConfigurationPlatforms) = preSolution 19 GlobalSection(SolutionConfigurationPlatforms) = preSolution
18 Debug|Any CPU = Debug|Any CPU 20 Debug|Any CPU = Debug|Any CPU
@@ -42,5 +44,9 @@ Global
42 {40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU 44 {40781464-5948-462B-BA4B-98932711513F}.Debug|Any CPU.Build.0 = Debug|Any CPU
43 {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU 45 {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.ActiveCfg = Release|Any CPU
44 {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU 46 {40781464-5948-462B-BA4B-98932711513F}.Release|Any CPU.Build.0 = Release|Any CPU
  47 + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
  48 + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Debug|Any CPU.Build.0 = Debug|Any CPU
  49 + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.ActiveCfg = Release|Any CPU
  50 + {3D7CF3D6-AC45-4D50-9619-5687B1443E94}.Release|Any CPU.Build.0 = Release|Any CPU
45 EndGlobalSection 51 EndGlobalSection
46 EndGlobal 52 EndGlobal
// Copyright (c) 2024 Xiaomi Corporation
//
// This file shows how to do spoken language identification with whisper.
//
// 1. Download a whisper multilingual model. We use a tiny model below.
// Please refer to https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models
// to download more models.
//
// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
// tar xvf sherpa-onnx-whisper-tiny.tar.bz2
// rm sherpa-onnx-whisper-tiny.tar.bz2
//
// 2. Now run it
//
// dotnet run

using SherpaOnnx;
using System.Collections.Generic;
using System;

class SpokenLanguageIdentificationDemo
{

  static void Main(string[] args)
  {
    // Point the config at the multilingual whisper encoder/decoder models.
    var config = new SpokenLanguageIdentificationConfig();
    config.Whisper.Encoder = "./sherpa-onnx-whisper-tiny/tiny-encoder.int8.onnx";
    config.Whisper.Decoder = "./sherpa-onnx-whisper-tiny/tiny-decoder.int8.onnx";

    var slid = new SpokenLanguageIdentification(config);
    var filename = "./sherpa-onnx-whisper-tiny/test_wavs/0.wav";

    WaveReader waveReader = new WaveReader(filename);

    // Feed the whole wav into an offline stream, then classify it.
    var s = slid.CreateStream();
    s.AcceptWaveform(waveReader.SampleRate, waveReader.Samples);
    var result = slid.Compute(s);

    // BUG FIX: the original printed a literal placeholder instead of
    // the wav file that was actually decoded.
    Console.WriteLine($"Filename: {filename}");
    Console.WriteLine($"Detected language: {result.Lang}");
  }
}
#!/usr/bin/env bash
# Download the whisper tiny multilingual model (first run only) and run
# the spoken-language-identification dotnet example against it.

# Exit on the first error and echo each command.
set -ex

# Skip the download if the model directory already exists.
if [ ! -d ./sherpa-onnx-whisper-tiny ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.tar.bz2
  tar xvf sherpa-onnx-whisper-tiny.tar.bz2
  rm sherpa-onnx-whisper-tiny.tar.bz2
fi

dotnet run

<Project Sdk="Microsoft.NET.Sdk">

  <!-- Console example: spoken language identification with sherpa-onnx. -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>spoken_language_identification</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- Version="*" pulls the latest published sherpa-onnx package. -->
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  </ItemGroup>

</Project>
<Project Sdk="Microsoft.NET.Sdk">

  <!-- CI variant of the example project: identical to the in-repo example,
       but restores packages from a local feed first (see RestoreSources). -->
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <RootNamespace>spoken_language_identification</RootNamespace>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <PropertyGroup>
    <!-- Prefer freshly built packages in /tmp/packages over nuget.org so CI
         tests the just-built sherpa-onnx package. -->
    <RestoreSources>/tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json</RestoreSources>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="org.k2fsa.sherpa.onnx" Version="*" />
  </ItemGroup>

</Project>
@@ -403,8 +403,8 @@ namespace SherpaOnnx
403 while (*buffer != 0) 403 while (*buffer != 0)
404 { 404 {
405 ++buffer; 405 ++buffer;
  406 + length += 1;
406 } 407 }
407 - length = (int)(buffer - (byte*)impl.Text);  
408 } 408 }
409 409
410 byte[] stringBuffer = new byte[length]; 410 byte[] stringBuffer = new byte[length];
@@ -496,8 +496,6 @@ namespace SherpaOnnx
496 return new OfflineStream(p); 496 return new OfflineStream(p);
497 } 497 }
498 498
499 - /// You have to ensure that IsReady(stream) returns true before  
500 - /// you call this method  
501 public void Decode(OfflineStream stream) 499 public void Decode(OfflineStream stream)
502 { 500 {
503 Decode(_handle.Handle, stream.Handle); 501 Decode(_handle.Handle, stream.Handle);
@@ -549,4 +547,137 @@ namespace SherpaOnnx
549 private static extern void Decode(IntPtr handle, IntPtr[] streams, int n); 547 private static extern void Decode(IntPtr handle, IntPtr[] streams, int n);
550 } 548 }
551 549
  /// <summary>
  /// Whisper model configuration for spoken language identification.
  /// Marshalled to the native C API, so the field order must match the
  /// corresponding C struct (hence LayoutKind.Sequential).
  /// </summary>
  [StructLayout(LayoutKind.Sequential)]
  public struct SpokenLanguageIdentificationWhisperConfig
  {
    public SpokenLanguageIdentificationWhisperConfig()
    {
      Encoder = "";
      Decoder = "";
      TailPaddings = -1;
    }

    // Path to the whisper encoder onnx model.
    [MarshalAs(UnmanagedType.LPStr)]
    public string Encoder;

    // Path to the whisper decoder onnx model.
    [MarshalAs(UnmanagedType.LPStr)]
    public string Decoder;

    // Tail padding passed to the native library; -1 presumably selects the
    // native default — TODO(review): confirm units/semantics in the C API.
    public int TailPaddings;
  }
  568 +
  569 + public struct SpokenLanguageIdentificationConfig
  570 + {
  571 + public SpokenLanguageIdentificationConfig()
  572 + {
  573 + Whisper = new SpokenLanguageIdentificationWhisperConfig();
  574 + NumThreads = 1;
  575 + Debug = 0;
  576 + Provider = "cpu";
  577 + }
  578 + public SpokenLanguageIdentificationWhisperConfig Whisper;
  579 +
  580 + public int NumThreads;
  581 + public int Debug;
  582 +
  583 + [MarshalAs(UnmanagedType.LPStr)]
  584 + public string Provider;
  585 + }
  586 +
  /// <summary>
  /// Managed copy of a native spoken-language-identification result.
  /// The native memory is not owned here: the caller (see
  /// SpokenLanguageIdentification.Compute) frees it right after this
  /// constructor copies the data out.
  /// </summary>
  public class SpokenLanguageIdentificationResult
  {
    public SpokenLanguageIdentificationResult(IntPtr handle)
    {
      Impl impl = (Impl)Marshal.PtrToStructure(handle, typeof(Impl));

      // PtrToStringUTF8() requires .net standard 2.1
      // _text = Marshal.PtrToStringUTF8(impl.Text);

      // Manual strlen: count bytes up to the NUL terminator of the
      // native C string.
      int length = 0;

      unsafe
      {
        byte* buffer = (byte*)impl.Lang;
        while (*buffer != 0)
        {
          ++buffer;
          length += 1;
        }
      }

      // Copy the raw bytes out of native memory and decode as UTF-8.
      byte[] stringBuffer = new byte[length];
      Marshal.Copy(impl.Lang, stringBuffer, 0, length);
      _lang = Encoding.UTF8.GetString(stringBuffer);
    }

    // Mirrors the native result struct: a single char* with the
    // detected language.
    [StructLayout(LayoutKind.Sequential)]
    struct Impl
    {
      public IntPtr Lang;
    }

    private String _lang;

    // The detected language as reported by the native library.
    public String Lang => _lang;
  }
  622 +
  /// <summary>
  /// Identifies the spoken language of an audio recording using a
  /// multilingual whisper model. Thin wrapper over the native
  /// sherpa-onnx C API; owns a native handle released on Dispose.
  /// </summary>
  public class SpokenLanguageIdentification : IDisposable
  {
    public SpokenLanguageIdentification(SpokenLanguageIdentificationConfig config)
    {
      IntPtr h = SherpaOnnxCreateSpokenLanguageIdentification(ref config);
      _handle = new HandleRef(this, h);
    }

    /// <summary>
    /// Creates an offline stream; feed it audio via AcceptWaveform,
    /// then pass it to Compute().
    /// </summary>
    public OfflineStream CreateStream()
    {
      IntPtr p = SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(_handle.Handle);
      return new OfflineStream(p);
    }

    /// <summary>
    /// Runs language identification on the given stream. The native
    /// result is copied into a managed object and freed before returning.
    /// </summary>
    public SpokenLanguageIdentificationResult Compute(OfflineStream stream)
    {
      IntPtr h = SherpaOnnxSpokenLanguageIdentificationCompute(_handle.Handle, stream.Handle);
      SpokenLanguageIdentificationResult result = new SpokenLanguageIdentificationResult(h);
      SherpaOnnxDestroySpokenLanguageIdentificationResult(h);
      return result;
    }

    public void Dispose()
    {
      Cleanup();
      // Prevent the object from being placed on the
      // finalization queue
      System.GC.SuppressFinalize(this);
    }

    ~SpokenLanguageIdentification()
    {
      Cleanup();
    }

    // Releases the native handle and resets it to IntPtr.Zero so it
    // cannot be used again. NOTE(review): assumes the native destroy
    // function tolerates a null handle on double-dispose — confirm in
    // the C API.
    private void Cleanup()
    {
      SherpaOnnxDestroySpokenLanguageIdentification(_handle.Handle);

      // Don't permit the handle to be used again.
      _handle = new HandleRef(this, IntPtr.Zero);
    }

    private HandleRef _handle;

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxCreateSpokenLanguageIdentification(ref SpokenLanguageIdentificationConfig config);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxDestroySpokenLanguageIdentification(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCreateOfflineStream(IntPtr handle);

    [DllImport(Dll.Filename)]
    private static extern IntPtr SherpaOnnxSpokenLanguageIdentificationCompute(IntPtr handle, IntPtr stream);

    [DllImport(Dll.Filename)]
    private static extern void SherpaOnnxDestroySpokenLanguageIdentificationResult(IntPtr handle);
  }
552 } 683 }