Recognition Context

Cubic allows users to send context information with a recognition request which may aid the speech recognition. For example, if you have a list of names that you want to make sure the Cubic model transcribes correctly, with the correct spelling, then you may provide the list in the form of a RecognitionContext object along with the RecognitionConfig before streaming data.

Cubic models allow different sets of “context tokens” each of which can be paired with a list of words or phrases. For example, a Cubic model may have a context token for airport names, and you can provide a list of airport names you want to be recognized correctly for this context token. Likewise, models may also be configured with tokens for “contact list names”, “menu items”, “medical jargon” etc.

To ensure that there is no added latency in processing the list of words or phrases during a recognition request, we have an API method called CompileContext() that allows the user to compile the list into a compact, efficient format for passing to the Recognize() or StreamingRecognize() methods.

Compiling Recognition Context

We have several examples in different languages below showing you how to compile context data and send it during a recognition request.


package main

import (
    "context"
    "fmt"
    "log"
    "os"
    "strings"

    "github.com/cobaltspeech/sdk-cubic/grpc/go-cubic"
    "github.com/cobaltspeech/sdk-cubic/grpc/go-cubic/cubicpb"
)

// main connects to a Cubic server, compiles a recognition context for the
// first available model, and runs a streaming recognize request using it.
func main() {

    // creating client without TLS. Remove WithInsecure() if using TLS
    serverAddr := "127.0.0.1:2727"
    client, err := cubic.NewClient(serverAddr, cubic.WithInsecure())
    if err != nil {
        log.Fatal(err)
    }
    defer client.Close()

    // Get list of available models
    modelResp, err := client.ListModels(context.Background())
    if err != nil {
        log.Fatal(err)
    }
    for _, m := range modelResp.Models {
        fmt.Printf("\nID = %v, Name = %v, Supports Context = %v\n",
            m.Id, m.Name, m.Attributes.ContextInfo.SupportsContext)
        // printing allowed context tokens
        if m.Attributes.ContextInfo.SupportsContext {
            fmt.Printf("Allowed Context Tokens = %v\n\n",
                strings.Join(m.Attributes.ContextInfo.AllowedContextTokens, ", "))
        }
    }

    // Fail fast instead of panicking with an index-out-of-range if the
    // server reported no models.
    if len(modelResp.Models) == 0 {
        log.Fatal("server reported no available models")
    }

    // Using the first model; verify it actually supports context and has at
    // least one allowed context token before indexing into the token list.
    model := modelResp.Models[0]
    if !model.Attributes.ContextInfo.SupportsContext ||
        len(model.Attributes.ContextInfo.AllowedContextTokens) == 0 {
        log.Fatalf("model %v does not support recognition context", model.Id)
    }

    // Let's say this model has an allowed context token called "airport_names" and
    // we have a list of airport names that we want to make sure the recognizer gets
    // right. We compile the list of names using the CompileContext(), save the compiled
    // data and send it back with subsequent recognize requests to customize and improve the results.

    // a small example list
    phrases := []string{"NARITA", "KUALA LUMPUR INTERNATIONAL", "ISTANBUL ATATURK", "LAGUARDIA"}
    contextToken := model.Attributes.ContextInfo.AllowedContextTokens[0] // "airport_names"

    // sending request to server
    compiledResp, err := client.CompileContext(
        context.Background(), model.Id, contextToken, phrases, nil)
    if err != nil {
        log.Fatal(err)
    }

    // saving the compiled result for later use; note this compiled data is only
    // compatible with the model whose ID was provided in the CompileContext call
    compiledContexts := []*cubicpb.CompiledContext{compiledResp.Context}

    // Now we can send a recognize request along with the compiled context. The
    // context data is provided through the recognition config as a list of compiled
    // contexts (i.e. we can provide more than one compiled context if the model
    // supports more than one context token).
    cfg := &cubicpb.RecognitionConfig{
        ModelId:       model.Id,
        AudioEncoding: cubicpb.RecognitionConfig_WAV,
        Context:       &cubicpb.RecognitionContext{Compiled: compiledContexts},
    }

    // The rest is the same as a usual streaming recognize request

    // open audio file stream
    f, err := os.Open("test.wav")
    if err != nil {
        log.Fatal(err)
    }
    defer f.Close()

    // define a callback function to handle results; only final (non-partial)
    // transcripts are printed
    resultHandler := func(resp *cubicpb.RecognitionResponse) {
        for _, r := range resp.Results {
            if !r.IsPartial {
                fmt.Println(r.Alternatives[0].Transcript)
            }
        }
    }

    err = client.StreamingRecognize(context.Background(), cfg, f, resultHandler)
    if err != nil {
        log.Fatal(err)
    }
}

import cubic

# set insecure to False if server uses TLS
server_address = '127.0.0.1:2727'
client = cubic.Client(server_address, insecure=True)

# Get list of available models
model_resp = client.ListModels()
for m in model_resp.models:
    ctx_info = m.attributes.context_info
    print("\nID = {}, Name = {}, Supports Context = {}".format(
        m.id, m.name, ctx_info.supports_context))
    if ctx_info.supports_context:
        # printing allowed context tokens
        print("Allowed context tokens =  {}\n".format(
            str(ctx_info.allowed_context_tokens)))

# Assuming the first model supports context
model = model_resp.models[0]

# Let's say this model has an allowed context token called "airport_names" and
# we have a list of airport names that we want to make sure the recognizer gets
# right. We compile the list of names using the CompileContext(), save the compiled
# data and send it back with subsequent recognize requests to customize and improve the results.

# a small example list
phrases = ["NARITA", "KUALA LUMPUR INTERNATIONAL", "ISTANBUL ATATURK", "LAGUARDIA"]
context_token = model.attributes.context_info.allowed_context_tokens[0]  # "airport_names"

# sending request to server
compiled_resp = client.CompileContext(model.id, context_token, phrases)

# saving the compiled result for later use; note this compiled data is only
# compatible with the model whose ID was provided in the CompileContext call
compiled_contexts = [compiled_resp.context]

# Now we can send a recognize request along with the compiled context. The
# context data is provided through the recognition config as a list of compiled
# contexts (i.e. we can provide more than one compiled context if the model
# supports more than one context token).
cfg = cubic.RecognitionConfig(
    model_id=model.id,
    audio_encoding="WAV",
    context=cubic.RecognitionContext(compiled=compiled_contexts),
)

# The rest is the same as a usual streaming recognize request

# open audio file stream
audio = open('test.wav', 'rb')

# send streaming request to cubic and print out results as they come in;
# partial results are overwritten in place, final results end the line
for resp in client.StreamingRecognize(cfg, audio):
    for result in resp.results:
        line_end = "" if result.is_partial else "\n"
        print("\r{0}".format(result.alternatives[0].transcript), end=line_end)

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using Grpc.Core;

namespace CubicRecognitionContextExample {
    class Program {
        // Entry point: lists the server's models, compiles a recognition
        // context ("airport_names" phrases) for the first model, then runs a
        // bi-directional streaming recognize request using that context.
        static async Task Main(string[] args) {

            // set creds = new Grpc.Core.SslCredentials(); if using TLS
            var serverAddress = "127.0.0.1:2727";
            var creds = Grpc.Core.ChannelCredentials.Insecure;

            // Initialize a gRPC connection
            var channel = new Grpc.Core.Channel(serverAddress, creds);
            var client = new CobaltSpeech.Cubic.Cubic.CubicClient(channel);

            // Get list of available models
            var listModelsRequest = new CobaltSpeech.Cubic.ListModelsRequest();
            var modelResp = client.ListModels(listModelsRequest);

            foreach (var m in modelResp.Models) {
                Console.WriteLine("\nID = {0}, Name = {1}, Supports Context = {2}",
                    m.Id, m.Name, m.Attributes.ContextInfo.SupportsContext);
                // printing allowed context tokens
                if (m.Attributes.ContextInfo.SupportsContext) {
                    Console.WriteLine("Allowed Context Tokens = {0}\n",
                        m.Attributes.ContextInfo.AllowedContextTokens);
                }
            }

            // Assuming the first model supports context
            var model = modelResp.Models[0];

            // Let's say this model has an allowed context token called "airport_names" and
            // we have a list of airport names that we want to make sure the recognizer gets
            // right. We compile the list of names using the CompileContext(), save the compiled
            // data and send it back with subsequent recognize requests to customize and improve the results.

            // a small example list
            string[] phrases = { "NARITA", "KUALA LUMPUR INTERNATIONAL", "ISTANBUL ATATURK", "LAGUARDIA" };
            string contextToken = model.Attributes.ContextInfo.AllowedContextTokens[0]; // "airport_names"

            // create compile context request
            var compileRequest = new CobaltSpeech.Cubic.CompileContextRequest {
                ModelId = model.Id,
                Token = contextToken,
            };
            // put phrases into compileRequest.Phrases, one ContextPhrase per entry
            foreach (var phrase in phrases) {
                compileRequest.Phrases.Add(
                    new CobaltSpeech.Cubic.ContextPhrase {
                        Text = phrase,
                    });
            }

            // send request to server
            var compiledResp = client.CompileContext(compileRequest);

            // saving the compiled result for later use; note this compiled data is only
            // compatible with the model whose ID was provided in the CompileContext call
            var compiledContexts = new List<CobaltSpeech.Cubic.CompiledContext>();
            compiledContexts.Add(compiledResp.Context);

            // Now we can send a recognize request along with the compiled context. The
            // context data is provided through the recognition config as a list of compiled
            // contexts (i.e. we can provide more than one compiled context if the model
            // supports more than one context token).
            var cfg = new CobaltSpeech.Cubic.RecognitionConfig {
                ModelId = model.Id,
                AudioEncoding = CobaltSpeech.Cubic.RecognitionConfig.Types.Encoding.Wav,
                Context = new CobaltSpeech.Cubic.RecognitionContext(),
            };
            foreach (var ctx in compiledContexts) {
                cfg.Context.Compiled.Add(ctx);
            }

            // The rest is the same as a usual streaming recognize request

            string audioPath = "test.wav";

            // Setup the bi-directional gRPC stream.
            var call = client.StreamingRecognize();
            using(call) {
                // Setup receive task: drain responses concurrently while audio
                // is being sent on the request stream below.
                var responseReaderTask = Task.Run(async() => {
                    // Wait for the next response
                    while (await call.ResponseStream.MoveNext()) {
                        var response = call.ResponseStream.Current;
                        foreach (var result in response.Results) {
                            // NOTE(review): this prints every result, including
                            // partials; the other language examples filter on
                            // result.IsPartial — confirm which is intended.
                            Console.WriteLine(result.Alternatives[0].Transcript);
                        }
                    }
                });

                // Send config first, followed by the audio
                {
                    // Send the configs
                    var request = new CobaltSpeech.Cubic.StreamingRecognizeRequest();
                    request.Config = cfg;
                    await call.RequestStream.WriteAsync(request);

                    // Reuse the same request object for streaming audio;
                    // clear Config so subsequent messages carry audio only.
                    request.Config = null;
                    request.Audio = new CobaltSpeech.Cubic.RecognitionAudio { };

                    // Send the audio, in 8kb chunks
                    const int chunkSize = 8192;
                    using(FileStream file = File.OpenRead(audioPath)) {
                        int bytesRead;
                        var buffer = new byte[chunkSize];
                        while ((bytesRead = file.Read(buffer, 0, buffer.Length)) > 0) {
                            // Take(bytesRead) trims the final, possibly short chunk
                            var bytes = Google.Protobuf.ByteString.CopyFrom(buffer.Take(bytesRead).ToArray());
                            request.Audio.Data = bytes;
                            await call.RequestStream.WriteAsync(request);
                        }

                        // Close the sending stream
                        await call.RequestStream.CompleteAsync();
                    }
                }

                // Wait for all of the responses to come back through the receiving stream
                await responseReaderTask;
            }
        }
    }
}

import Foundation
import Cubic

class CubicExample {

    // set useTLS to true if using TLS
    let client = Client(host: "127.0.0.1", port: 2727, useTLS: false)
    var config = Cobaltspeech_Cubic_RecognitionConfig()
    // audio file to transcribe
    let fileName = "test.wav"
    // size (in bytes) of each audio chunk sent on the stream
    let chunkSize = 8192
    
    public init() {
        // A DispatchGroup makes this example sequential: each async SDK call
        // enters the group and we wait() until its callback leaves it.
        let dispatchGroup = DispatchGroup()
        dispatchGroup.enter()
        
        var model: Cobaltspeech_Cubic_Model!

        client.listModels(success: { (models) in
            guard let models = models else { return }
            
            for model in models {
                print("\nID = \(model.id), Name = \(model.name), Supports Context = \(model.attributes.contextInfo.supportsContext)")
                // printing allowed context tokens
                if model.attributes.contextInfo.supportsContext {
                    print("Allowed Context Tokens = \(model.attributes.contextInfo.allowedContextTokens)\n")
                }
            }
            
            // Assuming the first model supports context
            if let firstModel = models.first {
                model = firstModel
            } else {
                // NOTE(review): returning here without dispatchGroup.leave()
                // would deadlock the wait() below — confirm the empty-model
                // case is acceptable in this example.
                return
            }
            
            dispatchGroup.leave()
        }) { (error) in
            print(error.localizedDescription)
            dispatchGroup.leave()
        }
        
        // Block until the model list has been received
        dispatchGroup.wait()
        
        // Let's say this model has an allowed context token called "airport_names" and
        // we have a list of airport names that we want to make sure the recognizer gets
        // right. We compile the list of names using the CompileContext(), save the compiled
        // data and send it back with subsequent recognize requests to customize and improve the results.
        
        // a small example list
        let phrases = ["NARITA", "KUALA LUMPUR INTERNATIONAL", "ISTANBUL ATATURK", "LAGUARDIA"]
        let contextToken = model.attributes.contextInfo.allowedContextTokens[0] // "airport_names"
        
        // create compile context request
        var compileRequest = Cobaltspeech_Cubic_CompileContextRequest()
        compileRequest.modelID = model.id
        compileRequest.token = contextToken
        
        // put phrases into compileRequest.phrases, one ContextPhrase per entry
        for phrase in phrases {
            var contextPhrase = Cobaltspeech_Cubic_ContextPhrase()
            contextPhrase.text = phrase
            compileRequest.phrases.append(contextPhrase)
        }

        // send request to server
        var compiledContexts: [Cobaltspeech_Cubic_CompiledContext] = []
        
        dispatchGroup.enter()
        
        self.client.compileContext(compileRequest).response.whenComplete { (result) in
            switch result {
            case .success(let response):
                // saving the compiled result for later use; note this compiled data is only
                // compatible with the model whose ID was provided in the CompileContext call
                compiledContexts.append(response.context)
                dispatchGroup.leave()
            case .failure(let error):
                print(error.localizedDescription)
                dispatchGroup.leave()
            }
        }
        
        // Block until the compile request has completed
        dispatchGroup.wait()
        
        // Now we can send a recognize request along with the compiled context. The
        // context data is provided through the recognition config as a list of compiled
        // contexts (i.e. we can provide more than one compiled context if the model
        // supports more than one context token).
        self.config.modelID = model.id
        self.config.audioEncoding = .wav
        self.config.context = Cobaltspeech_Cubic_RecognitionContext()
        self.config.context.compiled.append(contentsOf: compiledContexts)

        // The rest is the same as a usual streaming recognize request
        let fileUrl = URL(fileURLWithPath: self.fileName)
        guard let audioData = try? Data(contentsOf: fileUrl) else { return }
        
        dispatchGroup.enter()
        
        // Only final (non-partial) transcripts are printed here
        self.client.streamingRecognize(audioData: audioData, chunkSize: self.chunkSize, config: self.config, success: { (response) in
            for result in response.results {
                if !result.isPartial, let alternative = result.alternatives.first {
                    print(alternative.transcript)
                }
            }
            
            dispatchGroup.leave()
        }) { (error) in
            print(error.localizedDescription)
            dispatchGroup.leave()
        }
    }
}