Source code

This application has been implemented as a GUI application using the Qt framework, so that we also get a graphical interface for ease of debugging. This debugging UI was designed as a single UI file, using the Qt Designer tool that is integrated into the Qt Creator IDE.

We start by creating an instance of the GUI application:

#include "mainwindow.h" 
#include <QApplication> 
 
int main(int argc, char *argv[]) { 
    QApplication a(argc, argv); 
    MainWindow w; 
    w.show(); 
     
    return a.exec(); 
} 

This creates an instance of the MainWindow class, in which we have implemented the application, along with an instance of QApplication, the class that manages a Qt GUI application's control flow, event loop, and settings.

Next, this is the MainWindow header:

#include <QMainWindow> 
 
#include <QAudioRecorder> 
#include <QAudioProbe> 
#include <QMediaPlayer> 
 
 
namespace Ui { 
    class MainWindow; 
} 
 
class MainWindow : public QMainWindow { 
    Q_OBJECT 
     
public: 
    explicit MainWindow(QWidget *parent = nullptr); 
    ~MainWindow(); 
     
public slots: 
    void playBluetooth(); 
    void stopBluetooth(); 
    void playOnlineStream(); 
    void stopOnlineStream(); 
    void playLocalFile(); 
    void stopLocalFile(); 
    void recordMessage(); 
    void playMessage(); 
     
    void errorString(QString err); 
     
    void quit(); 
     
private: 
    Ui::MainWindow *ui; 
     
    QMediaPlayer* player; 
    QAudioRecorder* audioRecorder; 
    QAudioProbe* audioProbe; 
     
    qint64 silence; // Microseconds of silence recorded so far. 
     
private slots: 
    void processBuffer(QAudioBuffer); 
}; 

Its implementation contains most of the core functionality, including the audio recorder and player instances; only the voice command processing is handled in a separate class:

#include "mainwindow.h" 
#include "ui_mainwindow.h" 
 
#include "voiceinput.h" 
 
#include <QThread> 
#include <QMessageBox> 
 
#include <cmath> 
 
 
#define MSG_RECORD_MAX_SILENCE_US 5000000 
 
MainWindow::MainWindow(QWidget *parent) : QMainWindow(parent), 
    ui(new Ui::MainWindow) { 
    ui->setupUi(this); 
     
    // Set up menu connections. 
    connect(ui->actionQuit, SIGNAL(triggered()), this, SLOT(quit())); 
     
    // Set up UI connections. 
    connect(ui->playBluetoothButton, SIGNAL(pressed()), this, SLOT(playBluetooth())); 
    connect(ui->stopBluetoothButton, SIGNAL(pressed()), this, SLOT(stopBluetooth())); 
    connect(ui->playLocalAudioButton, SIGNAL(pressed()), this, SLOT(playLocalFile())); 
    connect(ui->stopLocalAudioButton, SIGNAL(pressed()), this, SLOT(stopLocalFile())); 
    connect(ui->playOnlineStreamButton, SIGNAL(pressed()), this, SLOT(playOnlineStream())); 
    connect(ui->stopOnlineStreamButton, SIGNAL(pressed()), this, SLOT(stopOnlineStream())); 
    connect(ui->recordMessageButton, SIGNAL(pressed()), this, SLOT(recordMessage())); 
    connect(ui->playBackMessage, SIGNAL(pressed()), this, SLOT(playMessage())); 
     
    // Defaults 
    silence = 0; 
         
    // Create the audio interface instances. 
    player = new QMediaPlayer(this); 
    audioRecorder = new QAudioRecorder(this); 
    audioProbe = new QAudioProbe(this); 
     
    // Configure the audio recorder. 
    QAudioEncoderSettings audioSettings; 
    audioSettings.setCodec("audio/amr"); 
    audioSettings.setQuality(QMultimedia::HighQuality);     
    audioRecorder->setEncodingSettings(audioSettings);     
    audioRecorder->setOutputLocation(QUrl::fromLocalFile("message/last_message.amr")); 
     
    // Configure audio probe. 
    connect(audioProbe, SIGNAL(audioBufferProbed(QAudioBuffer)), this, SLOT(processBuffer(QAudioBuffer))); 
    audioProbe->setSource(audioRecorder); 
     
    // Start the voice interface in its own thread and set up the connections. 
    QThread* thread = new QThread; 
    VoiceInput* vi = new VoiceInput(); 
    vi->moveToThread(thread); 
    connect(thread, SIGNAL(started()), vi, SLOT(run())); 
    connect(vi, SIGNAL(finished()), thread, SLOT(quit())); 
    connect(vi, SIGNAL(finished()), vi, SLOT(deleteLater())); 
    connect(thread, SIGNAL(finished()), thread, SLOT(deleteLater())); 
     
    connect(vi, SIGNAL(error(QString)), this, SLOT(errorString(QString))); 
    connect(vi, SIGNAL(playBluetooth()), this, SLOT(playBluetooth())); 
    connect(vi, SIGNAL(stopBluetooth()), this, SLOT(stopBluetooth())); 
    connect(vi, SIGNAL(playLocal()), this, SLOT(playLocalFile())); 
    connect(vi, SIGNAL(stopLocal()), this, SLOT(stopLocalFile())); 
    connect(vi, SIGNAL(playRemote()), this, SLOT(playOnlineStream())); 
    connect(vi, SIGNAL(stopRemote()), this, SLOT(stopOnlineStream())); 
    connect(vi, SIGNAL(recordMessage()), this, SLOT(recordMessage())); 
    connect(vi, SIGNAL(playMessage()), this, SLOT(playMessage())); 
     
    thread->start(); 
} 

In the constructor, we set up all of the UI connections for the buttons in the GUI window that allow us to trigger the application's functionality without having to use the voice user interface. This is useful for testing purposes.

In addition, we create an instance of the audio recorder and media player, along with an audio probe that is linked with the audio recorder, so that we can look at the audio samples it's recording and act on them.

Finally, we create an instance of the voice input interface class and push it onto its own thread before starting it. We connect the signals it emits for specific voice commands, along with its error and lifecycle events, to their respective slots:

MainWindow::~MainWindow() { 
    delete ui; 
} 
 
 
void MainWindow::playBluetooth() { 
    // Use the link with the BlueZ Bluetooth stack in the Linux kernel to 
    // configure it to act as an A2DP sink for smartphones to connect to. 
} 
 
 
void MainWindow::stopBluetooth() { 
    // 
} 

We have left the Bluetooth functionality unimplemented, for the reasons explained in the earlier section on Bluetooth technology.

void MainWindow::playOnlineStream() { 
    // Connect to remote streaming service's API and start streaming. 
} 
 
 
void MainWindow::stopOnlineStream() { 
    // Stop streaming from remote service. 
} 

The same is true for the online streaming functionality; see the section on online streaming earlier in this chapter for details on how it could be implemented.

void MainWindow::playLocalFile() { 
    player->setMedia(QUrl::fromLocalFile("music/coolsong.mp3")); 
    player->setVolume(50); 
    player->play(); 
} 
 
 
void MainWindow::stopLocalFile() { 
    player->stop(); 
} 

To play a local file, we expect to find an MP3 file at the hardcoded path. With a few modifications, however, this could also play all of the music in a specific folder, by reading in the filenames and playing them back one by one, as sketched below.
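Here is a minimal sketch of that folder-playback variant, assuming Qt 5's QMediaPlaylist class and a hypothetical playLocalFolder() slot; it reuses the same hardcoded music folder:

#include <QDir> 
#include <QMediaPlaylist> 
 
void MainWindow::playLocalFolder() { 
    // Queue every MP3 file in the folder and play them back one by one. 
    QMediaPlaylist* playlist = new QMediaPlaylist(this); 
    const QStringList files = QDir("music").entryList(QStringList() << "*.mp3", QDir::Files); 
    for (const QString& file : files) { 
        playlist->addMedia(QUrl::fromLocalFile("music/" + file)); 
    } 
     
    player->setPlaylist(playlist); 
    player->setVolume(50); 
    player->play(); 
} 

Next, recording and playing back a voice message: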

void MainWindow::recordMessage() { 
    audioRecorder->record(); 
} 
 
 
void MainWindow::playMessage() { 
    player->setMedia(QUrl::fromLocalFile("message/last_message.arm")); 
    player->setVolume(50); 
    player->play(); 
} 

In the constructor, we configured the recorder to record to a file in a sub-folder called message. This file is overwritten whenever a new recording is made, allowing one to leave a message that can be played back later. An optional display or another accessory could be used to indicate when a new recording has been made and hasn't been listened to yet, as sketched below.
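A minimal sketch of such an indicator, assuming a hypothetical newMessageLabel widget in the UI file and a hasNewMessage member flag; it is driven by the stateChanged() signal of QMediaRecorder, which QAudioRecorder inherits:

// In the constructor: 
// connect(audioRecorder, SIGNAL(stateChanged(QMediaRecorder::State)), 
//         this, SLOT(onRecorderStateChanged(QMediaRecorder::State))); 
 
void MainWindow::onRecorderStateChanged(QMediaRecorder::State state) { 
    if (state == QMediaRecorder::StoppedState) { 
        // A recording has just finished; mark it as not yet listened to. 
        hasNewMessage = true; 
        ui->newMessageLabel->setText(tr("New message waiting")); 
    } 
} 

The playMessage() slot would then clear the flag and the label again. Next, the method that automatically ends a recording once the speaker has fallen silent: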

void MainWindow::processBuffer(QAudioBuffer buffer) { 
    // The samples are interpreted as signed 16-bit PCM, matching the 
    // silence threshold used below. 
    const qint16 *data = buffer.constData<qint16>(); 
     
    // Get the RMS of the buffer; if it is silence, add its duration to the counter. 
    int samples = buffer.sampleCount(); 
    double sumsquared = 0; 
    for (int i = 0; i < samples; i++) { 
        sumsquared += data[i] * data[i]; 
    } 
     
    double rms = sqrt(sumsquared / samples); 
     
    if (rms <= 100) { 
        silence += buffer.duration(); 
    } 
     
    if (silence >= MSG_RECORD_MAX_SILENCE_US) { 
        silence = 0; 
        audioRecorder->stop(); 
    } 
} 

This method is called by our audio probe whenever the recorder is active. In it, we calculate the root mean square (RMS) value of the audio buffer to determine whether it contains silence. Here, silence is relative, and the threshold might have to be adjusted depending on the recording environment.
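For reference, for the N samples x_1 through x_N in a buffer, the loop above computes:

RMS = sqrt((x_1^2 + x_2^2 + ... + x_N^2) / N)

A buffer that contains only near-zero samples therefore has an RMS close to zero, which is why a small fixed threshold such as 100 works as a simple, tunable silence detector.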

After five seconds of silence have been detected, the recording of the message is stopped:

void MainWindow::errorString(QString err) { 
    QMessageBox::critical(this, tr("Error"), err); 
} 
 
 
void MainWindow::quit() { 
    exit(0); 
} 

The remaining methods handle the reporting of error messages that may be emitted elsewhere in the application, as well as terminating the application.

The VoiceInput class header defines the functionality for the voice input interface:

#include <QObject> 
#include <QAudioInput> 
 
extern "C" { 
#include "pocketsphinx.h" 
} 
 
class VoiceInput : public QObject { 
    Q_OBJECT 
     
    QAudioInput* audioInput; 
    QIODevice* audioDevice; 
    bool state; 
     
public: 
    explicit VoiceInput(QObject *parent = nullptr); 
    bool checkState() { return state; } 
     
signals: 
    void playBluetooth(); 
    void stopBluetooth(); 
    void playLocal(); 
    void stopLocal(); 
    void playRemote(); 
    void stopRemote(); 
    void recordMessage(); 
    void playMessage(); 
     
    void error(QString err); 
    void finished(); 
     
public slots: 
    void run(); 
}; 

As PocketSphinx is a C library, we have to make sure that its header is included with C-style linkage, using an extern "C" block. Beyond this, we create the class members for the audio input and the related IO device that the voice input will use. Note the finished() signal, which MainWindow uses to clean up the thread.

Next, the class definition:

#include <QDebug> 
#include <QThread> 
 
#include "voiceinput.h" 
 
extern "C" { 
#include <sphinxbase/err.h> 
#include <sphinxbase/ad.h> 
} 
 
 
VoiceInput::VoiceInput(QObject *parent) : QObject(parent) { 
    // 
} 

The constructor doesn't do anything special, as the next method does all of the initializing and setting up of the main loop. Note that the decoder setup at the start of this method (the cmd_ln_init() call) is a sketch: the model and dictionary paths are placeholders that have to match the PocketSphinx files installed on the target system:

void VoiceInput::run() { 
    const int32 buffsize = 2048; 
    int16 adbuf[buffsize]; 
    uint8 utt_started, in_speech; 
    qint64 k = 0; 
    char const* hyp; 
     
    static ps_decoder_t* ps; 
     
    // Initialize the decoder. The model and dictionary paths here are 
    // placeholders and must point to the PocketSphinx files on the target. 
    cmd_ln_t* config = cmd_ln_init(NULL, ps_args(), TRUE, 
                "-hmm", "model/en-us/en-us", 
                "-dict", "model/en-us/cmudict-en-us.dict", 
                NULL); 
    ps = ps_init(config); 
    if (!ps) { 
        state = false; 
        emit error("Failed to initialize the PocketSphinx decoder."); 
        return; 
    } 
     
    state = true; 
     
    QAudioFormat format; 
    format.setSampleRate(16000); 
    format.setChannelCount(1); 
    format.setSampleSize(16); 
    format.setCodec("audio/pcm"); 
    format.setByteOrder(QAudioFormat::LittleEndian); 
    format.setSampleType(QAudioFormat::SignedInt); 
     
    // Check that the audio device supports this format. 
    QAudioDeviceInfo info = QAudioDeviceInfo::defaultInputDevice(); 
    if (!info.isFormatSupported(format)) { 
       qWarning() << "Default format not supported, aborting."; 
       state = false; 
       return; 
    } 
    
    audioInput = new QAudioInput(format, this); 
    audioInput->setBufferSize(buffsize * 2);    
    audioDevice = audioInput->start(); 
 
    if (ps_start_utt(ps) < 0) { 
        E_FATAL("Failed to start utterance\n"); 
    } 
     
    utt_started = FALSE; 
    E_INFO("Ready....\n"); 

The first part of this method sets up the audio interface, configuring it to record using the audio format settings PocketSphinx requires: mono, little-endian, 16-bit signed PCM audio at 16,000 Hertz. After checking that the audio input supports this format, we create a new audio input instance:

    const char* keyfile = "COMPUTER/3.16227766016838e-13/\n"; 
    if (ps_set_kws(ps, "keyword_search", keyfile) != 0) { 
        return; 
    } 
     
    if (ps_set_search(ps, "keyword_search") != 0) { 
        return; 
    } 
     
    const char* gramfile = "#JSGF V1.0;\n" 
            "grammar asr;\n" 
            "public <rule> = <action> [<preposition>] [<objects>] [<preposition>] [<objects>];\n" 
            "<action> = STOP | PLAY | RECORD;\n" 
            "<objects> = BLUETOOTH | LOCAL | REMOTE | MESSAGE;\n" 
            "<preposition> = FROM | TO;"; 
    ps_set_jsgf_string(ps, "jsgf", gramfile); 

Next, we set up the keyword-spotting search and the JSGF grammar that will be used during the processing of the audio samples. The ps_set_search() call activates the keyword-spotting search first.
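For reference, the string passed to ps_set_kws() follows PocketSphinx's keyword list format: one keyphrase per line, each followed by a detection threshold between forward slashes. The threshold shown is an example value that would be tuned to balance false alarms against missed detections:

COMPUTER /3.16227766016838e-13/

The following loop will keep processing samples until the keyword (computer) is detected: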

    bool kws = true; 
    for (;;) { 
        if ((k = audioDevice->read((char*) adbuf, 4096)) < 0) { 
            E_FATAL("Failed to read audio.\n"); 
        } 
         
        // read() returns a byte count, while ps_process_raw() expects the 
        // number of 16-bit samples. 
        ps_process_raw(ps, adbuf, k / 2, FALSE, FALSE); 
        in_speech = ps_get_in_speech(ps); 
         
        if (in_speech && !utt_started) { 
            utt_started = TRUE; 
            E_INFO("Listening...\n"); 
        } 

Each cycle, we read another buffer's worth of audio samples and have PocketSphinx process them. PocketSphinx also performs silence detection for us, determining whether someone has started speaking into the microphone. If someone is speaking but we haven't started interpreting the speech yet, we start a new utterance:

        if (!in_speech && utt_started) { 
            ps_end_utt(ps); 
            hyp = ps_get_hyp(ps, nullptr); 
            if (hyp != nullptr) { 
                // We have a hypothesis. 
                 
                if (kws && strstr(hyp, "computer") != nullptr) { 
                    if (ps_set_search(ps, "jsgf") != 0) { 
                        E_FATAL("ERROR: Cannot switch to jsgf mode.
"); 
                    } 
                     
                    kws = false; 
                    E_INFO("Switched to jsgf mode 
");                             
                    E_INFO("Mode: %s
", ps_get_search(ps)); 
                } 
                else if (!kws) { 
                    if (hyp != nullptr) { 
                        // Check each action. 
                        if (strncmp(hyp, "play bluetooth", 14) == 0) { 
                            emit playBluetooth(); 
                        } 
                        else if (strncmp(hyp, "stop bluetooth", 14) == 0) { 
                            emit stopBluetooth(); 
                        } 
                        else if (strncmp(hyp, "play local", 10) == 0) { 
                            emit playLocal(); 
                        } 
                        else if (strncmp(hyp, "stop local", 10) == 0) { 
                            emit stopLocal(); 
                        } 
                        else if (strncmp(hyp, "play remote", 11) == 0) { 
                            emit stopBluetooth(); 
                        } 
                        else if (strncmp(hyp, "stop remote", 11) == 0) { 
                            emit stopBluetooth(); 
                        } 
                        else if (strncmp(hyp, "record message", 14) == 0) { 
                            emit stopBluetooth(); 
                        } 
                        else if (strncmp(hyp, "play message", 12) == 0) { 
                            emit stopBluetooth(); 
                        } 
                    } 
                    else { 
                        if (ps_set_search(ps, "keyword_search") != 0){ 
                            E_FATAL("ERROR: Cannot switch to kws mode.
"); 
                        } 
                        
                        kws = true; 
                        E_INFO("Switched to kws mode.
"); 
                    } 
                }                 
            } 
 
            if (ps_start_utt(ps) < 0) { 
                E_FATAL("Failed to start utterance
"); 
            } 
             
            utt_started = FALSE; 
            E_INFO("Ready....
"); 
        } 
         
        QThread::msleep(100); 
    } 
     
} 

The rest of the method checks whether we have a usable hypothesis to analyze. While in keyword-spotting mode, we check whether the keyword was detected and, if so, switch to grammar mode. If we're already in grammar mode, we try to narrow the utterance down to a specific command, at which point we emit the relevant signal to trigger the connected functionality.

A new utterance is started whenever PocketSphinx detects at least one second of silence. After executing a command, the system switches back to keyword-spotting mode.
