
Commit 71e0b480 authored by Bjorn Bringert

Improve TTS engine audio buffer API

This adds two methods:
SynthesisRequest.getMaxBufferSize()
SynthesisRequest.completeAudioAvailable()

Change-Id: I1186eed45997ee9a7e51212c8d6706dd324ca949
parent 50e657bb
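
For context, a minimal engine-side sketch of the one-shot path this change adds. The sample rate, encoding and channel count below are illustrative values, not part of this commit:

import android.media.AudioFormat;
import android.speech.tts.SynthesisRequest;

class OneShotDeliveryExample {
    // Hands a fully synthesized utterance to the request in a single call,
    // instead of streaming it through start()/audioAvailable()/done().
    // Assumes pcm holds 16-bit mono PCM at 16000 Hz.
    static int deliver(SynthesisRequest request, byte[] pcm) {
        // completeAudioAvailable() is not bounded by getMaxBufferSize(),
        // so the whole buffer can be passed at once.
        return request.completeAudioAvailable(16000, AudioFormat.ENCODING_PCM_16BIT,
                1 /* channelCount */, pcm, 0, pcm.length);
    }
}
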
FileSynthesisRequest.java  +42 −5
@@ -19,6 +19,7 @@ import android.media.AudioFormat;
import android.util.Log;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
@@ -32,6 +33,8 @@ class FileSynthesisRequest extends SynthesisRequest {
    private static final String TAG = "FileSynthesisRequest";
    private static final boolean DBG = false;

    private static final int MAX_AUDIO_BUFFER_SIZE = 8192;

    private static final int WAV_HEADER_LENGTH = 44;
    private static final short WAV_FORMAT_PCM = 0x0001;

@@ -80,6 +83,11 @@ class FileSynthesisRequest extends SynthesisRequest {
        }
    }

    @Override
    public int getMaxBufferSize() {
        return MAX_AUDIO_BUFFER_SIZE;
    }

    @Override
    public int start(int sampleRateInHz, int audioFormat, int channelCount) {
        if (DBG) {
@@ -152,8 +160,9 @@ class FileSynthesisRequest extends SynthesisRequest {
            try {
                // Write WAV header at start of file
                mFile.seek(0);
                int fileLen = (int) mFile.length();
                mFile.write(makeWavHeader(mSampleRateInHz, mAudioFormat, mChannelCount, fileLen));
                int dataLength = (int) (mFile.length() - WAV_HEADER_LENGTH);
                mFile.write(
                        makeWavHeader(mSampleRateInHz, mAudioFormat, mChannelCount, dataLength));
                closeFile();
                return TextToSpeech.SUCCESS;
            } catch (IOException ex) {
@@ -164,8 +173,37 @@ class FileSynthesisRequest extends SynthesisRequest {
        }
    }

    @Override
    public int completeAudioAvailable(int sampleRateInHz, int audioFormat, int channelCount,
            byte[] buffer, int offset, int length) {
        synchronized (mStateLock) {
            if (mStopped) {
                if (DBG) Log.d(TAG, "Request has been aborted.");
                return TextToSpeech.ERROR;
            }
        }
        FileOutputStream out = null;
        try {
            out = new FileOutputStream(mFileName);
            out.write(makeWavHeader(sampleRateInHz, audioFormat, channelCount, length));
            out.write(buffer, offset, length);
            return TextToSpeech.SUCCESS;
        } catch (IOException ex) {
            Log.e(TAG, "Failed to write to " + mFileName + ": " + ex);
            return TextToSpeech.ERROR;
        } finally {
            try {
                if (out != null) {
                    out.close();
                }
            } catch (IOException ex) {
                Log.e(TAG, "Failed to close " + mFileName + ": " + ex);
            }
        }
    }

    private byte[] makeWavHeader(int sampleRateInHz, int audioFormat, int channelCount,
            int fileLength) {
            int dataLength) {
        // TODO: is AudioFormat.ENCODING_DEFAULT always the same as ENCODING_PCM_16BIT?
        int sampleSizeInBytes = (audioFormat == AudioFormat.ENCODING_PCM_8BIT ? 1 : 2);
        int byteRate = sampleRateInHz * sampleSizeInBytes * channelCount;
@@ -177,7 +215,7 @@ class FileSynthesisRequest extends SynthesisRequest {
        header.order(ByteOrder.LITTLE_ENDIAN);

        header.put(new byte[]{ 'R', 'I', 'F', 'F' });
        header.putInt(fileLength - 8);  // RIFF chunk size
        header.putInt(dataLength + WAV_HEADER_LENGTH - 8);  // RIFF chunk size
        header.put(new byte[]{ 'W', 'A', 'V', 'E' });
        header.put(new byte[]{ 'f', 'm', 't', ' ' });
        header.putInt(16);  // size of fmt chunk
@@ -188,7 +226,6 @@ class FileSynthesisRequest extends SynthesisRequest {
        header.putShort(blockAlign);
        header.putShort(bitsPerSample);
        header.put(new byte[]{ 'd', 'a', 't', 'a' });
        int dataLength = fileLength - WAV_HEADER_LENGTH;
        header.putInt(dataLength);

        return headerBuf;
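
For reference, makeWavHeader() now takes the PCM payload length (dataLength) directly, and the RIFF chunk size it writes is dataLength + WAV_HEADER_LENGTH - 8, i.e. the total file size minus the 8-byte RIFF preamble. A short worked example with illustrative values, not taken from this commit:

class WavHeaderSizesExample {
    static void example() {
        // One second of 16-bit mono PCM at 22050 Hz.
        int sampleRateInHz = 22050;
        int bytesPerSample = 2;   // AudioFormat.ENCODING_PCM_16BIT
        int channelCount = 1;
        int dataLength = sampleRateInHz * bytesPerSample * channelCount;  // 44100: 'data' chunk size field
        int riffChunkSize = dataLength + 44 - 8;  // 44136: RIFF chunk size field (WAV_HEADER_LENGTH = 44)
        int fileLength = dataLength + 44;         // 44144: total bytes in the finished file
    }
}
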
PlaybackSynthesisRequest.java  +86 −25
@@ -78,6 +78,13 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
        }
    }

    @Override
    public int getMaxBufferSize() {
        // The AudioTrack buffer will be at least MIN_AUDIO_BUFFER_SIZE, so that should always be
        // a safe buffer size to pass in.
        return MIN_AUDIO_BUFFER_SIZE;
    }

    // TODO: add a thread that writes to the AudioTrack?
    @Override
    public int start(int sampleRateInHz, int audioFormat, int channelCount) {
@@ -86,20 +93,6 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
                    + "," + channelCount + ")");
        }

        int channelConfig;
        if (channelCount == 1) {
            channelConfig = AudioFormat.CHANNEL_OUT_MONO;
        } else if (channelCount == 2){
            channelConfig = AudioFormat.CHANNEL_OUT_STEREO;
        } else {
            Log.e(TAG, "Unsupported number of channels: " + channelCount);
            return TextToSpeech.ERROR;
        }

        int minBufferSizeInBytes
                = AudioTrack.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat);
        int bufferSizeInBytes = Math.max(MIN_AUDIO_BUFFER_SIZE, minBufferSizeInBytes);

        synchronized (mStateLock) {
            if (mStopped) {
                if (DBG) Log.d(TAG, "Request has been aborted.");
@@ -111,22 +104,19 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
                return TextToSpeech.ERROR;
            }

            mAudioTrack = new AudioTrack(mStreamType, sampleRateInHz, channelConfig, audioFormat,
                    bufferSizeInBytes, AudioTrack.MODE_STREAM);
            if (mAudioTrack.getState() != AudioTrack.STATE_INITIALIZED) {
                cleanUp();
            mAudioTrack = createAudioTrack(sampleRateInHz, audioFormat, channelCount,
                    AudioTrack.MODE_STREAM);
            if (mAudioTrack == null) {
                return TextToSpeech.ERROR;
            }

            setupVolume();
        }

        return TextToSpeech.SUCCESS;
    }

    private void setupVolume() {
        float vol = clip(mVolume, 0.0f, 1.0f);
        float panning = clip(mPan, -1.0f, 1.0f);
    private void setupVolume(AudioTrack audioTrack, float volume, float pan) {
        float vol = clip(volume, 0.0f, 1.0f);
        float panning = clip(pan, -1.0f, 1.0f);
        float volLeft = vol;
        float volRight = vol;
        if (panning > 0.0f) {
@@ -135,7 +125,7 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
            volRight *= (1.0f + panning);
        }
        if (DBG) Log.d(TAG, "volLeft=" + volLeft + ",volRight=" + volRight);
        if (mAudioTrack.setStereoVolume(volLeft, volRight) != AudioTrack.SUCCESS) {
        if (audioTrack.setStereoVolume(volLeft, volRight) != AudioTrack.SUCCESS) {
            Log.e(TAG, "Failed to set volume");
        }
    }
@@ -148,7 +138,10 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
    public int audioAvailable(byte[] buffer, int offset, int length) {
        if (DBG) {
            Log.d(TAG, "audioAvailable(byte[" + buffer.length + "],"
                    + offset + "," + length + "), thread ID=" + android.os.Process.myTid());
                    + offset + "," + length + ")");
        }
        if (length > getMaxBufferSize()) {
            throw new IllegalArgumentException("buffer is too large (" + length + " bytes)");
        }
        synchronized (mStateLock) {
            if (mStopped) {
@@ -194,4 +187,72 @@ class PlaybackSynthesisRequest extends SynthesisRequest {
        }
        return TextToSpeech.SUCCESS;
    }

    @Override
    public int completeAudioAvailable(int sampleRateInHz, int audioFormat, int channelCount,
            byte[] buffer, int offset, int length) {
        if (DBG) {
            Log.d(TAG, "completeAudioAvailable(" + sampleRateInHz + "," + audioFormat
                    + "," + channelCount + "byte[" + buffer.length + "],"
                    + offset + "," + length + ")");
        }

        synchronized (mStateLock) {
            if (mStopped) {
                if (DBG) Log.d(TAG, "Request has been aborted.");
                return TextToSpeech.ERROR;
            }
            if (mAudioTrack != null) {
                Log.e(TAG, "start() called before completeAudioAvailable()");
                cleanUp();
                return TextToSpeech.ERROR;
            }

            mAudioTrack = createAudioTrack(sampleRateInHz, audioFormat, channelCount,
                    AudioTrack.MODE_STATIC);
            if (mAudioTrack == null) {
                return TextToSpeech.ERROR;
            }

            try {
                mAudioTrack.write(buffer, offset, length);
                mAudioTrack.play();
            } catch (IllegalStateException ex) {
                Log.e(TAG, "Playback error", ex);
                return TextToSpeech.ERROR;
            } finally {
                cleanUp();
            }
        }

        return TextToSpeech.SUCCESS;
    }

    private AudioTrack createAudioTrack(int sampleRateInHz, int audioFormat, int channelCount,
            int mode) {
        int channelConfig;
        if (channelCount == 1) {
            channelConfig = AudioFormat.CHANNEL_OUT_MONO;
        } else if (channelCount == 2){
            channelConfig = AudioFormat.CHANNEL_OUT_STEREO;
        } else {
            Log.e(TAG, "Unsupported number of channels: " + channelCount);
            return null;
        }

        int minBufferSizeInBytes
                = AudioTrack.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat);
        int bufferSizeInBytes = Math.max(MIN_AUDIO_BUFFER_SIZE, minBufferSizeInBytes);
        AudioTrack audioTrack = new AudioTrack(mStreamType, sampleRateInHz, channelConfig,
                audioFormat, bufferSizeInBytes, mode);
        if (audioTrack == null) {
            return null;
        }
        if (audioTrack.getState() != AudioTrack.STATE_INITIALIZED) {
            audioTrack.release();
            return null;
        }
        setupVolume(audioTrack, mVolume, mPan);
        return audioTrack;
    }
}
\ No newline at end of file
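
A note on the design choice in createAudioTrack(): completeAudioAvailable() requests AudioTrack.MODE_STATIC, where the entire buffer is written before play(), while the start()/audioAvailable() path keeps AudioTrack.MODE_STREAM, where write() feeds data during playback. A minimal sketch of the static usage, with illustrative stream type and audio parameters:

import android.media.AudioFormat;
import android.media.AudioManager;
import android.media.AudioTrack;

class StaticTrackExample {
    // Plays a complete 16-bit mono PCM buffer through a MODE_STATIC AudioTrack.
    static AudioTrack playAll(byte[] pcm, int sampleRateInHz) {
        AudioTrack track = new AudioTrack(AudioManager.STREAM_MUSIC, sampleRateInHz,
                AudioFormat.CHANNEL_OUT_MONO, AudioFormat.ENCODING_PCM_16BIT,
                pcm.length, AudioTrack.MODE_STATIC);  // buffer size equals the total audio size
        track.write(pcm, 0, pcm.length);  // load the whole buffer first
        track.play();                     // then start playback
        return track;                     // the caller should release() the track when done
    }
}
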
SynthesisRequest.java  +32 −3
@@ -18,6 +18,13 @@ package android.speech.tts;
/**
 * A request for speech synthesis given to a TTS engine for processing.
 *
 * The engine can provide streaming audio by calling
 * {@link #start}, then {@link #audioAvailable} until all audio has been provided, then finally
 * {@link #done}.
 *
 * Alternatively, the engine can provide all the audio at once, by using
 * {@link #completeAudioAvailable}.
 *
 * @hide Pending approval
 */
public abstract class SynthesisRequest {
@@ -100,6 +107,12 @@ public abstract class SynthesisRequest {
        return mPitch;
    }

    /**
     * Gets the maximum number of bytes that the TTS engine can pass in a single call of
     * {@link #audioAvailable}. This does not apply to {@link #completeAudioAvailable}.
     */
    public abstract int getMaxBufferSize();

    /**
     * Aborts the speech request.
     *
@@ -117,7 +130,7 @@ public abstract class SynthesisRequest {
     * @param sampleRateInHz Sample rate in HZ of the generated audio.
     * @param audioFormat Audio format of the generated audio. Must be one of
     *         the ENCODING_ constants defined in {@link android.media.AudioFormat}.
     * @param channelCount The number of channels
     * @param channelCount The number of channels. Must be {@code 1} or {@code 2}.
     * @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
     */
    public abstract int start(int sampleRateInHz, int audioFormat, int channelCount);
@@ -131,8 +144,8 @@ public abstract class SynthesisRequest {
     * @param buffer The generated audio data. This method will not hold on to {@code buffer},
     *         so the caller is free to modify it after this method returns.
     * @param offset The offset into {@code buffer} where the audio data starts.
     * @param length The number of bytes of audio data in {@code buffer}.
     *         Must be less than or equal to {@code buffer.length - offset}.
     * @param length The number of bytes of audio data in {@code buffer}. This must be
     *         less than or equal to the return value of {@link #getMaxBufferSize}.
     * @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
     */
    public abstract int audioAvailable(byte[] buffer, int offset, int length);
@@ -148,4 +161,20 @@ public abstract class SynthesisRequest {
     */
    public abstract int done();

    /**
     * The service can call this method instead of using {@link #start}, {@link #audioAvailable}
     * and {@link #done} if all the audio data is available in a single buffer.
     *
     * @param sampleRateInHz Sample rate in HZ of the generated audio.
     * @param audioFormat Audio format of the generated audio. Must be one of
     *         the ENCODING_ constants defined in {@link android.media.AudioFormat}.
     * @param channelCount The number of channels. Must be {@code 1} or {@code 2}.
     * @param buffer The generated audio data. This method will not hold on to {@code buffer},
     *         so the caller is free to modify it after this method returns.
     * @param offset The offset into {@code buffer} where the audio data starts.
     * @param length The number of bytes of audio data in {@code buffer}.
     * @return {@link TextToSpeech#SUCCESS} or {@link TextToSpeech#ERROR}.
     */
    public abstract int completeAudioAvailable(int sampleRateInHz, int audioFormat,
            int channelCount, byte[] buffer, int offset, int length);
}
\ No newline at end of file
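
For completeness, a minimal engine-side sketch of the streaming path described in the class documentation above, chunking the audio by getMaxBufferSize() as the updated audioAvailable() contract requires. The audio parameters are illustrative, not part of this commit:

import android.media.AudioFormat;
import android.speech.tts.SynthesisRequest;
import android.speech.tts.TextToSpeech;

class StreamingDeliveryExample {
    // Streams a 16-bit mono PCM buffer at 16000 Hz to the request in chunks.
    static int deliver(SynthesisRequest request, byte[] pcm) {
        if (request.start(16000, AudioFormat.ENCODING_PCM_16BIT, 1) != TextToSpeech.SUCCESS) {
            return TextToSpeech.ERROR;
        }
        int maxChunk = request.getMaxBufferSize();  // audioAvailable() must not be given more than this
        for (int offset = 0; offset < pcm.length; offset += maxChunk) {
            int length = Math.min(maxChunk, pcm.length - offset);
            if (request.audioAvailable(pcm, offset, length) != TextToSpeech.SUCCESS) {
                return TextToSpeech.ERROR;
            }
        }
        return request.done();
    }
}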