
This is a follow-on question to AWS Transcribe S3 .wav file to text. I use a stream to read the contents of a .wav file and send them to AWS.

Instead of getting back the correct transcript, I get nonsense like a bunch of "Yeah." statements. It looks like AWS isn't able to interpret the byte stream correctly, but I'm not sure what's wrong. Does the file need to be encoded somehow, i.e., can I not send the raw .wav bytes straight from the file? Or do I need to tell the service that this is .wav format?

What's wrong here? The input file is a valid .wav voice file that sounds intelligible when I listen to it.

Here is my Java code:

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws URISyntaxException, ExecutionException, InterruptedException, LineUnavailableException {
        System.out.println(System.getProperty("java.version"));
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            CompletableFuture<Void> result = client.startStreamTranscription(getRequest(16000),
                    new AudioStreamPublisher(getStreamFromFile()),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile() {
        try {
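            // note: Java does not expand "~"; an absolute path is needed here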
            File inputFile = new File("~/work/transcribe/src/main/resources/story/media/Story3.m4a.wav");
            InputStream audioStream = new FileInputStream(inputFile);
            return audioStream;
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(Integer mediaSampleRateHertz) {
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                .mediaEncoding(MediaEncoding.PCM)
                .mediaSampleRateHertz(mediaSampleRateHertz)
                .build();
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}

Here's my program output:

Received Initial response
No results
No results
Yeah.
No results
Yeah.
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Yeah.
No results
No results
Oh,
No results
Oh,
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
No results
Oh,
No results
No results
No results
... (about 60 more "No results" lines omitted)
thebiggestlebowski
  • Looking into this to see if this is an SDK issue. – smac2020 Jan 21 '21 at 23:20
  • @smac2020 Thanks for looking. Question - how does the service know this is .wav format or does it care/detect automatically? The code doesn't specify the format, which is why I ask. – thebiggestlebowski Jan 22 '21 at 14:49
  • According to the Javadocs - you can set the media encoding using https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/services/transcribestreaming/model/StartStreamTranscriptionRequest.html#mediaEncoding-- – smac2020 Jan 22 '21 at 16:54
  • I used the .mediaEncoding(MediaEncoding.PCM). Does that work with a .wav file? It seems to me that the service isn't understanding the data sent to it, so I suspect it's something like incorrect encoding. – thebiggestlebowski Jan 22 '21 at 17:48
  • @smac2020 Thank you for your help on this! – thebiggestlebowski Jan 23 '21 at 22:21

2 Answers


The audio file had a sample rate of 44.1 kHz, while the request declared 16 kHz. After converting the file to 16 kHz, it worked:

https://drive.google.com/file/d/1mYVbNlYK3SpGT4NbFRYGn86177eTCqhd/view?usp=sharing
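
To confirm the actual rate programmatically, a minimal sketch using javax.sound.sampled (assuming the .wav header can be parsed by AudioSystem) looks like this:

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import java.io.File;

public class PrintSampleRate {
    public static void main(String[] args) throws Exception {
        // Prints the sample rate declared in the file header, e.g. 44100.0 or 16000.0
        AudioFormat format = AudioSystem.getAudioInputStream(new File(args[0])).getFormat();
        System.out.println(format.getSampleRate());
    }
}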


smac2020
  • 1. How did you determine the sample rate? 2. How did you convert it to 16 kHz? – thebiggestlebowski Jan 22 '21 at 23:20
  • Use a tool like https://www.nch.com.au/switch/index.html – smac2020 Jan 23 '21 at 00:01
  • Thanks - when I converted the sample rate, it worked. Question - is there any way the Transcribe service could detect a mismatch between the submitted sample rate and the actual sample rate, as in my case? It's interesting that the same file worked when loaded to S3, but didn't work when the file was sent as a stream. I suspect others will hit this problem and an error response would be very useful (if it's possible). – thebiggestlebowski Jan 23 '21 at 17:03
  • If the Transcribe service can't detect it, could the SDK get that info from the file itself and send it as default values (and error or warn if the submitted value doesn't match the actual one)? The SDK could read media file metadata, just like the command "mediainfo". – thebiggestlebowski Jan 23 '21 at 17:04
  • As this is your code, you should do a PR against https://github.com/awsdocs/aws-doc-sdk-examples/tree/master/javav2/example_code/transcribe/src/main/java/com/amazonaws/transcribestreaming. Put this as a note in the class. That will be very helpful for people trying this use case in the future. – smac2020 Jan 23 '21 at 17:05
  • I'll update my question. I'll also submit a feature request so that the SDK and/or Transcribe service returns an error if metadata values don't match actual values. In fact, I can see no use in submitting wrong values, so shouldn't the SDK always just take them from the file metadata? I'm sure you're trying to create an API that's as easy as possible - this would make things much better. – thebiggestlebowski Jan 23 '21 at 17:07
  • Looks like it's as easy as incorporating this into the API ... and/or adding to the code examples: https://stackoverflow.com/questions/7275647/how-can-i-read-info-from-wave-file-using-javasound-java-java-sound. AudioFormat.java has all the relevant file metadata. – thebiggestlebowski Jan 23 '21 at 17:26
  • That is a great idea. You should update your code example to use this logic and then do a PR against the AWS Example GitHub (link above). In the metadata (the metadata in the Java code examples), put your name. It's important that people who contribute code get credit. – smac2020 Jan 23 '21 at 17:28
  • I created a PR: https://github.com/awsdocs/aws-doc-sdk-examples/pull/1642 and added some comments for developers. I suggest this paradigm of verifying and/or automatically detecting correct values be followed anywhere in the AWS client SDK where files are uploaded that need metadata parameters. – thebiggestlebowski Feb 09 '21 at 18:46

As smac2020 pointed out, the sample rate was wrong. Debugging incorrect metadata values passed to AWS is tricky because AWS returns no errors; you just get back an incorrect transcription. So the lesson here is: make sure you know what the right values are. Some of them can be detected automatically.

If you're on a Mac, the mediainfo tool is quite useful:

brew install mediainfo
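
For example, pointing it at the file from the question (file name taken from the question; output format may vary) prints a report whose Audio section includes the sampling rate:

mediainfo Story3.m4a.wav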

So is ffmpeg:

brew install ffmpeg
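
If you'd rather use ffmpeg than Switch for the conversion, something like this one-liner (untested here; the output file name is arbitrary) resamples the file to 16 kHz:

ffmpeg -i Story3.m4a.wav -ar 16000 Story3-16k.wav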

Here is an updated example where I automatically detect the sample rate using AudioFormat.java. Ideally, the AWS SDK would do this for you and throw an exception if the media file is outside the parameters of what can be transcribed. Note that I had to convert my original file to a 16,000 Hz sample rate using the tool nch.com.au/switch/index.html. It would be great (hint, hint) if the SDK could also modify the sample rate, etc., so that files can be adjusted to fit within the input parameters.

package com.amazonaws.transcribe;

import org.reactivestreams.Publisher;
import org.reactivestreams.Subscriber;
import org.reactivestreams.Subscription;
import software.amazon.awssdk.core.SdkBytes;
import software.amazon.awssdk.regions.Region;
import software.amazon.awssdk.services.transcribestreaming.TranscribeStreamingAsyncClient;
import software.amazon.awssdk.services.transcribestreaming.model.*;

import javax.sound.sampled.*;
import java.io.*;
import java.nio.ByteBuffer;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;

import static javax.sound.sampled.AudioFormat.Encoding.*;


public class TranscribeFileFromStream {
    private static final Region REGION = Region.US_EAST_1;
    private static TranscribeStreamingAsyncClient client;

    public static void main(String args[]) throws Exception {
        System.setProperty("AWS_ACCESS_KEY_ID", "myId");
        System.setProperty("AWS_SECRET_ACCESS_KEY", "myKey");

        System.out.println(System.getProperty("java.version"));
       // BasicConfigurator.configure();
        client = TranscribeStreamingAsyncClient.builder()
                .region(REGION)
                .build();
        try {
            File inputFile = new File("/home/me/work/transcribe/src/main/resources/test-file.wav");
            CompletableFuture<Void> result = client.startStreamTranscription(
                    getRequest(inputFile),
                    new AudioStreamPublisher(getStreamFromFile(inputFile)),
                    getResponseHandler());
            result.get();
        } finally {
            if (client != null) {
                client.close();
            }
        }
    }

    private static InputStream getStreamFromFile(File inputFile) {
        try {
            return new FileInputStream(inputFile);
        } catch (FileNotFoundException e) {
            throw new RuntimeException(e);
        }
    }

    private static StartStreamTranscriptionRequest getRequest(File inputFile) throws IOException, UnsupportedAudioFileException {
        //TODO: I read the file twice in this example.  Can this be more performant?
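        // A possible improvement (untested sketch): open the AudioInputStream once,
        // take its AudioFormat for this request, and hand the same stream to the
        // AudioStreamPublisher instead of reopening the file in getStreamFromFile().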
        AudioInputStream audioInputStream = AudioSystem.getAudioInputStream(inputFile);
        AudioFormat audioFormat = audioInputStream.getFormat();
        return StartStreamTranscriptionRequest.builder()
                .languageCode(LanguageCode.EN_US)
                //.mediaEncoding(MediaEncoding.PCM)
                .mediaEncoding(getAwsMediaEncoding(audioFormat))
                .mediaSampleRateHertz(getAwsSampleRate(audioFormat))
                .build();
    }

    private static MediaEncoding getAwsMediaEncoding(AudioFormat audioFormat) {
        final String javaMediaEncoding = audioFormat.getEncoding().toString();

        if (PCM_SIGNED.toString().equals(javaMediaEncoding)) {
            return MediaEncoding.PCM;
        } else if (PCM_UNSIGNED.toString().equals(javaMediaEncoding)){
            return MediaEncoding.PCM;
        } /*else if (ALAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ALAW maps to AWS media encodings.
            return MediaEncoding.OGG_OPUS;
        } else if (ULAW.toString().equals(javaMediaEncoding)){
            //WARNING: I have no idea how ULAW maps to AWS encodings.  
            return MediaEncoding.FLAC;
        }*/

        throw new IllegalArgumentException("Not a recognized media encoding:" + javaMediaEncoding);
    }

    private static Integer getAwsSampleRate(AudioFormat audioFormat) {
        return Math.round(audioFormat.getSampleRate());
    }

    private static StartStreamTranscriptionResponseHandler getResponseHandler() {
        return StartStreamTranscriptionResponseHandler.builder()
                .onResponse(r -> {
                    System.out.println("Received Initial response");
                })
                .onError(e -> {
                    System.out.println(e.getMessage());
                    StringWriter sw = new StringWriter();
                    e.printStackTrace(new PrintWriter(sw));
                    System.out.println("Error Occurred: " + sw.toString());
                })
                .onComplete(() -> {
                    System.out.println("=== All records stream successfully ===");
                })
                .subscriber(event -> {
                    List<Result> results = ((TranscriptEvent) event).transcript().results();
                    if (results.size() > 0) {
                        if (!results.get(0).alternatives().get(0).transcript().isEmpty()) {
                            System.out.println(results.get(0).alternatives().get(0).transcript());
                        } else {
                            System.out.println("Empty result");
                        }
                    } else {
                        System.out.println("No results");
                    }
                })
                .build();
    }

    private static class AudioStreamPublisher implements Publisher<AudioStream> {
        private final InputStream inputStream;
        private static Subscription currentSubscription;


        private AudioStreamPublisher(InputStream inputStream) {
            this.inputStream = inputStream;
        }

        @Override
        public void subscribe(Subscriber<? super AudioStream> s) {

            if (this.currentSubscription == null) {
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            } else {
                this.currentSubscription.cancel();
                this.currentSubscription = new SubscriptionImpl(s, inputStream);
            }
            s.onSubscribe(currentSubscription);
        }
    }

    public static class SubscriptionImpl implements Subscription {
        private static final int CHUNK_SIZE_IN_BYTES = 1024 * 1;
        private final Subscriber<? super AudioStream> subscriber;
        private final InputStream inputStream;
        private ExecutorService executor = Executors.newFixedThreadPool(1);
        private AtomicLong demand = new AtomicLong(0);

        SubscriptionImpl(Subscriber<? super AudioStream> s, InputStream inputStream) {
            this.subscriber = s;
            this.inputStream = inputStream;
        }

        @Override
        public void request(long n) {
            if (n <= 0) {
                subscriber.onError(new IllegalArgumentException("Demand must be positive"));
            }

            demand.getAndAdd(n);

            executor.submit(() -> {
                try {
                    do {
                        ByteBuffer audioBuffer = getNextEvent();
                        if (audioBuffer.remaining() > 0) {
                            AudioEvent audioEvent = audioEventFromBuffer(audioBuffer);
                            subscriber.onNext(audioEvent);
                        } else {
                            subscriber.onComplete();
                            break;
                        }
                    } while (demand.decrementAndGet() > 0);
                } catch (Exception e) {
                    subscriber.onError(e);
                }
            });
        }

        @Override
        public void cancel() {
            executor.shutdown();
        }

        private ByteBuffer getNextEvent() {
            ByteBuffer audioBuffer = null;
            byte[] audioBytes = new byte[CHUNK_SIZE_IN_BYTES];

            int len = 0;
            try {
                len = inputStream.read(audioBytes);

                if (len <= 0) {
                    audioBuffer = ByteBuffer.allocate(0);
                } else {
                    audioBuffer = ByteBuffer.wrap(audioBytes, 0, len);
                }
            } catch (IOException e) {
                throw new UncheckedIOException(e);
            }

            return audioBuffer;
        }

        private AudioEvent audioEventFromBuffer(ByteBuffer bb) {
            return AudioEvent.builder()
                    .audioChunk(SdkBytes.fromByteBuffer(bb))
                    .build();
        }
    }
}
thebiggestlebowski