I ended up solving my problem. I'm providing the issue and my solution to anyone needing to do this in the future.
Background info:
I'm using AngularJS to make this AJAX request, but the idea is the same whether you use AngularJS, jQuery AJAX, or plain XHR.
Code:
//Build the multipart/form-data request body.
//Everything is assigned to a FormData object so that XHR can
//automatically generate proper multipart formatting
var form = new FormData();
//JSON metadata that accompanies the audio
var data = {};
data['messageHeader'] = {};
var jsonData = JSON.stringify(data);
//wrap the JSON in a Blob so its part is sent with
//Content-Type: application/json. Appending the bare string would
//send the part without any content type
var jsonBlob = new Blob([jsonData],{type: "application/json"});
//assign json metadata blob and audio blob to the form
//(previously the raw string was appended and jsonBlob was never used)
form.append("request", jsonBlob);
form.append("audio",response); //response is the audio blob
//make the post request and play the audio found in the multipart response
//Notes:
//content-type set to undefined so the multipart type (including its
//boundary) can be assigned automatically instead of angular's default
//transformRequest: angular.identity hands the FormData over untouched
//responseType: arraybuffer so untouched binary data can be received
$http({
    method: "POST",
    url: endpoint + path,
    headers: {
        'Authorization': 'Bearer ' + $cookies.get('token'),
        'Content-Type': undefined
    },
    transformRequest: angular.identity,
    data: form,
    responseType: "arraybuffer"
})
//.then() is used instead of .success(), which was deprecated and
//later removed from $http; the payload is on httpResponse.data
.then(function(httpResponse){
    //ArrayBuffer of the whole multipart response
    var responseBuffer = httpResponse.data;
    //Uint8Array view lets us iterate over the bytes
    var responseBytes = new Uint8Array(responseBuffer);

    //build a string with exactly one character per byte (this is NOT
    //a UTF-8 decode). String indexes therefore equal byte offsets in
    //the ArrayBuffer. Only used to locate boundaries and headers
    var holder = "";
    for (var i = 0; i < responseBytes.length; i++){
        holder += String.fromCharCode(responseBytes[i]);
    }

    //the boundary delimiter is the first line, without its line break
    //(the trailing \r of a CRLF line ending must not be part of it)
    var boundary = holder.substring(0, holder.indexOf("\n")).replace(/\r$/, "");
    if (boundary.indexOf("--") !== 0){
        console.log("error parsing multipart response: boundary not found");
        return;
    }

    //walk from delimiter to delimiter and record the byte range
    //of every part's body (the bytes after the part's headers)
    var parts = [];
    var delimiterAt = 0;
    while (delimiterAt !== -1){
        var partStart = delimiterAt + boundary.length;
        //the closing delimiter is the boundary followed by "--"
        if (holder.substring(partStart, partStart + 2) === "--"){
            break;
        }
        var nextDelimiterAt = holder.indexOf(boundary, partStart);
        var partEnd = (nextDelimiterAt === -1) ? holder.length : nextDelimiterAt;

        //headers end at the first blank line; accept CRLF or bare LF
        //and use whichever occurs first
        var crlfAt = holder.indexOf("\r\n\r\n", partStart);
        var lfAt = holder.indexOf("\n\n", partStart);
        var separatorAt = -1;
        var separatorLength = 0;
        if (crlfAt !== -1 && (lfAt === -1 || crlfAt < lfAt)){
            separatorAt = crlfAt;
            separatorLength = 4;
        } else if (lfAt !== -1){
            separatorAt = lfAt;
            separatorLength = 2;
        }

        if (separatorAt !== -1 && separatorAt < partEnd){
            var bodyStart = separatorAt + separatorLength;
            var bodyEnd = partEnd;
            //the line break right before the next delimiter belongs
            //to the delimiter, not to the body
            if (nextDelimiterAt !== -1 && holder.charAt(bodyEnd - 1) === "\n"){
                bodyEnd--;
                if (holder.charAt(bodyEnd - 1) === "\r"){
                    bodyEnd--;
                }
            }
            parts.push({
                bodyStart: bodyStart,
                bodyEnd: Math.max(bodyStart, bodyEnd)
            });
        }
        delimiterAt = nextDelimiterAt;
    }

    if (parts.length === 0){
        console.log("error parsing multipart response: no parts found");
        return;
    }

    //PARSE FIRST PART
    //decode the original bytes as UTF-8 before parsing so that
    //non-ASCII characters in the JSON survive
    var jsonBytes = responseBytes.subarray(parts[0].bodyStart, parts[0].bodyEnd);
    //metadata of the response, available for the application to use
    var JSONResponse = JSON.parse(new TextDecoder("utf-8").decode(jsonBytes));

    //PARSE SECOND PART
    if (parts.length < 2){
        console.log("error parsing multipart response: no audio part found");
        return;
    }
    //copy exactly the audio bytes out of the original ArrayBuffer
    //(headers and the closing boundary are excluded)
    var audio = responseBuffer.slice(parts[1].bodyStart, parts[1].bodyEnd);

    //hand off audio to AudioContext for automatic decoding
    audio_context.decodeAudioData(audio, function(buffer){
        var audioBuffer = buffer;
        //create a sound source
        var source = audio_context.createBufferSource();
        //attach audioBuffer to sound source
        source.buffer = audioBuffer;
        //wire source to speakers
        source.connect(audio_context.destination);
        //on audio completion, re-enable mic button
        source.onended = function(){
            console.log("ended");
            //onended fires outside of angular, so $apply is needed
            //for the scope change to be picked up
            $scope.$apply(function(){
                $scope.playing = false;
            });
        };
        //start playing audio
        source.start(0);
    }, function(){
        //callback for when there is an error
        console.log("error decoding audio");
    });
}, function(error){
    //the request itself failed (network error or non-2xx status)
    console.log("error requesting audio", error);
});
Overview:
You need to accept the response as pure binary data (ArrayBuffer). Most libraries will give it to you as a string, which is cool for normal requests but bad for binary data.
You then step through the data to find the multipart boundaries.
You then split at the boundaries.
Find the index where the part you know contains binary data begins,
and then retrieve the original bytes from the ArrayBuffer starting at that index.
In my case I send that binary to the speakers; however, if it's an image you can build a Blob, get a URL from FileReader, and then set that as the source of an image.