One possible approach is to use the eSpeak command-line tool (Windows & Linux), http://espeak.sourceforge.net/, and wrap it with Node.js:
const { execFile } = require('child_process');

// Command-line usage: node <script> [outputFile] [voice] [text]
// Each argument falls back to a default when omitted.
const outputFile = process.argv[2] || "output.wav";
const voice = process.argv[3] || "en-uk-north";
const text = process.argv[4] || "hello there buddy";

// Pass the arguments as an array and run the binary directly (no shell), so
// quotes or shell metacharacters in the text, voice or file name cannot break
// the command line or inject extra commands.
const espeakArgs = ['-v', voice, '-w', outputFile, text];

execFile('espeak.exe', espeakArgs, (err) => {
    if (err) {
        console.log("Error occurred: ", err);
        return;
    }
});
This gives a fairly low quality output.
I've also played with the Bing Speech API and the output is very good; I've created a Node.js example below. You will need to sign up for an API key, but this is very easy (go to https://azure.microsoft.com/en-us/try/cognitive-services/ and select "Speech").
const key = 'your api key here';

/**
 * Synthesize speech for a piece of text and save the audio to a file.
 *
 * The text is read from process.argv[2] and the output file name from
 * process.argv[3]; both have defaults. The function first posts the
 * subscription key to the issueToken endpoint, then posts an SSML document
 * to the synthesize endpoint using the returned value as a Bearer token,
 * and finally writes the response body to the output file.
 *
 * Failures are logged to the console; nothing is thrown or returned.
 *
 * @param {string} apiKey - Subscription key sent in the 'Ocp-Apim-Subscription-Key' header.
 */
function synthesizeSpeech(apiKey)
{
    const fs = require('fs');
    const request = require('request');
    const xmlbuilder = require('xmlbuilder');

    const text = process.argv[2] || "The fault, dear Brutus, is not in our stars, But in ourselves, that we are underlings.";
    const outputFile = process.argv[3] || "speech.wav";

    // Log a failed HTTP call. `resp` may be undefined when `err` is set, so it
    // must not be dereferenced unconditionally (doing so would throw a
    // TypeError and hide the real error).
    const logFailure = (err, resp) => {
        console.log(err, resp ? resp.body : undefined);
    };

    // Build the SSML request body describing the voice and the text to speak.
    const ssml_doc = xmlbuilder.create('speak')
        .att('version', '1.0')
        .att('xml:lang', 'en-au')
        .ele('voice')
        .att('xml:lang', 'en-au')
        .att('xml:gender', 'Female')
        .att('name', 'Microsoft Server Speech Text to Speech Voice (en-AU, HayleyRUS)')
        .txt(text)
        .end();
    const post_speak_data = ssml_doc.toString();

    console.log('Synthesizing speech: ', text);

    // Step 1: exchange the subscription key for an access token.
    request.post({
        url: 'https://api.cognitive.microsoft.com/sts/v1.0/issueToken',
        headers: {
            'Ocp-Apim-Subscription-Key' : apiKey
        }
    }, function (err, resp, access_token) {
        if (err || resp.statusCode !== 200) {
            logFailure(err, resp);
            return;
        }

        try {
            // Step 2: send the SSML and receive the audio.
            request.post({
                url: 'https://speech.platform.bing.com/synthesize',
                body: post_speak_data,
                headers: {
                    'content-type' : 'application/ssml+xml',
                    'X-Microsoft-OutputFormat' : 'riff-16khz-16bit-mono-pcm',
                    'Authorization': 'Bearer ' + access_token,
                    'X-Search-AppId': '9FCF779F0EFB4E8E8D293EEC544221E9',
                    'X-Search-ClientID': '0A13B7717D0349E683C00A6AEA9E8B6D',
                    'User-Agent': 'Node.js-Demo'
                },
                // null encoding: the body is audio, not text, so it must not
                // be decoded as a string.
                encoding: null
            }, function (err, resp, data) {
                if (err || resp.statusCode !== 200) {
                    logFailure(err, resp);
                    return;
                }

                try {
                    console.log('Saving output to file: ', outputFile);
                    fs.writeFileSync(outputFile, data);
                } catch (e) {
                    console.log(e.message);
                }
            });
        } catch (e) {
            // Only catches errors thrown synchronously while starting the request.
            console.log(e.message);
        }
    });
}

synthesizeSpeech(key);
Also check out the MARY project here: http://mary.dfki.de/. This is an open-source server that you can install, and the voice output is very good. You could make calls to the server from Node.js.
Once you have installed the MARY speech engine (which is quite easy), you can call it like this:
"use strict";

const fs = require('fs');
const request = require('request');

// Command-line usage: node <script> [text] [outputFile]
const text = process.argv[2] || "The fault, dear Brutus, is not in our stars, But in ourselves, that we are underlings.";
const outputFile = process.argv[3] || "speech_mary_output.wav";

// The text is percent-encoded so that characters such as '&', '#', '+' or '%'
// cannot truncate or corrupt the query string.
// NOTE(review): a literal '!' is appended to the spoken text -- confirm this is intended.
const inputText = encodeURIComponent(text + '!');

const options = {
    url: `http://localhost:59125/process?INPUT_TEXT=${inputText}&INPUT_TYPE=TEXT&OUTPUT_TYPE=AUDIO&AUDIO=WAVE_FILE&LOCALE=en_US&VOICE=cmu-slt-hsmm`,
    encoding: null // Binary data.
};

console.log('Synthesizing speech (using Mary engine): ', text);
console.log('Calling: ', options.url);

request.get(options, function (err, resp, data) {
    if (err || resp.statusCode !== 200) {
        // `resp` may be undefined when `err` is set; guard the access so the
        // real error is logged instead of a TypeError being thrown.
        console.log(err, resp ? resp.body : undefined);
        return;
    }

    try {
        console.log(`Saving output to file: ${outputFile}, length: ${data.length} byte(s)`);
        // The response body was requested as binary data (encoding: null)
        // and is written as-is.
        fs.writeFileSync(outputFile, data);
    } catch (e) {
        console.log(e.message);
    }
});
This will synthesize speech for you. No API key required!