I am trying to implement the SpecAugment paper (https://arxiv.org/abs/1904.08779) in order to achieve better results in speech-to-text. I am working on top of the Mozilla DeepSpeech repo, which uses the TensorFlow Dataset API to load the data:
dataset = (tf.data.Dataset.from_generator(
               generate_values,
               output_types=(tf.string, (tf.int64, tf.int32, tf.int64), tf.int64))
           .map(entry_to_features, num_parallel_calls=tf.data.experimental.AUTOTUNE)
           .cache(cache_path)
           .map(augment_spec, num_parallel_calls=tf.data.experimental.AUTOTUNE)
           .window(batch_size, drop_remainder=True).flat_map(batch_fn)
           .prefetch(num_gpus))
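For context, my understanding (which may be off) of why this pipeline matters for my problem below: a function passed to .map is traced once, at graph-construction time, with symbolic tensors as arguments, so Python-side code inside it never sees concrete values or shapes. A toy example of what I mean:

import tensorflow as tf

def double(x):
    print("traced with:", x)  # executes once, during tracing, with a symbolic Tensor
    return x * 2

ds = tf.data.Dataset.from_tensor_slices([1, 2, 3]).map(double)
# "traced with: Tensor(...)" is printed a single time; NumPy calls such as
# np.ones_like(x) inside the mapped function only ever see that symbolic Tensor.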
The audio is converted to a spectrogram and MFCCs are computed, so when the data arrives at the augment_spec function it has a shape of (?, 26), where ? is the frame count of a variable-length audio clip. To mask certain parts of these spectrogram images, I thought of multiplying two tensors, the features and a mask of ones and zeros, using some code like this:
def augment_spec(features, features_len, transcript):
    # print("\n\n\n\n duration", duration.eval())
    sample_rate = 8000
    mask = np.ones_like(features)

    time_len = features_len.shape[0]

    n_time_masks = np.random.randint(0, 4)   # number of time masks
    n_freq_masks = np.random.randint(0, 3)   # number of frequency masks

    for _ in range(n_time_masks):
        time_delta = np.random.randint(int(sample_rate / 10), int(sample_rate / 2))
        time_start = np.random.randint(0, time_len - time_delta)
        print(time_start, time_delta)
        mask[time_start:time_start + time_delta] = 0

    for _ in range(n_freq_masks):
        freq_delta = np.random.randint(1, 4)
        freq_start = np.random.randint(0, features_len - freq_delta)
        print(freq_start, freq_delta)
        mask[:, freq_start:freq_start + freq_delta] = 0

    mask = tf.convert_to_tensor(mask, dtype=tf.float32)
    return tf.math.multiply(features, mask), features_len, transcript
The problem is that these instructions:

mask = np.ones_like(features)
time_len = features_len.shape[0]

do not work, since when the graph is being built the tensors do not yet have a defined shape, so I do not know how to implement this. Could you help me with this? Thanks a lot!!
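If it helps clarify what I am running into, this is the static vs. dynamic shape distinction as I understand it (a minimal standalone sketch, not DeepSpeech code):

import tensorflow as tf

features = tf.placeholder(tf.float32, shape=[None, 26])  # unknown number of frames
static_time = features.shape[0]       # Dimension(None): unknown at graph-build time
dynamic_time = tf.shape(features)[0]  # int32 Tensor: resolved only at run time

# np.ones_like(features) fails because NumPy needs the concrete shape immediately,
# whereas tf.ones_like(features) builds an op whose shape is resolved at run time.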
UPDATE: Following @kempy's answer, my code now looks like this:
def augment_spec(features, features_len, transcript):
    # print("\n\n\n\n duration", duration.eval())
    sample_rate = 8000
    mask = tf.Variable(tf.ones_like(features), validate_shape=False)
    time_len = tf.shape(features)[0]

    n_time_masks = np.random.randint(0, 4)
    n_freq_masks = np.random.randint(0, 3)
    # n_time_masks = tf.random.uniform(
    #     shape=(), minval=0, maxval=4, dtype=tf.int32)
    # n_freq_masks = tf.random.uniform(
    #     shape=(), minval=0, maxval=3, dtype=tf.int32)

    for _ in range(n_time_masks):
        time_delta = tf.random.uniform(
            shape=(), minval=int(sample_rate / 10), maxval=int(sample_rate / 2), dtype=tf.int32)
        time_start = tf.random.uniform(
            shape=(), minval=0, maxval=time_len - time_delta, dtype=tf.int32)
        # indexes = list(range(time_start, time_start + time_delta))
        indexes = tf.range(time_start, time_start + time_delta, delta=1, dtype=tf.int32, name='range')
        tf.scatter_update(mask, indexes, 0)

    mask = tf.transpose(mask, (1, 0))
    for _ in range(n_freq_masks):
        # freq_delta = np.random.randint(1, 4)
        # freq_start = np.random.randint(0, features_len - freq_delta)
        freq_delta = tf.random.uniform(
            shape=(), minval=1, maxval=4, dtype=tf.int32)
        freq_start = tf.random.uniform(
            shape=(), minval=0, maxval=(features_len - freq_delta), dtype=tf.int32)
        # indexes = list(range(freq_start, freq_start + freq_delta))
        indexes = tf.range(freq_start, freq_start + freq_delta, delta=1, dtype=tf.int32, name='range')
        tf.scatter_update(mask, indexes, 0)

    mask = tf.transpose(mask, (1, 0))
    mask = tf.convert_to_tensor(mask, dtype=tf.float32)
    masked = tf.multiply(features, mask)
    return masked, features_len, transcript
But now I am getting this error:

ValueError: Tensor("Variable:0", dtype=float32_ref) must be from the same graph as Tensor("tower_0/Mean:0", shape=(), dtype=float32, device=/device:GPU:0).

I do not know how to solve this. Thank you for your help!
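In case it is useful for an answer, this is the variable-free direction I am considering next (entirely my own sketch, not from the paper or DeepSpeech; time_mask and max_delta are names I made up): build each mask by comparing tf.range against the random start/end, so that no tf.Variable and no tf.scatter_update are needed at all:

import tensorflow as tf

def time_mask(features, max_delta=100):
    """Zero out one random band of frames; features has shape (time, 26)."""
    time_len = tf.shape(features)[0]
    delta = tf.random.uniform(shape=(), minval=1, maxval=max_delta, dtype=tf.int32)
    # Assumes every utterance is longer than max_delta frames.
    start = tf.random.uniform(shape=(), minval=0, maxval=time_len - delta, dtype=tf.int32)
    idx = tf.range(time_len)
    keep = tf.logical_or(idx < start, idx >= start + delta)
    # The (time, 1) mask broadcasts across the 26 feature channels.
    return features * tf.cast(keep, features.dtype)[:, tf.newaxis]

A frequency mask would be the same idea with tf.range over the 26 channels and the mask reshaped to (1, 26). Would this be a sound way to avoid the cross-graph Variable error?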