Hi, I'm trying to get a custom spectrogram layer working and I can't figure out what I'm doing wrong.
import tensorflow as tf
import tensorflow_io as tfio  # provides tfio.audio.dbscale


class MelLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=MODEL_SR,  # MODEL_SR is defined elsewhere in my code
        num_mel_channels=80,
        freq_min=1,
        freq_max=7600,
        as_3D_tensor=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Defining the mel filter. This filter will be multiplied with the STFT output
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2 + 1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )
        self.as_3D_tensor = as_3D_tensor

    def call(self, audio, training=True):
        stft = tf.signal.stft(
            tf.squeeze(audio),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )
        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)
        # Multiplying the mel filterbank with the squared magnitude and scaling it to the dB scale
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)
        print(type(log_mel_spec))
        return tf.expand_dims(log_mel_spec, axis=-1) if self.as_3D_tensor else tf.squeeze(log_mel_spec)

    def get_config(self):
        config = super(MelLayer, self).get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config
class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2 + 1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.

        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A batch of mono waveforms.

        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)
            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)
        magnitude_spectrograms = tf.abs(spectrograms)
        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)
        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())
        return config
When I try to build the model:

input = tf.keras.layers.Input(shape=(32000, 1))  # audio clips are 32000 samples, mono channel
mel_l = MelLayer(
    frame_length=512,
    frame_step=512,
    fft_length=None,
    sampling_rate=16000,
    num_mel_channels=80,
    freq_min=1,
    freq_max=2000,
    name="spec",
)(input)
output = tf.keras.layers.Conv2D(16, 3)(mel_l)
This throws the following error:
/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional.py in _get_input_channel(self, input_shape)
370 def _get_input_channel(self, input_shape):
371 channel_axis = self._get_channel_axis()
--> 372 if input_shape.dims[channel_axis].value is None:
373 raise ValueError('The channel dimension of the inputs should be defined. '
374 f'The input_shape received is {input_shape}, '
TypeError: 'NoneType' object is not subscriptable
I think it has to do with my custom class definition, as if it weren't detecting the input shape. The custom layer is based on this Keras example: https://keras.io/examples/audio/melgan_spectrogram_inversion/. I tried to copy the same steps, but it does not seem to work.
CodePudding user response:
TensorFlow can't compute the output shape of your layer. Since Conv2D requires an input with a fully defined shape (4 dimensions), it fails when the output shape of the previous layer is unknown (None).
To fix that, you need to specify which axis you want to squeeze in your call function. Here, I specify that it is the last axis (the channel axis) that needs to be squeezed:
def call(self, audio, training=True):
    stft = tf.signal.stft(
        tf.squeeze(audio, axis=-1),
        self.frame_length,
        self.frame_step,
        self.fft_length,
        pad_end=True,
    )
If you don't specify the axis, the output shape of the layer can't be computed statically: tf.squeeze removes every dimension of size 1, so the result would be different when the batch size is 1 than when it is greater than 1.
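You can see the difference directly from the static shapes Keras infers. The snippet below is a minimal sketch of that check (the exact string printed for the unknown shape may vary between TensorFlow versions):

import tensorflow as tf

x = tf.keras.layers.Input(shape=(32000, 1))  # batch dimension is None

# Without an explicit axis, tf.squeeze would also drop the batch axis whenever
# the batch happens to contain a single example, so the static shape is lost.
print(tf.squeeze(x).shape)           # unknown shape, e.g. <unknown>
print(tf.squeeze(x, axis=-1).shape)  # (None, 32000): only the channel axis is removed

With axis=-1 in place inside MelLayer.call, the spectrogram shape is fully defined and Conv2D can infer its input channels. A quick sanity check, assuming the patched MelLayer from above (the exact number of frames depends on frame_length, frame_step and pad_end):

inp = tf.keras.layers.Input(shape=(32000, 1))
mel_l = MelLayer(frame_length=512, frame_step=512, sampling_rate=16000,
                 num_mel_channels=80, freq_min=1, freq_max=2000, name="spec")(inp)
print(mel_l.shape)  # something like (None, 63, 80, 1): (batch, frames, mel bins, channel)
output = tf.keras.layers.Conv2D(16, 3)(mel_l)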