Keras custom Layer: "input_shape" is not suscriptable-CodePudding

Hi i'm trying to get a custom spectrogram layer going and I can't

class MelLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        frame_length=1024,
        frame_step=256,
        fft_length=None,
        sampling_rate=MODEL_SR,
        num_mel_channels=80,
        freq_min=1,
        freq_max=7600,
        as_3D_tensor=True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.frame_length = frame_length
        self.frame_step = frame_step
        self.fft_length = fft_length
        self.sampling_rate = sampling_rate
        self.num_mel_channels = num_mel_channels
        self.freq_min = freq_min
        self.freq_max = freq_max
        # Defining mel filter. This filter will be multiplied with the STFT output
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.num_mel_channels,
            num_spectrogram_bins=self.frame_length // 2   1,
            sample_rate=self.sampling_rate,
            lower_edge_hertz=self.freq_min,
            upper_edge_hertz=self.freq_max,
        )
        self.as_3D_tensor = as_3D_tensor

    def call(self, audio, training=True):

        stft = tf.signal.stft(
            tf.squeeze(audio),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )

        # Taking the magnitude of the STFT output
        magnitude = tf.abs(stft)

        # Multiplying the Mel-filterbank with the magnitude and scaling it using the db scale
        mel = tf.matmul(tf.square(magnitude), self.mel_filterbank)
        log_mel_spec = tfio.audio.dbscale(mel, top_db=80)

        print(type(log_mel_spec))
 
        return tf.expand_dims(log_mel_spec,axis=-1) if self.as_3D_tensor else tf.squeeze(log_mel_spec)


    def get_config(self):
        config = super(MelLayer, self).get_config()
        config.update(
            {
                "frame_length": self.frame_length,
                "frame_step": self.frame_step,
                "fft_length": self.fft_length,
                "sampling_rate": self.sampling_rate,
                "num_mel_channels": self.num_mel_channels,
                "freq_min": self.freq_min,
                "freq_max": self.freq_max,
            }
        )
        return config

class LogMelSpectrogram(tf.keras.layers.Layer):
    """Compute log-magnitude mel-scaled spectrograms."""

    def __init__(self, sample_rate, fft_size, hop_size, n_mels,
                 f_min=0.0, f_max=None, **kwargs):
        super(LogMelSpectrogram, self).__init__(**kwargs)
        self.sample_rate = sample_rate
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.n_mels = n_mels
        self.f_min = f_min
        self.f_max = f_max if f_max else sample_rate / 2
        self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix(
            num_mel_bins=self.n_mels,
            num_spectrogram_bins=fft_size // 2   1,
            sample_rate=self.sample_rate,
            lower_edge_hertz=self.f_min,
            upper_edge_hertz=self.f_max)

    def build(self, input_shape):
        self.non_trainable_weights.append(self.mel_filterbank)
        super(LogMelSpectrogram, self).build(input_shape)

    def call(self, waveforms):
        """Forward pass.
        Parameters
        ----------
        waveforms : tf.Tensor, shape = (None, n_samples)
            A Batch of mono waveforms.
        Returns
        -------
        log_mel_spectrograms : (tf.Tensor), shape = (None, time, freq, ch)
            The corresponding batch of log-mel-spectrograms
        """
        def _tf_log10(x):
            numerator = tf.math.log(x)
            denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
            return numerator / denominator

        def power_to_db(magnitude, amin=1e-16, top_db=80.0):
            """
            https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
            """
            ref_value = tf.reduce_max(magnitude)
            log_spec = 10.0 * _tf_log10(tf.maximum(amin, magnitude))
            log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref_value))
            log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

            return log_spec

        spectrograms = tf.signal.stft(waveforms,
                                      frame_length=self.fft_size,
                                      frame_step=self.hop_size,
                                      pad_end=False)

        magnitude_spectrograms = tf.abs(spectrograms)

        mel_spectrograms = tf.matmul(tf.square(magnitude_spectrograms),
                                     self.mel_filterbank)

        log_mel_spectrograms = power_to_db(mel_spectrograms)

        # add channel dimension
        log_mel_spectrograms = tf.expand_dims(log_mel_spectrograms, 3)

        return log_mel_spectrograms

    def get_config(self):
        config = {
            'fft_size': self.fft_size,
            'hop_size': self.hop_size,
            'n_mels': self.n_mels,
            'sample_rate': self.sample_rate,
            'f_min': self.f_min,
            'f_max': self.f_max,
        }
        config.update(super(LogMelSpectrogram, self).get_config())

        return config

When I try to build the model


input = tf.keras.layers.Input(shape=(32000,1)) #audios will be 32000 samples, mono channel

mel_l = MelLayer(frame_length=512,
                 frame_step=512,
                 fft_length=None,
                 sampling_rate=16000,
                 num_mel_channels=80,
                 freq_min=1,
                 freq_max=2000,
                 name="spec")(input)

output = tf.keras.layers.Conv2D(16,3)(mel_l)

This throws the following error

/usr/local/lib/python3.7/dist-packages/keras/layers/convolutional.py in _get_input_channel(self, input_shape)
    370   def _get_input_channel(self, input_shape):
    371     channel_axis = self._get_channel_axis()
--> 372     if input_shape.dims[channel_axis].value is None:
    373       raise ValueError('The channel dimension of the inputs should be defined. '
    374                        f'The input_shape received is {input_shape}, '

TypeError: 'NoneType' object is not subscriptable

I think it has to do with my custom class definition, as if it wasn't detecting the input shape, The source for the custom layer is here: https://keras.io/examples/audio/melgan_spectrogram_inversion/, I tried to copy the steps but it does not seem to work

CodePudding user response：

TensorFlow can't compute the output shape of your layer. As Conv2D requires a specific shape (4 dimensions), it will fail if the output shape of the previous layer is not known (None).

To fix that, you need to specify which axis you want to squeeze in you call function.

Here, I specify that this is the last axis that need to be squeezed (the channel axis).

    def call(self, audio, training=True):

        stft = tf.signal.stft(
            tf.squeeze(audio, axis=-1),
            self.frame_length,
            self.frame_step,
            self.fft_length,
            pad_end=True,
        )

If you don't specify it, the output shape of the layer can't be computed, as it is going to be different if the batch size is 1 or if it is greater than 1.