CovidSpeechChallenge_2021/vggish/vggish_input.py

# Copyright 2017 The TensorFlow Authors All Rights Reserved.

#

# Licensed under the Apache License, Version 2.0 (the "License");

# you may not use this file except in compliance with the License.

# You may obtain a copy of the License at

#

#     http://www.apache.org/licenses/LICENSE-2.0

#

# Unless required by applicable law or agreed to in writing, software

# distributed under the License is distributed on an "AS IS" BASIS,

# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

# See the License for the specific language governing permissions and

# limitations under the License.

# ==============================================================================


"""Compute input examples for VGGish from audio waveform."""


import numpy as np

#import resampy


import mel_features

import vggish_params


try:

  import soundfile as sf


  def wav_read(wav_file):

    wav_data, sr = sf.read(wav_file, dtype='int16')

    return wav_data, sr


except ImportError:


  def wav_read(wav_file):

    raise NotImplementedError('WAV file reading requires soundfile package.')


def waveform_to_examples(data, sample_rate):

  """Converts audio waveform into an array of examples for VGGish.


  Args:

    data: np.array of either one dimension (mono) or two dimensions

      (multi-channel, with the outer dimension representing channels).

      Each sample is generally expected to lie in the range [-1.0, +1.0],

      although this is not required.

    sample_rate: Sample rate of data.


  Returns:

    3-D np.array of shape [num_examples, num_frames, num_bands] which represents

    a sequence of examples, each of which contains a patch of log mel

    spectrogram, covering num_frames frames of audio and num_bands mel frequency

    bands, where the frame length is vggish_params.STFT_HOP_LENGTH_SECONDS.

  """

  # Convert to mono.

  if len(data.shape) > 1:

    data = np.mean(data, axis=1)

  # Resample to the rate assumed by VGGish.

  #if sample_rate != vggish_params.SAMPLE_RATE:

    #data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)


  # Compute log mel spectrogram features.

  log_mel = mel_features.log_mel_spectrogram(

      data,

      audio_sample_rate=vggish_params.SAMPLE_RATE,

      log_offset=vggish_params.LOG_OFFSET,

      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,

      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,

      num_mel_bins=vggish_params.NUM_MEL_BINS,

      lower_edge_hertz=vggish_params.MEL_MIN_HZ,

      upper_edge_hertz=vggish_params.MEL_MAX_HZ)


  # Frame features into examples.

  features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS

  example_window_length = int(round(

      vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))

  example_hop_length = int(round(

      vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))

  log_mel_examples = mel_features.frame(

      log_mel,

      window_length=example_window_length,

      hop_length=example_hop_length)

  return log_mel_examples


def wavfile_to_examples(wav_file):

  """Convenience wrapper around waveform_to_examples() for a common WAV format.


  Args:

    wav_file: String path to a file, or a file-like object. The file

    is assumed to contain WAV audio data with signed 16-bit PCM samples.


  Returns:

    See waveform_to_examples.

  """

  wav_data, sr = wav_read(wav_file)

  assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype

  samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]

  return waveform_to_examples(samples, sr)