# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Compute input examples for VGGish from audio waveform."""

import numpy as np
# import resampy

import mel_features
import vggish_params

try:
  import soundfile as sf

  def wav_read(wav_file):
    wav_data, sr = sf.read(wav_file, dtype='int16')
    return wav_data, sr

except ImportError:

  def wav_read(wav_file):
    raise NotImplementedError('WAV file reading requires soundfile package.')


def waveform_to_examples(data, sample_rate):
  """Converts audio waveform into an array of examples for VGGish.

  Args:
    data: np.array of either one dimension (mono) or two dimensions
      (multi-channel, with the outer dimension representing channels).
      Each sample is generally expected to lie in the range [-1.0, +1.0],
      although this is not required.
    sample_rate: Sample rate of data.

  Returns:
    3-D np.array of shape [num_examples, num_frames, num_bands] which
    represents a sequence of examples, each of which contains a patch of log
    mel spectrogram, covering num_frames frames of audio and num_bands mel
    frequency bands, where the frame length is
    vggish_params.STFT_HOP_LENGTH_SECONDS.
  """
  # Convert to mono.
  if len(data.shape) > 1:
    data = np.mean(data, axis=1)

  # Resample to the rate assumed by VGGish.
  # NOTE: Resampling is disabled here (resampy is not imported), so callers
  # must supply audio that is already at vggish_params.SAMPLE_RATE.
  # if sample_rate != vggish_params.SAMPLE_RATE:
  #   data = resampy.resample(data, sample_rate, vggish_params.SAMPLE_RATE)

  # Compute log mel spectrogram features.
  log_mel = mel_features.log_mel_spectrogram(
      data,
      audio_sample_rate=vggish_params.SAMPLE_RATE,
      log_offset=vggish_params.LOG_OFFSET,
      window_length_secs=vggish_params.STFT_WINDOW_LENGTH_SECONDS,
      hop_length_secs=vggish_params.STFT_HOP_LENGTH_SECONDS,
      num_mel_bins=vggish_params.NUM_MEL_BINS,
      lower_edge_hertz=vggish_params.MEL_MIN_HZ,
      upper_edge_hertz=vggish_params.MEL_MAX_HZ)

  # Frame features into examples.
  features_sample_rate = 1.0 / vggish_params.STFT_HOP_LENGTH_SECONDS
  example_window_length = int(round(
      vggish_params.EXAMPLE_WINDOW_SECONDS * features_sample_rate))
  example_hop_length = int(round(
      vggish_params.EXAMPLE_HOP_SECONDS * features_sample_rate))
  log_mel_examples = mel_features.frame(
      log_mel,
      window_length=example_window_length,
      hop_length=example_hop_length)
  return log_mel_examples


def wavfile_to_examples(wav_file):
  """Convenience wrapper around waveform_to_examples() for a common WAV format.

  Args:
    wav_file: String path to a file, or a file-like object. The file is
      assumed to contain WAV audio data with signed 16-bit PCM samples.

  Returns:
    See waveform_to_examples.
  """
  wav_data, sr = wav_read(wav_file)
  assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
  samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
  return waveform_to_examples(samples, sr)
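

# ------------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): feeds a synthetic
# 1 kHz tone through waveform_to_examples() to show the expected input and
# output shapes. It assumes the standard values in vggish_params (e.g.
# SAMPLE_RATE of 16000 Hz, 0.96 s example windows, 64 mel bands) and that
# mel_features is importable alongside this file.
if __name__ == '__main__':
  duration_secs = 3.0
  num_samples = int(duration_secs * vggish_params.SAMPLE_RATE)
  t = np.arange(num_samples) / float(vggish_params.SAMPLE_RATE)
  # 1 kHz sine tone in [-1.0, +1.0], generated directly at the VGGish sample
  # rate since resampling is disabled in waveform_to_examples() above.
  tone = 0.5 * np.sin(2 * np.pi * 1000.0 * t)
  examples = waveform_to_examples(tone, vggish_params.SAMPLE_RATE)
  # Shape is [num_examples, num_frames, num_bands]; with the default
  # parameters, 3 seconds of audio should yield roughly (3, 96, 64).
  print('Examples batch shape:', examples.shape)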