Skip to content

Commit

Permalink
Merge pull request jina-ai#487 from redram/feat-spectral-audio-encoders
Browse files Browse the repository at this point in the history
feat(encoders): add spectral audio encoders
  • Loading branch information
hanxiao authored Jun 4, 2020
2 parents a5714f8 2e7a404 commit 69196a0
Show file tree
Hide file tree
Showing 4 changed files with 117 additions and 0 deletions.
Empty file.
86 changes: 86 additions & 0 deletions jina/executors/encoders/audio/spectral.py
Original file line number Diff line number Diff line change
@@ -0,0 1,86 @@
import numpy as np

from .. import BaseAudioEncoder
from ...decorators import batching, as_ndarray


class MFCCTimbreEncoder(BaseAudioEncoder):
    """
    :class:`MFCCTimbreEncoder` is based on Mel-Frequency Cepstral Coefficients (MFCCs) which represent timbral features.
    :class:`MFCCTimbreEncoder` encodes an audio signal from a `Batch x Signal Length` ndarray into a
    `Batch x Concatenated Features` ndarray.
    """

    def __init__(self, input_sample_rate: int = 22050, n_mfcc: int = 20, n_fft_length: int = 2048,
                 hop_length: int = 512, *args, **kwargs):
        """
        :class:`MFCCTimbreEncoder` extracts from an audio signal a `n_mfcc`-dimensional feature vector for each MFCC
        frame.

        :param input_sample_rate: input sampling rate in Hz (22050 by default)
        :param n_mfcc: the number of MFCC coefficients to extract per frame (20 by default)
        :param n_fft_length: length of the FFT window in samples (2048 by default)
        :param hop_length: the number of samples between successive MFCC frames (512 by default)
        """
        super().__init__(*args, **kwargs)
        self.input_sample_rate = input_sample_rate
        self.n_mfcc = n_mfcc
        self.n_fft_length = n_fft_length
        self.hop_length = hop_length

    @batching
    @as_ndarray
    def encode(self, data: np.ndarray, *args, **kwargs) -> np.ndarray:
        """
        Segments the audio signal of each Chunk into short MFCC frames, extracts MFCCs for each frame and concatenates
        Chunk frame MFCCs into a single Chunk embedding.

        :param data: a `Batch x Signal Length` ndarray, where `Signal Length` is a number of samples
        :return: a `Batch x Concatenated Features` ndarray, where `Concatenated Features` is a `n_mfcc`-dimensional
            feature vector times the number of MFCC frames
        """
        # lazy import keeps librosa an optional dependency until encoding is requested
        from librosa.feature import mfcc
        # one flattened `(n_mfcc * n_frames)` vector per signal in the batch;
        # `@as_ndarray` converts the returned list into a single ndarray
        return [mfcc(y=chunk_data, sr=self.input_sample_rate, n_mfcc=self.n_mfcc,
                     n_fft=self.n_fft_length, hop_length=self.hop_length).flatten()
                for chunk_data in data]


class ChromaPitchEncoder(BaseAudioEncoder):
    """
    :class:`ChromaPitchEncoder` is based on chroma spectrograms (chromagrams) which represent melodic/harmonic features.
    :class:`ChromaPitchEncoder` encodes an audio signal from a `Batch x Signal Length` ndarray into a
    `Batch x Concatenated Features` ndarray.
    """

    def __init__(self, input_sample_rate: int = 22050, hop_length: int = 512, *args, **kwargs):
        """
        :class:`ChromaPitchEncoder` extracts from an audio signal a 12-dimensional feature vector for each chroma
        frame, one dimension per pitch class (the 12 semitones of the chromatic scale).

        :param input_sample_rate: input sampling rate in Hz (22050 by default)
        :param hop_length: the number of samples between successive chroma frames (512 by default)
        """
        super().__init__(*args, **kwargs)
        self.input_sample_rate = input_sample_rate
        self.hop_length = hop_length

    @batching
    @as_ndarray
    def encode(self, data: np.ndarray, *args, **kwargs) -> np.ndarray:
        """
        Segments the audio signal of each Chunk into short chroma frames, extracts chromagrams for each frame and
        concatenates Chunk frame chromagrams into a single Chunk embedding.

        :param data: a `Batch x Signal Length` ndarray, where `Signal Length` is a number of samples
        :return: a `Batch x Concatenated Features` ndarray, where `Concatenated Features` is a 12-dimensional feature
            vector times the number of chroma frames
        """
        # lazy import keeps librosa an optional dependency until encoding is requested
        from librosa.feature import chroma_cqt
        # one flattened `(12 * n_frames)` vector per signal in the batch;
        # `@as_ndarray` converts the returned list into a single ndarray
        return [chroma_cqt(y=chunk_data, sr=self.input_sample_rate, n_chroma=12,
                           hop_length=self.hop_length).flatten()
                for chunk_data in data]
Empty file.
31 changes: 31 additions & 0 deletions tests/executors/encoders/audio/test_spectral.py
Original file line number Diff line number Diff line change
@@ -0,0 1,31 @@
import unittest

import numpy as np

from jina.executors.encoders.audio.spectral import MFCCTimbreEncoder, ChromaPitchEncoder
from tests import JinaTestCase


class MyTestCase(JinaTestCase):
    def test_mfcc_encoder(self):
        """Verify MFCC embeddings come out as (batch, n_mfcc * n_frames)."""
        num_signals = 10
        num_frames = 5
        samples_per_signal = 500 * num_frames
        num_coefficients = 12
        signals = np.random.randn(num_signals, samples_per_signal)
        encoder = MFCCTimbreEncoder(n_mfcc=num_coefficients)
        embeddings = encoder.encode(signals)
        self.assertEqual((num_signals, num_coefficients * num_frames), embeddings.shape)

    def test_chroma_encoder(self):
        """Verify chroma embeddings come out as (batch, 12 * n_frames)."""
        num_signals = 10
        num_frames = 5
        samples_per_signal = 500 * num_frames
        signals = np.random.randn(num_signals, samples_per_signal)
        encoder = ChromaPitchEncoder()
        embeddings = encoder.encode(signals)
        self.assertEqual((num_signals, 12 * num_frames), embeddings.shape)


if __name__ == '__main__':
    unittest.main()

0 comments on commit 69196a0

Please sign in to comment.