  • 频域特征-Fbank

    • 预加重

    def preemphasis(signal, coeff=0.95):
        """Apply a first-order pre-emphasis filter to a 1-D signal.

        Computes y[0] = x[0] and y[n] = x[n] - coeff * x[n - 1] for n >= 1.

        :param signal: input samples (1-D array-like)
        :param coeff: pre-emphasis coefficient; 0 leaves the signal unchanged
        :return: filtered signal, same length as the input
        """
        signal = np.asarray(signal)
        if signal.size == 0:
            # Nothing to filter; avoid indexing into an empty array.
            return signal.copy()
        # The first sample has no predecessor, so it is passed through as-is
        # (index 0, not 1 -- using signal[1] would corrupt the first sample).
        return np.append(signal[0], signal[1:] - coeff * signal[:-1])

    • 分帧及加窗

    def frame_sig(sig, frame_len, frame_step, win_func):
        """Split a signal into overlapping frames and apply a window to each frame.

        :param sig: input speech signal (1-D sequence of samples)
        :param frame_len: frame length, in samples
        :param frame_step: hop between the starts of consecutive frames, in samples
        :param win_func: window function, called as win_func(frame_len)
        :return: array of windowed frames, shape (num_frames, frame_len)
        """
        slen = len(sig)
        if slen <= frame_len:
            num_frames = 1
        else:
            # np.ceil rounds up so that a trailing partial frame is kept.
            num_frames = 1 + int(np.ceil((slen - frame_len) / frame_step))
        padlen = int((num_frames - 1) * frame_step + frame_len)
        # Zero-pad the tail so that the last frame is a full frame_len samples.
        zeros = np.zeros((padlen - slen,))
        pad_sig = np.concatenate((sig, zeros))
        # indices[i, j] = i * frame_step + j: sample j of frame i.
        indices = (
            np.arange(0, num_frames * frame_step, frame_step)[:, np.newaxis]
            + np.arange(0, frame_len)[np.newaxis, :]
        )
        indices = np.array(indices, dtype=np.int32)
        frames = pad_sig[indices]
        # The 1-D window broadcasts across every frame (row).
        return frames * win_func(frame_len)

    • FFT

    # Real-input FFT of `frames` with transform length NFFT; the result is the
    # complex spectrum of each frame.
    complex_spec = np.fft.rfft(frames, NFFT)

    • 幅值平方


    • Mel滤波器

    def filterbank(nfilt=40, nfft=512, samplerate=16000, lowfreq=20, highfreq=None):
        """Build a bank of triangular filters whose edges are evenly spaced in mel.

        :param nfilt: number of filters
        :param nfft: FFT size used for the spectrum the filters are applied to
        :param samplerate: sampling rate of the signal, in Hz
        :param lowfreq: lower edge of the first filter, in Hz
        :param highfreq: upper edge of the last filter, in Hz;
            defaults to samplerate // 2
        :return: array of shape (nfilt, nfft / 2 + 1), one filter per row
        """
        if highfreq is None:
            highfreq = samplerate // 2
        # nfilt triangles need nfilt + 2 edge points, equally spaced in mel.
        mel_edges = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
        # Map every edge back to Hz and then to an FFT bin number.
        bins = np.floor((nfft + 1) * mel2hz(mel_edges) / samplerate)
        weights = np.zeros([nfilt, int(nfft / 2 + 1)])
        for row, (lo, mid, hi) in enumerate(zip(bins[:-2], bins[1:-1], bins[2:])):
            # Rising edge: 0 at the left bin, approaching 1 at the centre bin.
            for col in range(int(lo), int(mid)):
                weights[row, col] = (col - lo) / (mid - lo)
            # Falling edge: 1 at the centre bin, approaching 0 at the right bin.
            for col in range(int(mid), int(hi)):
                weights[row, col] = (hi - col) / (hi - mid)
        return weights

    • 对数功率



    import numpy as np
    import soundfile as sf
    import python_speech_features as psf
    import librosa
    import librosa.display
    import matplotlib.pyplot as plt
    def frame_sig(sig, frame_len, frame_step, win_func):
        """Split a signal into overlapping frames and apply a window to each frame.

        :param sig: input speech signal (1-D sequence of samples)
        :param frame_len: frame length, in samples
        :param frame_step: hop between the starts of consecutive frames, in samples
        :param win_func: window function, called as win_func(frame_len)
        :return: array of windowed frames, shape (num_frames, frame_len)
        """
        slen = len(sig)
        if slen <= frame_len:
            num_frames = 1
        else:
            # np.ceil rounds up so that a trailing partial frame is kept.
            num_frames = 1 + int(np.ceil((slen - frame_len) / frame_step))
        padlen = int((num_frames - 1) * frame_step + frame_len)
        # Zero-pad the tail so that the last frame is a full frame_len samples.
        zeros = np.zeros((padlen - slen,))
        pad_sig = np.concatenate((sig, zeros))
        # indices[i, j] = i * frame_step + j: sample j of frame i.
        indices = (
            np.arange(0, num_frames * frame_step, frame_step)[:, np.newaxis]
            + np.arange(0, frame_len)[np.newaxis, :]
        )
        indices = np.array(indices, dtype=np.int32)
        frames = pad_sig[indices]
        # The 1-D window broadcasts across every frame (row).
        return frames * win_func(frame_len)
    def preemphasis(signal, coeff=0.95):
        """Apply a first-order pre-emphasis filter to a 1-D signal.

        Computes y[0] = x[0] and y[n] = x[n] - coeff * x[n - 1] for n >= 1.

        :param signal: input samples (1-D array-like)
        :param coeff: pre-emphasis coefficient; 0 leaves the signal unchanged
        :return: filtered signal, same length as the input
        """
        signal = np.asarray(signal)
        if signal.size == 0:
            # Nothing to filter; avoid indexing into an empty array.
            return signal.copy()
        # The first sample has no predecessor, so it is passed through as-is
        # (index 0, not 1 -- using signal[1] would corrupt the first sample).
        return np.append(signal[0], signal[1:] - coeff * signal[:-1])
    def pow_spec(frames, NFFT):
        """Return the power spectrum of each frame, scaled by 1 / NFFT.

        :param frames: array of frames, one frame per row
        :param NFFT: FFT length passed to np.fft.rfft
        :return: 1 / NFFT * |rfft(frames, NFFT)| ** 2
        """
        magnitude = np.abs(np.fft.rfft(frames, NFFT))
        squared = np.square(magnitude)
        return 1 / NFFT * squared
    def hz2mel(hz):
        """Convert a frequency in Hz to mel: 2595 * log10(1 + hz / 700)."""
        ratio = 1 + hz / 700.
        return 2595 * np.log10(ratio)
    def mel2hz(mel):
        """Convert a mel value back to Hz: 700 * (10 ** (mel / 2595) - 1)."""
        exponent = mel / 2595.0
        return 700 * (10 ** exponent - 1)
    def filterbank(nfilt=40, nfft=512, samplerate=16000, lowfreq=20, highfreq=None):
        """Build a bank of triangular filters whose edges are evenly spaced in mel.

        :param nfilt: number of filters
        :param nfft: FFT size used for the spectrum the filters are applied to
        :param samplerate: sampling rate of the signal, in Hz
        :param lowfreq: lower edge of the first filter, in Hz
        :param highfreq: upper edge of the last filter, in Hz;
            defaults to samplerate // 2
        :return: array of shape (nfilt, nfft / 2 + 1), one filter per row
        """
        if highfreq is None:
            highfreq = samplerate // 2
        # nfilt triangles need nfilt + 2 edge points, equally spaced in mel.
        mel_edges = np.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt + 2)
        # Map every edge back to Hz and then to an FFT bin number.
        bins = np.floor((nfft + 1) * mel2hz(mel_edges) / samplerate)
        weights = np.zeros([nfilt, int(nfft / 2 + 1)])
        for row, (lo, mid, hi) in enumerate(zip(bins[:-2], bins[1:-1], bins[2:])):
            # Rising edge: 0 at the left bin, approaching 1 at the centre bin.
            for col in range(int(lo), int(mid)):
                weights[row, col] = (col - lo) / (mid - lo)
            # Falling edge: 1 at the centre bin, approaching 0 at the right bin.
            for col in range(int(mid), int(hi)):
                weights[row, col] = (hi - col) / (hi - mid)
        return weights
    # Analysis settings, named up front so each value is defined before it is used.
    nfilt = 26
    nfft = 2048

    # Load the recording and pre-emphasize it before framing.
    y, sr = sf.read('q1.wav')
    y = preemphasis(y, coeff=0.98)

    # Hann-windowed frames -> per-frame power spectrum.
    frames = frame_sig(y, frame_len=2048, frame_step=512, win_func=np.hanning)
    features = pow_spec(frames, NFFT=nfft)

    # Apply the mel filterbank: one row per frame, one column per filter.
    fb = filterbank(nfilt, nfft, sr, lowfreq=20, highfreq=sr // 2)
    feature = np.dot(features, fb.T)

    # Plot the filterbank energies in dB (filters on the vertical axis).
    # NOTE(review): y_axis='linear' labels the rows as linearly spaced Hz even
    # though they are mel filter outputs -- confirm this is the intended axis.
    librosa.display.specshow(librosa.power_to_db(feature.T), sr=sr, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')

