pinebuds/tests/anc_usb/speech_process.c

/***************************************************************************
 *
 * Copyright 2015-2019 BES.
 * All rights reserved. All unpublished rights reserved.
 *
 * No part of this work may be used or reproduced in any form or by any
 * means, or stored in a database or retrieval system, without prior written
 * permission of BES.
 *
 * Use of this work is governed by a license granted by BES.
 * This work contains confidential and proprietary information of
 * BES. which is protected by copyright, trade secret,
 * trademark and other intellectual property rights.
 *
 ****************************************************************************/
#include "speech_process.h"
#include "cmsis.h"
#include "hal_sysfreq.h"
#include "hal_timer.h"
#include "hal_trace.h"

#if defined(SPEECH_TX_AEC2FLOAT) && !defined(KEEP_SAME_LATENCY)
#error "capture/playback size should be equal when enable AEC"
#endif

#include "bt_sco_chain.h"
#include "iir_resample.h"
#include "speech_memory.h"
#include "speech_utils.h"

#define MED_MEM_POOL_SIZE (1024 * 160)
static uint8_t g_medMemPool[MED_MEM_POOL_SIZE];

#define FFSE_SAMPLE_RATE (16000)

int capture_channel_num = 1;
int capture_sample_rate = 48000;
int capture_sample_bit = 16;
int playback_channel_num = 2;
int playback_sample_rate = 48000;
int playback_sample_bit = 24;

int send_channel_num = 2;
int recv_channel_num = 2;

// resample related
static bool resample_needed_flag = false;
static IirResampleState *upsample_st;
static IirResampleState *downsample_st[4];
int16_t *capture_buffer_deinter = NULL;
int16_t *process_buffer = NULL;
int16_t *process_buffer_inter = NULL;

static short *aec_echo_buf = NULL;
static IirResampleState *rx_downsample_st;

static volatile bool is_speech_init = false;

static void speech_deinterleave(int16_t *in, int16_t *out, int len,
                                int ch_num) {
  int len_per_channel = len / ch_num;

  for (int i = 0, j = 0; i < len; i += ch_num, j++) {
    int16_t *pout = &out[j];
    int16_t *pin = &in[i];
    for (int c = 0; c < ch_num; c++) {
      *pout = *pin;
      pout += len_per_channel;
      pin += 1;
    }
  }
}

static void speech_interleave(int16_t *in, int16_t *out, int len, int ch_num) {
  int len_per_channel = len / ch_num;

  for (int i = 0, j = 0; j < len; i++, j += ch_num) {
    int16_t *pout = &out[j];
    int16_t *pin = &in[i];
    for (int c = 0; c < ch_num; c++) {
      *pout = *pin;
      pout += 1;
      pin += len_per_channel;
    }
  }
}

static void speech_extend(int16_t *in, int16_t *out, int len, int ch_num) {
  int16_t *pout = out + len * ch_num - 1;
  for (int i = len - 1; i >= 0; i--) {
    for (int c = 0; c < ch_num; c++) {
      *pout-- = in[i];
    }
  }
}

// This function output remains the same sample rate as input,
// output channel number shoule be CHAN_NUM_SEND.
// TODO: add multi-channel support in iir resampler
void speech_process_capture_run(uint8_t *buf, uint32_t *len) {
  // TRACE(2,"[%s], pcm_len: %d", __FUNCTION__, *len / 2);

  if (is_speech_init == false)
    return;

  int16_t *pcm_buf = (int16_t *)buf;
  int pcm_len = *len / 2;
  int process_len = pcm_len * FFSE_SAMPLE_RATE / capture_sample_rate;

  if (resample_needed_flag == true) {
    if (capture_channel_num > 1)
      speech_deinterleave(pcm_buf, capture_buffer_deinter, pcm_len,
                          capture_channel_num);
    else
      speech_copy_int16(capture_buffer_deinter, pcm_buf, pcm_len);

    int in_len_per_channel = pcm_len / capture_channel_num;
    int out_len_per_channel = process_len / capture_channel_num;
    for (int i = 0; i < capture_channel_num; i++) {
      iir_resample_process(
          downsample_st[i], &capture_buffer_deinter[i * in_len_per_channel],
          &process_buffer[i * out_len_per_channel], in_len_per_channel);
    }

    if (capture_channel_num > 1)
      speech_interleave(process_buffer, process_buffer_inter, process_len,
                        capture_channel_num);
    else
      speech_copy_int16(process_buffer_inter, process_buffer, process_len);

    speech_tx_process(process_buffer_inter, aec_echo_buf, &process_len);

    iir_resample_process(upsample_st, process_buffer_inter, pcm_buf,
                         process_len);

    if (send_channel_num > 1)
      speech_extend(pcm_buf, pcm_buf, in_len_per_channel, send_channel_num);
  } else {
    speech_tx_process(pcm_buf, aec_echo_buf, &process_len);

    if (send_channel_num > 1)
      speech_extend(pcm_buf, pcm_buf, process_len, send_channel_num);
  }

  pcm_len = pcm_len / capture_channel_num * send_channel_num;
  *len = pcm_len * sizeof(int16_t);
}

void speech_process_playback_run(uint8_t *buf, uint32_t *len) {
  // TRACE(2,"[%s] pcm_len: %d", __FUNCTION__, *len / 2);

  if (is_speech_init == false)
    return;

#if defined(SPEECH_TX_AEC2FLOAT)
  int16_t *pcm_buf = (int16_t *)buf;
  int pcm_len = *len / 2;

  if (resample_needed_flag == true) {
    // Convert to 16bit if necessary
    if (playback_sample_bit == 24) {
      int32_t *pcm32 = (int32_t *)buf;
      for (int i = 0; i < pcm_len / 2; i++) {
        pcm_buf[i] = (pcm32[i] >> 8);
      }
      pcm_len >>= 1;
    }

    // Convert to mono if necessary, choose left channel
    if (playback_channel_num == 2) {
      for (int i = 0, j = 0; i < pcm_len; i += 2, j++)
        pcm_buf[j] = pcm_buf[i];
      pcm_len >>= 1;
    }

    iir_resample_process(rx_downsample_st, pcm_buf, pcm_buf, pcm_len);
  }
  speech_copy_int16(aec_echo_buf, pcm_buf,
                    pcm_len * FFSE_SAMPLE_RATE / capture_sample_rate);
#endif
}

void speech_process_init(int tx_sample_rate, int tx_channel_num,
                         int tx_sample_bit, int rx_sample_rate,
                         int rx_channel_num, int rx_sample_bit, int tx_frame_ms,
                         int rx_frame_ms, int tx_send_channel_num,
                         int rx_recv_channel_num) {
  ASSERT(tx_sample_rate == 16000 || tx_sample_rate == 48000,
         "[%s] sample rate(%d) not supported", __FUNCTION__, tx_sample_rate);
  ASSERT(tx_frame_ms == 16, "[%s] just support 16ms frame", __func__);

  capture_sample_rate = tx_sample_rate;
  capture_channel_num = tx_channel_num;
  capture_sample_bit = tx_sample_bit;
  playback_sample_rate = rx_sample_rate;
  playback_channel_num = rx_channel_num;
  playback_sample_bit = rx_sample_bit;

  send_channel_num = tx_send_channel_num;
  recv_channel_num = rx_recv_channel_num;

  resample_needed_flag = (capture_sample_rate != FFSE_SAMPLE_RATE);

  TRACE(5,
        "[%s] sample_rate: %d, frame_ms: %d, channel_num: %d, "
        "resample_needed_flag: %d",
        __FUNCTION__, tx_sample_rate, tx_frame_ms, tx_channel_num,
        resample_needed_flag);

  speech_init(FFSE_SAMPLE_RATE, FFSE_SAMPLE_RATE, tx_frame_ms, tx_frame_ms,
              tx_frame_ms, &g_medMemPool[0], MED_MEM_POOL_SIZE);

  if (resample_needed_flag == true) {
    capture_buffer_deinter =
        speech_calloc(SPEECH_FRAME_MS_TO_LEN(capture_sample_rate, tx_frame_ms) *
                          capture_channel_num,
                      sizeof(int16_t));

    // Resample state must be created after speech init, as it uses speech heap
    process_buffer =
        speech_calloc(SPEECH_FRAME_MS_TO_LEN(FFSE_SAMPLE_RATE, tx_frame_ms) *
                          capture_channel_num,
                      sizeof(int16_t));
    process_buffer_inter =
        speech_calloc(SPEECH_FRAME_MS_TO_LEN(FFSE_SAMPLE_RATE, tx_frame_ms) *
                          capture_channel_num,
                      sizeof(int16_t));

    upsample_st = iir_resample_init(
        SPEECH_FRAME_MS_TO_LEN(FFSE_SAMPLE_RATE, tx_frame_ms),
        iir_resample_choose_mode(FFSE_SAMPLE_RATE, capture_sample_rate));

    // as iir resample can only deal with mono signal, we should init
    // channel_num states
    for (int i = 0; i < capture_channel_num; i++) {
      downsample_st[i] = iir_resample_init(
          SPEECH_FRAME_MS_TO_LEN(capture_sample_rate, tx_frame_ms),
          iir_resample_choose_mode(capture_sample_rate, FFSE_SAMPLE_RATE));
    }

    //
    aec_echo_buf = speech_calloc(
        SPEECH_FRAME_MS_TO_LEN(FFSE_SAMPLE_RATE, rx_frame_ms), sizeof(int16_t));
    rx_downsample_st = iir_resample_init(
        SPEECH_FRAME_MS_TO_LEN(playback_sample_rate, rx_frame_ms),
        iir_resample_choose_mode(playback_sample_rate, FFSE_SAMPLE_RATE));
  }

  is_speech_init = true;
}

void speech_process_deinit(void) {
  if (is_speech_init == false)
    return;

  if (resample_needed_flag == true) {
    speech_free(capture_buffer_deinter);

    speech_free(process_buffer);
    speech_free(process_buffer_inter);

    iir_resample_destroy(upsample_st);

    for (int i = 0; i < capture_channel_num; i++) {
      iir_resample_destroy(downsample_st[i]);
    }

    speech_free(aec_echo_buf);
    iir_resample_destroy(rx_downsample_st);

    resample_needed_flag = false;
  }

  speech_deinit();

  is_speech_init = false;
}

enum HAL_CMU_FREQ_T speech_process_need_freq(void) { return HAL_CMU_FREQ_208M; }