Skip to content

Commit

Permalink
Optimize floating point IDCT.
Browse files Browse the repository at this point in the history
Replace brute force floating point IDCT with ‘obfuscated’ version,
making floating point interpolation speed on par with fixed point.

References #20.
  • Loading branch information
foo86 committed Mar 28, 2015
1 parent 4bb6719 commit 7475a12
Show file tree
Hide file tree
Showing 11 changed files with 309 additions and 61 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ libdcadec/dca_context.c \
libdcadec/dmix_tables.c \
libdcadec/exss_parser.c \
libdcadec/idct_fixed.c \
libdcadec/idct_float.c \
libdcadec/interpolator.c \
libdcadec/interpolator_fixed.c \
libdcadec/interpolator_float.c \
Expand Down
7 changes: 4 additions & 3 deletions libdcadec/core_decoder.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "common.h"
#include "bitstream.h"
#include "interpolator.h"
#include "idct.h"
#include "fixed_math.h"
#include "core_decoder.h"
#include "exss_parser.h"
Expand Down Expand Up @@ -918,15 +919,15 @@ int core_filter(struct core_decoder *core, int flags)

core->filter_flags = flags;

if (!core->subband_dsp_data)
if (!(core->subband_dsp_data = interpolator_init(core)))
if (!core->subband_dsp_idct)
if (!(core->subband_dsp_idct = idct_init(core)))
return -DCADEC_ENOMEM;

// Filter primary channels
for (int ch = 0; ch < core->nchannels; ch++) {
// Allocate subband DSP
if (!core->subband_dsp[ch])
if (!(core->subband_dsp[ch] = interpolator_create(core->subband_dsp_data, flags)))
if (!(core->subband_dsp[ch] = interpolator_create(core->subband_dsp_idct, flags)))
return -DCADEC_ENOMEM;

// Map this primary channel to speaker
Expand Down
2 changes: 1 addition & 1 deletion libdcadec/core_decoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ struct core_decoder {
int *subband_buffer;
int *subband_samples[MAX_CHANNELS][MAX_SUBBANDS];
struct interpolator *subband_dsp[MAX_CHANNELS];
struct interpolator_data *subband_dsp_data;
struct idct_context *subband_dsp_idct;

int *lfe_samples;

Expand Down
2 changes: 1 addition & 1 deletion libdcadec/fir_float.h
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ static const double lfe_fir_128[512] = {
2.164336300E-04, 1.887860900E-04, 1.635869100E-04, 5.316857100E-04
};

#define SCALE(x) ((x) * M_SQRT2 * 512)
#define SCALE(x) ((x) * M_SQRT2 * 256)

// Annex D.9 - 1024 tap FIR for X96 synthesis QMF
static const double band_fir_x96[1024] = {
Expand Down
30 changes: 26 additions & 4 deletions libdcadec/idct_fixed.h → libdcadec/idct.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,32 @@
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#ifndef IDCT_FIXED_H
#define IDCT_FIXED_H
#ifndef IDCT_H
#define IDCT_H

void inverse_dct32_fixed(int *input, int *output);
void inverse_dct64_fixed(int *input, int *output);
struct core_decoder;

struct idct_context {
double dct_a[8][8];
double dct_b[8][7];

double mod_a[16];
double mod_b[ 8];
double mod_c[32];

double mod64_a[32];
double mod64_b[16];
double mod64_c[64];
};

struct idct_context *idct_init(struct core_decoder *parent);

void idct_perform32_float(const struct idct_context *idct,
double *input, double *output);
void idct_perform64_float(const struct idct_context *idct,
double *input, double *output);

void idct_perform32_fixed(int *input, int *output);
void idct_perform64_fixed(int *input, int *output);

#endif
6 changes: 3 additions & 3 deletions libdcadec/idct_fixed.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include "common.h"
#include "fixed_math.h"
#include "idct_fixed.h"
#include "idct.h"

static inline void sum_a(const int *input, int *output, int len)
{
Expand Down Expand Up @@ -162,7 +162,7 @@ static inline void clp_v(int *input, int len)
input[i] = clip23(input[i]);
}

void inverse_dct32_fixed(int *input, int *output)
void idct_perform32_fixed(int *input, int *output)
{
int mag = 0;
for (int i = 0; i < 32; i++)
Expand Down Expand Up @@ -273,7 +273,7 @@ static inline void mod64_c(const int *input, int *output)
output[i] = mul23(cos_mod[i], input[k] - input[32 + k]);
}

void inverse_dct64_fixed(int *input, int *output)
void idct_perform64_fixed(int *input, int *output)
{
int mag = 0;
for (int i = 0; i < 32; i++)
Expand Down
262 changes: 262 additions & 0 deletions libdcadec/idct_float.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
/*
* This file is part of libdcadec.
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the
* Free Software Foundation; either version 2.1 of the License, or (at your
* option) any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "common.h"
#include "idct.h"

struct idct_context *idct_init(struct core_decoder *parent)
{
int i, j, k;

struct idct_context *idct = ta_new(parent, struct idct_context);
if (!idct)
return NULL;

for (i = 0; i < 8; i++) {
for (j = 0, k = 7; j < 8; j++, k--) {
if (i & 1)
idct->dct_a[i][j] = -sin((2 * i + 1) * (2 * k + 1) * M_PI / 32);
else
idct->dct_a[i][j] = sin((2 * i + 1) * (2 * k + 1) * M_PI / 32);
}
}

for (i = 0; i < 8; i++)
for (j = 0; j < 7; j++)
idct->dct_b[i][j] = cos((2 * i + 1) * (1 + j) * M_PI / 16);

for (i = 0; i < 8; i++)
idct->mod_a[i] = 0.5 / cos((2 * i + 1) * M_PI / 64);

for (i = 8, k = 7; i < 16; i++, k--)
idct->mod_a[i] = -0.5 / sin((2 * k + 1) * M_PI / 64);

for (i = 0; i < 4; i++)
idct->mod_b[i] = 0.5 / cos((2 * i + 1) * M_PI / 32);

for (i = 4, k = 3; i < 8; i++, k--)
idct->mod_b[i] = 0.5 / sin((2 * k + 1) * M_PI / 32);

for (i = 0; i < 16; i++)
idct->mod_c[i] = 0.125 / cos((2 * i + 1) * M_PI / 128);

for (i = 16, k = 15; i < 32; i++, k--)
idct->mod_c[i] = -0.125 / sin((2 * k + 1) * M_PI / 128);

for (i = 0; i < 16; i++)
idct->mod64_a[i] = 0.5 / cos((2 * i + 1) * M_PI / 128);

for (i = 16, k = 15; i < 32; i++, k--)
idct->mod64_a[i] = -0.5 / sin((2 * k + 1) * M_PI / 128);

for (i = 0; i < 8; i++)
idct->mod64_b[i] = 0.5 / cos((2 * i + 1) * M_PI / 64);

for (i = 8, k = 7; i < 16; i++, k--)
idct->mod64_b[i] = 0.5 / sin((2 * k + 1) * M_PI / 64);

for (i = 0; i < 32; i++)
idct->mod64_c[i] = 0.125 / cos((2 * i + 1) * M_PI / 256);

for (i = 32, k = 31; i < 64; i++, k--)
idct->mod64_c[i] = -0.125 / sin((2 * k + 1) * M_PI / 256);

return idct;
}

static inline void sum_a(const double *input, double *output, int len)
{
for (int i = 0; i < len; i++)
output[i] = input[2 * i] + input[2 * i + 1];
}

static inline void sum_b(const double *input, double *output, int len)
{
for (int i = 0; i < len; i++) {
if (i > 0)
output[i] = input[2 * i] + input[2 * i - 1];
else
output[i] = input[2 * i];
}
}

static inline void sum_c(const double *input, double *output, int len)
{
for (int i = 0; i < len; i++)
output[i] = input[2 * i];
}

static inline void sum_d(const double *input, double *output, int len)
{
for (int i = 0; i < len; i++) {
if (i > 0)
output[i] = input[2 * i - 1] + input[2 * i + 1];
else
output[i] = input[2 * i + 1];
}
}

static inline void dct_a(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 8; i++) {
double res = 0.0;
for (int j = 0; j < 8; j++)
res += idct->dct_a[i][j] * input[j];
output[i] = res;
}
}

static inline void dct_b(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 8; i++) {
double res = input[0];
for (int j = 0; j < 7; j++)
res += idct->dct_b[i][j] * input[1 + j];
output[i] = res;
}
}

static inline void mod_a(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 8; i++)
output[i] = idct->mod_a[i] * (input[i] + input[8 + i]);

for (int i = 8, k = 7; i < 16; i++, k--)
output[i] = idct->mod_a[i] * (input[k] - input[8 + k]);
}

static inline void mod_b(const struct idct_context *idct,
double *input, double *output)
{
for (int i = 0; i < 8; i++)
input[8 + i] = idct->mod_b[i] * input[8 + i];

for (int i = 0; i < 8; i++)
output[i] = input[i] + input[8 + i];

for (int i = 8, k = 7; i < 16; i++, k--)
output[i] = input[k] - input[8 + k];
}

static inline void mod_c(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 16; i++)
output[i] = idct->mod_c[i] * (input[i] + input[16 + i]);

for (int i = 16, k = 15; i < 32; i++, k--)
output[i] = idct->mod_c[i] * (input[k] - input[16 + k]);
}

void idct_perform32_float(const struct idct_context *idct,
double *input, double *output)
{
sum_a(input, output + 0, 16);
sum_b(input, output + 16, 16);

sum_a(output + 0, input + 0, 8);
sum_b(output + 0, input + 8, 8);
sum_c(output + 16, input + 16, 8);
sum_d(output + 16, input + 24, 8);

dct_a(idct, input + 0, output + 0);
dct_b(idct, input + 8, output + 8);
dct_b(idct, input + 16, output + 16);
dct_b(idct, input + 24, output + 24);

mod_a(idct, output + 0, input + 0);
mod_b(idct, output + 16, input + 16);

mod_c(idct, input, output);
}

static inline void mod64_a(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 16; i++)
output[i] = idct->mod64_a[i] * (input[i] + input[16 + i]);

for (int i = 16, k = 15; i < 32; i++, k--)
output[i] = idct->mod64_a[i] * (input[k] - input[16 + k]);
}

static inline void mod64_b(const struct idct_context *idct,
double *input, double *output)
{
for (int i = 0; i < 16; i++)
input[16 + i] = idct->mod64_b[i] * input[16 + i];

for (int i = 0; i < 16; i++)
output[i] = input[i] + input[16 + i];

for (int i = 16, k = 15; i < 32; i++, k--)
output[i] = input[k] - input[16 + k];
}

static inline void mod64_c(const struct idct_context *idct,
const double *input, double *output)
{
for (int i = 0; i < 32; i++)
output[i] = idct->mod64_c[i] * (input[i] + input[32 + i]);

for (int i = 32, k = 31; i < 64; i++, k--)
output[i] = idct->mod64_c[i] * (input[k] - input[32 + k]);
}

void idct_perform64_float(const struct idct_context *idct,
double *input, double *output)
{
sum_a(input, output + 0, 32);
sum_b(input, output + 32, 32);

sum_a(output + 0, input + 0, 16);
sum_b(output + 0, input + 16, 16);
sum_c(output + 32, input + 32, 16);
sum_d(output + 32, input + 48, 16);

sum_a(input + 0, output + 0, 8);
sum_b(input + 0, output + 8, 8);
sum_c(input + 16, output + 16, 8);
sum_d(input + 16, output + 24, 8);
sum_c(input + 32, output + 32, 8);
sum_d(input + 32, output + 40, 8);
sum_c(input + 48, output + 48, 8);
sum_d(input + 48, output + 56, 8);

dct_a(idct, output + 0, input + 0);
dct_b(idct, output + 8, input + 8);
dct_b(idct, output + 16, input + 16);
dct_b(idct, output + 24, input + 24);
dct_b(idct, output + 32, input + 32);
dct_b(idct, output + 40, input + 40);
dct_b(idct, output + 48, input + 48);
dct_b(idct, output + 56, input + 56);

mod_a(idct, input + 0, output + 0);
mod_b(idct, input + 16, output + 16);
mod_b(idct, input + 32, output + 32);
mod_b(idct, input + 48, output + 48);

mod64_a(idct, output + 0, input + 0);
mod64_b(idct, output + 32, input + 32);

mod64_c(idct, input, output);
}
Loading

0 comments on commit 7475a12

Please sign in to comment.