Merge remote-tracking branch 'github/simd'

This commit is contained in:
Jörg Thalheim 2015-03-10 15:23:22 +01:00
commit 608a750738
3 changed files with 135 additions and 93 deletions

View File

@ -126,6 +126,7 @@ int fix_fft_org(fixed fr[], fixed fi[], int m, int inverse)
wr >>= 1; wr >>= 1;
wi >>= 1; wi >>= 1;
} }
for(i=m; i<n; i+=istep) for(i=m; i<n; i+=istep)
{ {
@ -148,6 +149,7 @@ int fix_fft_org(fixed fr[], fixed fi[], int m, int inverse)
fi[i] = qi + ti; fi[i] = qi + ti;
} }
} }
--k; --k;
l = istep; l = istep;
} }

66
fft.c
View File

@ -37,6 +37,14 @@
#include <stdio.h> #include <stdio.h>
/*
 * FFT_COMBINED_STORE - write one real and one imaginary SIMD register back
 * to memory.
 *
 * Emits one brace-delimited instruction group holding two fft_simd_store
 * instructions with a nop between them:
 *   - _simd_r is stored relative to base pointer _fr at index _i
 *   - _simd_i is stored relative to base pointer _fi at the same index _i
 *
 * Operand map for the template:
 *   %0 = _i, %1 = _fr, %2 = _simd_r, %3 = _fi, %4 = _simd_i
 *
 * The statement has no output operands, so the compiler cannot see that it
 * writes through _fr and _fi.  The "memory" clobber declares that memory is
 * modified: values of fr[]/fi[] held in registers are reloaded afterwards
 * and pending writes are not moved across the store.
 *
 * NOTE(review): the trailing semicolon is part of the expansion and is kept
 * so existing call sites expand exactly as before; do not use this macro as
 * the sole body of an unbraced if/else.
 */
#define FFT_COMBINED_STORE(_fr, _fi, _i, _simd_r, _simd_i) \
    asm volatile ("{" "\n" \
        " fft_simd_store %1, %0, %2" "\n" \
        " nop" "\n" \
        " fft_simd_store %3, %0, %4" "\n" \
        "}" \
        :: "r" (_i), "r" (_fr), "r" (_simd_r), "r" (_fi), "r" (_simd_i) \
        : "memory");
/* /*
* fix_fft() - perform fast Fourier transform. * fix_fft() - perform fast Fourier transform.
* *
@ -61,7 +69,7 @@ int fix_fft(fixed fr[], fixed fi[], int m, int inverse)
mr = 0; mr = 0;
nn = n - 1; nn = n - 1;
scale = 0; scale = 0;
int mm = m; int mm = m;
/* decimation in time - re-order data */ /* decimation in time - re-order data */
@ -70,7 +78,7 @@ int fix_fft(fixed fr[], fixed fi[], int m, int inverse)
if(m >= nn) break; if(m >= nn) break;
if(mr <= m) continue; if(mr <= m) continue;
tr = fr[m]; tr = fr[m];
fr[m] = fr[mr]; fr[m] = fr[mr];
fr[mr] = tr; fr[mr] = tr;
@ -96,7 +104,7 @@ int fix_fft(fixed fr[], fixed fi[], int m, int inverse)
++scale; ++scale;
break; break;
} }
} }
} }
else else
{ {
@ -110,28 +118,42 @@ int fix_fft(fixed fr[], fixed fi[], int m, int inverse)
/* it may not be obvious, but the shift will be performed /* it may not be obvious, but the shift will be performed
on each data point exactly once, during this pass. */ on each data point exactly once, during this pass. */
istep = l << 1; //step width of current butterfly istep = l << 1; //step width of current butterfly
FFT_REG reg; FFT_REG reg;
FFT_REG_SIMD simd_r, simd_i;
fixed *reg_s = ((fixed*) &reg); fixed *reg_s = ((fixed*) &reg);
for(m=0; m<n; m+=istep) if(l == 1)
{ {
for(i=m; i<m+l; ++i) for(i=0; i<n; i+=8)
{ {
j = i + l; simd_r = FFT_simd_load(fr, i);
simd_i = FFT_simd_load(fi, i);
reg_s[3] = fr[i]; FFT_SIMD_FIRST(simd_r, simd_i, (xtbool) shift);
reg_s[2] = fr[j]; FFT_COMBINED_STORE(fr, fi, i, simd_r, simd_i);
reg_s[1] = fi[i]; }
reg_s[0] = fi[j]; }
else
FFT_CALC(reg, i << k, (xtbool) shift, inverse); {
for(m=0; m<n; m+=istep)
fr[i] = reg_s[3]; {
fr[j] = reg_s[2]; for(i=m; i<m+l; ++i)
fi[i] = reg_s[1]; {
fi[j] = reg_s[0]; j = i + l;
}
reg_s[3] = fr[i];
reg_s[2] = fr[j];
reg_s[1] = fi[i];
reg_s[0] = fi[j];
FFT_CALC(reg, i << k, (xtbool) shift, inverse);
fr[i] = reg_s[3];
fr[j] = reg_s[2];
fi[i] = reg_s[1];
fi[j] = reg_s[0];
}
}
} }
--k; --k;
l = istep; l = istep;

View File

@ -132,10 +132,66 @@ table SIN_WAVE 16 1024 {
regfile FFT_REG 64 2 fftv regfile FFT_REG 64 2 fftv
regfile FFT_REG_SIMD 128 2 fftsv
function [31:0] FFT_VAR_SHIFT([31:0] data, [3:0] sh)
{
    // Logical left shift of a 32-bit value by sh (0..15) with zero fill.
    // Built as a four-stage barrel shifter: each bit of sh enables a
    // fixed shift by 1, 2, 4 or 8, and the stages compose to a shift by sh.
    wire [31:0] by1 = TIEmux(sh[0], data, {data[30:0], 1'b0});
    wire [31:0] by2 = TIEmux(sh[1], by1, {by1[29:0], 2'b0});
    wire [31:0] by4 = TIEmux(sh[2], by2, {by2[27:0], 4'b0});
    wire [31:0] by8 = TIEmux(sh[3], by4, {by4[23:0], 8'b0});
    assign FFT_VAR_SHIFT = by8;
}
operation FFT_SIMD_LOAD {in AR *base, in AR offset, out FFT_REG_SIMD data} {out VAddr, in MemDataIn128}
{
    // Load 128 bits from base + 2*offset and return them as eight 16-bit
    // lanes in reversed lane order: the lowest 16 bits read from memory end
    // up in the highest 16 bits of the register.
    // offset is shifted left by one before the add -- presumably it is an
    // index of 16-bit elements; confirm against the callers.
    wire [31:0] addr_off = {offset[30:0], 1'b0};
    assign VAddr = TIEadd(base, addr_off, 1'b0);

    wire [127:0] mem = MemDataIn128;
    assign data = {
        mem[15:0],  mem[31:16], mem[47:32], mem[63:48],
        mem[79:64], mem[95:80], mem[111:96], mem[127:112]
    };
}
operation FFT_SIMD_STORE {in AR *base, in AR offset, in FFT_REG_SIMD data} {out VAddr, out MemDataOut128}
{
    // Store the eight 16-bit lanes of data to base + 2*offset in reversed
    // lane order: the lowest 16 bits of the register are written to the
    // highest 16 bits of the 128-bit memory word.  This is the same lane
    // swap FFT_SIMD_LOAD applies, so a load followed by a store restores
    // the original memory layout.
    // offset is shifted left by one before the add -- presumably it is an
    // index of 16-bit elements; confirm against the callers.
    wire [31:0] addr_off = {offset[30:0], 1'b0};
    assign VAddr = TIEadd(base, addr_off, 1'b0);

    assign MemDataOut128 = {
        data[15:0],  data[31:16], data[47:32], data[63:48],
        data[79:64], data[95:80], data[111:96], data[127:112]
    };
}
operation FFT_SHIFT_CHECK {in AR *base, in AR offset, out AR needs_shift} {out VAddr, in MemDataIn128} operation FFT_SHIFT_CHECK {in AR *base, in AR offset, out AR needs_shift} {out VAddr, in MemDataIn128}
{ {
assign VAddr = TIEadd(base, offset[31:1], 1'b0); assign VAddr = TIEadd(base, offset[31:1], 1'b0);
wire [15:0] o1 = MemDataIn128[15:0]; wire [15:0] o1 = MemDataIn128[15:0];
wire [15:0] o2 = MemDataIn128[31:16]; wire [15:0] o2 = MemDataIn128[31:16];
wire [15:0] o3 = MemDataIn128[47:32]; wire [15:0] o3 = MemDataIn128[47:32];
@ -178,7 +234,7 @@ operation FFT_BIT_REVERSE {inout AR m, out AR mr, in AR mm} {}
TIEmux(mm[3:0], 1'b0, 1'b0, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10],x[11],x[12],x[13]), TIEmux(mm[3:0], 1'b0, 1'b0, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10],x[11],x[12],x[13]),
TIEmux(mm[3:0], 1'b0, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10],x[11],x[12],x[13],x[14]) TIEmux(mm[3:0], 1'b0, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10],x[11],x[12],x[13],x[14])
}; };
assign m = x; assign m = x;
} }
function [31:0] FFT_TWIDDLE ([31:0] j, [0:0] shift, [0:0] inverse) function [31:0] FFT_TWIDDLE ([31:0] j, [0:0] shift, [0:0] inverse)
@ -189,7 +245,7 @@ function [31:0] FFT_TWIDDLE ([31:0] j, [0:0] shift, [0:0] inverse)
wire [15:0] wr1 = SIN_WAVE[idx]; wire [15:0] wr1 = SIN_WAVE[idx];
wire [15:0] wi1 = TIEadd(~sin, 16'b0, 1'b1); wire [15:0] wi1 = TIEadd(~sin, 16'b0, 1'b1);
wire [15:0] wi2 = TIEmux(inverse, wi1, sin); wire [15:0] wi2 = TIEmux(inverse, wi1, sin);
assign FFT_TWIDDLE = { assign FFT_TWIDDLE = {
TIEmux(shift, wr1, {wr1[15], wr1[15:1]}), TIEmux(shift, wr1, {wr1[15], wr1[15:1]}),
TIEmux(shift, wi2, {wi2[15], wi2[15:1]}) TIEmux(shift, wi2, {wi2[15], wi2[15:1]})
@ -197,105 +253,67 @@ function [31:0] FFT_TWIDDLE ([31:0] j, [0:0] shift, [0:0] inverse)
} }
function [63:0] FFT_BUTTERFLY ([63:0] data, [15:0] wr, [15:0] wi, [0:0] shift) { function [63:0] FFT_BUTTERFLY ([63:0] data, [15:0] wr, [15:0] wi, [0:0] shift) {
// operands real parts // operands real parts
wire [15:0] r1 = data[63:48]; wire [15:0] r1 = data[63:48];
wire [15:0] r2 = data[47:32]; wire [15:0] r2 = data[47:32];
// operands imaginary parts // operands imaginary parts
wire [15:0] i1 = data[31:16]; wire [15:0] i1 = data[31:16];
wire [15:0] i2 = data[15:0]; wire [15:0] i2 = data[15:0];
// odd real part // odd real part
wire [31:0] oddr1 = TIEmul(wr, r2, 1'b1); wire [31:0] oddr1 = TIEmul(wr, r2, 1'b1);
wire [15:0] oddr1s = oddr1[30:15]; wire [15:0] oddr1s = oddr1[30:15];
wire [31:0] oddr2 = TIEmul(wi, i2, 1'b1); wire [31:0] oddr2 = TIEmul(wi, i2, 1'b1);
wire [15:0] oddr2s = oddr2[30:15]; wire [15:0] oddr2s = oddr2[30:15];
wire [15:0] oddr = TIEadd(oddr1s, ~oddr2s, 1'b1); wire [15:0] oddr = TIEadd(oddr1s, ~oddr2s, 1'b1);
// odd imaginary part // odd imaginary part
wire [31:0] oddi1 = TIEmul(wr, i2, 1'b1); wire [31:0] oddi1 = TIEmul(wr, i2, 1'b1);
wire [15:0] oddi1s = oddi1[30:15]; wire [15:0] oddi1s = oddi1[30:15];
wire [31:0] oddi2 = TIEmul(wi, r2, 1'b1); wire [31:0] oddi2 = TIEmul(wi, r2, 1'b1);
wire [15:0] oddi2s = oddi2[30:15]; wire [15:0] oddi2s = oddi2[30:15];
wire [15:0] oddi = TIEadd(oddi1s, oddi2s, 1'b0); wire [15:0] oddi = TIEadd(oddi1s, oddi2s, 1'b0);
// even parts // even parts
wire [15:0] evenr = TIEmux(shift[0], r1, {r1[15], r1[15:1]}); wire [15:0] evenr = TIEmux(shift[0], r1, {r1[15], r1[15:1]});
wire [15:0] eveni = TIEmux(shift[0], i1, {i1[15], i1[15:1]}); wire [15:0] eveni = TIEmux(shift[0], i1, {i1[15], i1[15:1]});
// final result // final result
wire [15:0] resr1 = TIEadd(evenr, oddr, 1'b0); wire [15:0] resr1 = TIEadd(evenr, oddr, 1'b0);
wire [15:0] resr2 = TIEadd(evenr, ~oddr, 1'b1); wire [15:0] resr2 = TIEadd(evenr, ~oddr, 1'b1);
wire [15:0] resi1 = TIEadd(eveni, oddi, 1'b0); wire [15:0] resi1 = TIEadd(eveni, oddi, 1'b0);
wire [15:0] resi2 = TIEadd(eveni, ~oddi, 1'b1); wire [15:0] resi2 = TIEadd(eveni, ~oddi, 1'b1);
assign FFT_BUTTERFLY = { resr1, resr2, resi1, resi2 }; assign FFT_BUTTERFLY = { resr1, resr2, resi1, resi2 };
} }
operation FFT_CALC {inout FFT_REG data, in AR i, in BR shift, in BR inverse} {} operation FFT_CALC {inout FFT_REG data, in AR i, in BR shift, in BR inverse} {}
{ {
wire [31:0] twiddle = FFT_TWIDDLE(i, shift, inverse); wire [31:0] twiddle = FFT_TWIDDLE(i, shift, inverse);
wire [15:0] wr = twiddle[31:16]; wire [15:0] wr = twiddle[31:16];
wire [15:0] wi = twiddle[15:0]; wire [15:0] wi = twiddle[15:0];
assign data = FFT_BUTTERFLY(data, wr, wi, shift); assign data = FFT_BUTTERFLY(data, wr, wi, shift);
} }
//////////////////////////////////////////////////////////////////////////// // 4 butterflies at once
// operation FFT_SIMD_FIRST {inout FFT_REG_SIMD fr, inout FFT_REG_SIMD fi, in BR shift} {}
// Generated by XPRES v4.0.4 {
// Sat Mar 07 17:29:10 2015
// wire [15:0] wr = TIEmux(shift, 16'h7fff, 16'h3fff);
// Register Files wire [15:0] wi = 16'b0;
// AR (a): 16 entries, 32 bits per entry, ports 3r / 2w
// vec (v): 16 entries, 160 bits per entry, ports 3r / 1w wire [63:0] res1 = FFT_butterfly({fr[127:96], fi[127:96]}, wr, wi, shift);
// valign (u): 4 entries, 128 bits per entry, ports 2r / 1w wire [63:0] res2 = FFT_butterfly({fr[95:64], fi[95:64]}, wr, wi, shift);
// sel (s): 8 entries, 32 bits per entry, ports 2r / 1w wire [63:0] res3 = FFT_butterfly({fr[63:32], fi[63:32]}, wr, wi, shift);
// FFT_reg (fftv): 2 entries, 64 bits per entry, ports 1r / 1w wire [63:0] res4 = FFT_butterfly({fr[31:0], fi[31:0]}, wr, wi, shift);
//
// FLIX Formats assign fr = { res1[63:32], res2[63:32], res3[63:32], res4[63:32] };
// x24: size 24 bits, 1 slot assign fi = { res1[31:0], res2[31:0], res3[31:0], res4[31:0] };
// slot Inst: size 24 bits }
// opcodes { }
// x64: size 64 bits, 3 slots
// slot vsLDST: size 24 bits
// opcodes { }
// slot vsMAC: size 18 bits
// opcodes { }
// slot vsALU: size 18 bits
// opcodes { }
// flix64_0: size 64 bits, 2 slots
// slot flix64_0_slot0: size 24 bits
// opcodes { L16SI NOP ld.FFT_reg mv.FFT_reg st.FFT_reg }
// slot flix64_0_slot1: size 10 bits
// opcodes { MOV.N NOP }
//
// This TIE requires the following configuration settings:
//
// Required Endian: Little
// Required Instruction Width: 64 bits
// Minimum Data-Memory Width: 128 bits
// Required Load/Store Units: 1
// Requires Byte Enables: Yes
// Requires Booleans: No
// Pipeline Length: 5 stages
//
// This TIE was generated on a processor configuration with the
// following ISA instruction options enabled:
//
// CLAMPS
// MUL16
// NSA/NSAU
// MIN/MAX and MINU/MAXU
// Sign Extend to 32 Bits
// Enable Density Instructions
// Enable Boolean Registers
// Zero Overhead Loop Instructions
// Vectra LX DSP Coprocessor Instruction Family
//
////////////////////////////////////////////////////////////////////////////
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
@ -314,10 +332,9 @@ immediate_range LD.FFT_REG_immed2 -32 24 8
// flix64_0, format width 64 bits, 2 slots // flix64_0, format width 64 bits, 2 slots
format flix64_0 64 { flix64_0_slot0, flix64_0_slot1, flix64_0_slot2 } format flix64_0 64 { flix64_0_slot0, flix64_0_slot1, flix64_0_slot2 }
//Full slots: slot_opcodes flix64_0_slot0 { MOVI, J, ADDX2, L16SI, S16I, FFT_BIT_REVERSE, S32I, L32I, FFT_SHIFT_CHECK, OR, NOP, ADD, FFT_CALC, FFT_SIMD_LOAD, FFT_SIMD_STORE }
slot_opcodes flix64_0_slot0 { MOVI, J, ADDX2, L16SI, S16I, FFT_BIT_REVERSE, S32I, L32I, FFT_SHIFT_CHECK, OR, NOP, ADD, FFT_CALC }
slot_opcodes flix64_0_slot1 { SSL, SLL, MOVI, ADDX2, NOP, ADDI.N, ANDBC, ADD, MOV.N, J } slot_opcodes flix64_0_slot1 { SSL, SLL, MOVI, ADDX2, NOP, ADDI.N, ANDBC, ADD, MOV.N, J }
slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_CHECK, J, MOVI, SSL, MOV.N, S16I } slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_CHECK, J, MOVI, SSL, MOV.N, S16I, FFT_SIMD_LOAD, FFT_SIMD_STORE }
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
@ -328,3 +345,4 @@ slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_C
//-------------------------------------------------------------------------- //--------------------------------------------------------------------------
ctype FFT_REG 64 64 FFT_REG default ctype FFT_REG 64 64 FFT_REG default
ctype FFT_REG_SIMD 128 128 FFT_REG_SIMD default