Merge remote-tracking branch 'github/simd'
This commit is contained in:
commit
608a750738
@ -126,6 +126,7 @@ int fix_fft_org(fixed fr[], fixed fi[], int m, int inverse)
|
|||||||
wr >>= 1;
|
wr >>= 1;
|
||||||
wi >>= 1;
|
wi >>= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
for(i=m; i<n; i+=istep)
|
for(i=m; i<n; i+=istep)
|
||||||
{
|
{
|
||||||
|
|
||||||
@ -148,6 +149,7 @@ int fix_fft_org(fixed fr[], fixed fi[], int m, int inverse)
|
|||||||
fi[i] = qi + ti;
|
fi[i] = qi + ti;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
--k;
|
--k;
|
||||||
l = istep;
|
l = istep;
|
||||||
}
|
}
|
||||||
|
50
fft.c
50
fft.c
@ -37,6 +37,14 @@
|
|||||||
|
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#define FFT_COMBINED_STORE(_fr, _fi, _i, _simd_r, _simd_i) \
|
||||||
|
asm ("{" "\n" \
|
||||||
|
" fft_simd_store %1, %0, %2" "\n" \
|
||||||
|
" nop" "\n" \
|
||||||
|
" fft_simd_store %3, %0, %4" "\n" \
|
||||||
|
"}" \
|
||||||
|
:: "r" (_i), "r" (_fr), "r" (_simd_r), "r" (_fi), "r" (_simd_i));
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* fix_fft() - perform fast Fourier transform.
|
* fix_fft() - perform fast Fourier transform.
|
||||||
*
|
*
|
||||||
@ -112,26 +120,40 @@ int fix_fft(fixed fr[], fixed fi[], int m, int inverse)
|
|||||||
istep = l << 1; //step width of current butterfly
|
istep = l << 1; //step width of current butterfly
|
||||||
|
|
||||||
FFT_REG reg;
|
FFT_REG reg;
|
||||||
|
FFT_REG_SIMD simd_r, simd_i;
|
||||||
fixed *reg_s = ((fixed*) &reg);
|
fixed *reg_s = ((fixed*) &reg);
|
||||||
|
|
||||||
for(m=0; m<n; m+=istep)
|
if(l == 1)
|
||||||
{
|
{
|
||||||
for(i=m; i<m+l; ++i)
|
for(i=0; i<n; i+=8)
|
||||||
{
|
{
|
||||||
j = i + l;
|
simd_r = FFT_simd_load(fr, i);
|
||||||
|
simd_i = FFT_simd_load(fi, i);
|
||||||
|
FFT_SIMD_FIRST(simd_r, simd_i, (xtbool) shift);
|
||||||
|
FFT_COMBINED_STORE(fr, fi, i, simd_r, simd_i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
for(m=0; m<n; m+=istep)
|
||||||
|
{
|
||||||
|
for(i=m; i<m+l; ++i)
|
||||||
|
{
|
||||||
|
j = i + l;
|
||||||
|
|
||||||
reg_s[3] = fr[i];
|
reg_s[3] = fr[i];
|
||||||
reg_s[2] = fr[j];
|
reg_s[2] = fr[j];
|
||||||
reg_s[1] = fi[i];
|
reg_s[1] = fi[i];
|
||||||
reg_s[0] = fi[j];
|
reg_s[0] = fi[j];
|
||||||
|
|
||||||
FFT_CALC(reg, i << k, (xtbool) shift, inverse);
|
FFT_CALC(reg, i << k, (xtbool) shift, inverse);
|
||||||
|
|
||||||
fr[i] = reg_s[3];
|
fr[i] = reg_s[3];
|
||||||
fr[j] = reg_s[2];
|
fr[j] = reg_s[2];
|
||||||
fi[i] = reg_s[1];
|
fi[i] = reg_s[1];
|
||||||
fi[j] = reg_s[0];
|
fi[j] = reg_s[0];
|
||||||
}
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
--k;
|
--k;
|
||||||
l = istep;
|
l = istep;
|
||||||
|
130
fft_inst.tie
130
fft_inst.tie
@ -132,6 +132,62 @@ table SIN_WAVE 16 1024 {
|
|||||||
|
|
||||||
regfile FFT_REG 64 2 fftv
|
regfile FFT_REG 64 2 fftv
|
||||||
|
|
||||||
|
regfile FFT_REG_SIMD 128 2 fftsv
|
||||||
|
|
||||||
|
function [31:0] FFT_VAR_SHIFT([31:0] data, [3:0] sh)
|
||||||
|
{
|
||||||
|
assign FFT_VAR_SHIFT = TIEmux(sh,
|
||||||
|
data[31:0],
|
||||||
|
{data[30:0], 1'b0},
|
||||||
|
{data[29:0], 2'b0},
|
||||||
|
{data[28:0], 3'b0},
|
||||||
|
{data[27:0], 4'b0},
|
||||||
|
{data[26:0], 5'b0},
|
||||||
|
{data[25:0], 6'b0},
|
||||||
|
{data[24:0], 7'b0},
|
||||||
|
{data[23:0], 8'b0},
|
||||||
|
{data[22:0], 9'b0},
|
||||||
|
{data[21:0], 10'b0},
|
||||||
|
{data[20:0], 11'b0},
|
||||||
|
{data[19:0], 12'b0},
|
||||||
|
{data[18:0], 13'b0},
|
||||||
|
{data[17:0], 14'b0},
|
||||||
|
{data[16:0], 15'b0}
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
operation FFT_SIMD_LOAD {in AR *base, in AR offset, out FFT_REG_SIMD data} {out VAddr, in MemDataIn128}
|
||||||
|
{
|
||||||
|
assign VAddr = TIEadd(base, {offset[30:0], 1'b0}, 1'b0);
|
||||||
|
|
||||||
|
wire [15:0] o1 = MemDataIn128[15:0];
|
||||||
|
wire [15:0] o2 = MemDataIn128[31:16];
|
||||||
|
wire [15:0] o3 = MemDataIn128[47:32];
|
||||||
|
wire [15:0] o4 = MemDataIn128[63:48];
|
||||||
|
wire [15:0] o5 = MemDataIn128[79:64];
|
||||||
|
wire [15:0] o6 = MemDataIn128[95:80];
|
||||||
|
wire [15:0] o7 = MemDataIn128[111:96];
|
||||||
|
wire [15:0] o8 = MemDataIn128[127:112];
|
||||||
|
|
||||||
|
assign data = {o1, o2, o3, o4, o5, o6, o7, o8 };
|
||||||
|
}
|
||||||
|
|
||||||
|
operation FFT_SIMD_STORE {in AR *base, in AR offset, in FFT_REG_SIMD data} {out VAddr, out MemDataOut128}
|
||||||
|
{
|
||||||
|
assign VAddr = TIEadd(base, {offset[30:0], 1'b0}, 1'b0);
|
||||||
|
|
||||||
|
wire [15:0] o1 = data[15:0];
|
||||||
|
wire [15:0] o2 = data[31:16];
|
||||||
|
wire [15:0] o3 = data[47:32];
|
||||||
|
wire [15:0] o4 = data[63:48];
|
||||||
|
wire [15:0] o5 = data[79:64];
|
||||||
|
wire [15:0] o6 = data[95:80];
|
||||||
|
wire [15:0] o7 = data[111:96];
|
||||||
|
wire [15:0] o8 = data[127:112];
|
||||||
|
|
||||||
|
assign MemDataOut128 = {o1, o2, o3, o4, o5, o6, o7, o8 };
|
||||||
|
}
|
||||||
|
|
||||||
operation FFT_SHIFT_CHECK {in AR *base, in AR offset, out AR needs_shift} {out VAddr, in MemDataIn128}
|
operation FFT_SHIFT_CHECK {in AR *base, in AR offset, out AR needs_shift} {out VAddr, in MemDataIn128}
|
||||||
{
|
{
|
||||||
assign VAddr = TIEadd(base, offset[31:1], 1'b0);
|
assign VAddr = TIEadd(base, offset[31:1], 1'b0);
|
||||||
@ -243,59 +299,21 @@ operation FFT_CALC {inout FFT_REG data, in AR i, in BR shift, in BR inverse} {}
|
|||||||
assign data = FFT_BUTTERFLY(data, wr, wi, shift);
|
assign data = FFT_BUTTERFLY(data, wr, wi, shift);
|
||||||
}
|
}
|
||||||
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
// 4 butterflies at once
|
||||||
//
|
operation FFT_SIMD_FIRST {inout FFT_REG_SIMD fr, inout FFT_REG_SIMD fi, in BR shift} {}
|
||||||
// Generated by XPRES v4.0.4
|
{
|
||||||
// Sat Mar 07 17:29:10 2015
|
|
||||||
//
|
wire [15:0] wr = TIEmux(shift, 16'h7fff, 16'h3fff);
|
||||||
// Register Files
|
wire [15:0] wi = 16'b0;
|
||||||
// AR (a): 16 entries, 32 bits per entry, ports 3r / 2w
|
|
||||||
// vec (v): 16 entries, 160 bits per entry, ports 3r / 1w
|
wire [63:0] res1 = FFT_butterfly({fr[127:96], fi[127:96]}, wr, wi, shift);
|
||||||
// valign (u): 4 entries, 128 bits per entry, ports 2r / 1w
|
wire [63:0] res2 = FFT_butterfly({fr[95:64], fi[95:64]}, wr, wi, shift);
|
||||||
// sel (s): 8 entries, 32 bits per entry, ports 2r / 1w
|
wire [63:0] res3 = FFT_butterfly({fr[63:32], fi[63:32]}, wr, wi, shift);
|
||||||
// FFT_reg (fftv): 2 entries, 64 bits per entry, ports 1r / 1w
|
wire [63:0] res4 = FFT_butterfly({fr[31:0], fi[31:0]}, wr, wi, shift);
|
||||||
//
|
|
||||||
// FLIX Formats
|
assign fr = { res1[63:32], res2[63:32], res3[63:32], res4[63:32] };
|
||||||
// x24: size 24 bits, 1 slot
|
assign fi = { res1[31:0], res2[31:0], res3[31:0], res4[31:0] };
|
||||||
// slot Inst: size 24 bits
|
}
|
||||||
// opcodes { }
|
|
||||||
// x64: size 64 bits, 3 slots
|
|
||||||
// slot vsLDST: size 24 bits
|
|
||||||
// opcodes { }
|
|
||||||
// slot vsMAC: size 18 bits
|
|
||||||
// opcodes { }
|
|
||||||
// slot vsALU: size 18 bits
|
|
||||||
// opcodes { }
|
|
||||||
// flix64_0: size 64 bits, 2 slots
|
|
||||||
// slot flix64_0_slot0: size 24 bits
|
|
||||||
// opcodes { L16SI NOP ld.FFT_reg mv.FFT_reg st.FFT_reg }
|
|
||||||
// slot flix64_0_slot1: size 10 bits
|
|
||||||
// opcodes { MOV.N NOP }
|
|
||||||
//
|
|
||||||
// This TIE requires the following configuration settings:
|
|
||||||
//
|
|
||||||
// Required Endian: Little
|
|
||||||
// Required Instruction Width: 64 bits
|
|
||||||
// Minimum Data-Memory Width: 128 bits
|
|
||||||
// Required Load/Store Units: 1
|
|
||||||
// Requires Byte Enables: Yes
|
|
||||||
// Requires Booleans: No
|
|
||||||
// Pipeline Length: 5 stages
|
|
||||||
//
|
|
||||||
// This TIE was generated on a processor configuration with the
|
|
||||||
// following ISA instruction options enabled:
|
|
||||||
//
|
|
||||||
// CLAMPS
|
|
||||||
// MUL16
|
|
||||||
// NSA/NSAU
|
|
||||||
// MIN/MAX and MINU/MAXU
|
|
||||||
// Sign Extend to 32 Bits
|
|
||||||
// Enable Density Instructions
|
|
||||||
// Enable Boolean Registers
|
|
||||||
// Zero Overhead Loop Instructions
|
|
||||||
// Vectra LX DSP Coprocessor Instruction Family
|
|
||||||
//
|
|
||||||
////////////////////////////////////////////////////////////////////////////
|
|
||||||
|
|
||||||
|
|
||||||
//--------------------------------------------------------------------------
|
//--------------------------------------------------------------------------
|
||||||
@ -314,10 +332,9 @@ immediate_range LD.FFT_REG_immed2 -32 24 8
|
|||||||
// flix64_0, format width 64 bits, 3 slots
|
// flix64_0, format width 64 bits, 3 slots
|
||||||
format flix64_0 64 { flix64_0_slot0, flix64_0_slot1, flix64_0_slot2 }
|
format flix64_0 64 { flix64_0_slot0, flix64_0_slot1, flix64_0_slot2 }
|
||||||
|
|
||||||
//Full slots:
|
slot_opcodes flix64_0_slot0 { MOVI, J, ADDX2, L16SI, S16I, FFT_BIT_REVERSE, S32I, L32I, FFT_SHIFT_CHECK, OR, NOP, ADD, FFT_CALC, FFT_SIMD_LOAD, FFT_SIMD_STORE }
|
||||||
slot_opcodes flix64_0_slot0 { MOVI, J, ADDX2, L16SI, S16I, FFT_BIT_REVERSE, S32I, L32I, FFT_SHIFT_CHECK, OR, NOP, ADD, FFT_CALC }
|
|
||||||
slot_opcodes flix64_0_slot1 { SSL, SLL, MOVI, ADDX2, NOP, ADDI.N, ANDBC, ADD, MOV.N, J }
|
slot_opcodes flix64_0_slot1 { SSL, SLL, MOVI, ADDX2, NOP, ADDI.N, ANDBC, ADD, MOV.N, J }
|
||||||
slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_CHECK, J, MOVI, SSL, MOV.N, S16I }
|
slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_CHECK, J, MOVI, SSL, MOV.N, S16I, FFT_SIMD_LOAD, FFT_SIMD_STORE }
|
||||||
|
|
||||||
//--------------------------------------------------------------------------
|
//--------------------------------------------------------------------------
|
||||||
|
|
||||||
@ -328,3 +345,4 @@ slot_opcodes flix64_0_slot2 { S32I, ADDI.N, L32I, L16SI, ADDX2, NOP, FFT_SHIFT_C
|
|||||||
//--------------------------------------------------------------------------
|
//--------------------------------------------------------------------------
|
||||||
|
|
||||||
ctype FFT_REG 64 64 FFT_REG default
|
ctype FFT_REG 64 64 FFT_REG default
|
||||||
|
ctype FFT_REG_SIMD 128 128 FFT_REG_SIMD default
|
||||||
|
Loading…
Reference in New Issue
Block a user