fft_3dn2.c
Go to the documentation of this file.
/*
 * FFT/MDCT transform with Extended 3DNow! optimizations
 * Copyright (c) 2006-2008 Zuxy MENG Jie, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 
22 #include "libavutil/x86_cpu.h"
23 #include "libavcodec/dsputil.h"
24 #include "fft.h"
25 
26 DECLARE_ALIGNED(8, static const unsigned int, m1m1)[2] = { 1U<<31, 1U<<31 };
27 
28 #ifdef EMULATE_3DNOWEXT
29 #define PSWAPD(s,d)\
30  "movq "#s","#d"\n"\
31  "psrlq $32,"#d"\n"\
32  "punpckldq "#s","#d"\n"
33 #define ff_fft_calc_3dn2 ff_fft_calc_3dn
34 #define ff_fft_dispatch_3dn2 ff_fft_dispatch_3dn
35 #define ff_fft_dispatch_interleave_3dn2 ff_fft_dispatch_interleave_3dn
36 #define ff_imdct_calc_3dn2 ff_imdct_calc_3dn
37 #define ff_imdct_half_3dn2 ff_imdct_half_3dn
38 #else
39 #define PSWAPD(s,d) "pswapd "#s","#d"\n"
40 #endif
41 
42 void ff_fft_dispatch_3dn2(FFTComplex *z, int nbits);
44 
46 {
47  int n = 1<<s->nbits;
48  int i;
50  __asm__ volatile("femms");
51  if(n <= 8)
52  for(i=0; i<n; i+=2)
53  FFSWAP(FFTSample, z[i].im, z[i+1].re);
54 }
55 
56 void ff_imdct_half_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
57 {
58  x86_reg j, k;
59  long n = s->mdct_size;
60  long n2 = n >> 1;
61  long n4 = n >> 2;
62  long n8 = n >> 3;
63  const uint16_t *revtab = s->revtab;
64  const FFTSample *tcos = s->tcos;
65  const FFTSample *tsin = s->tsin;
66  const FFTSample *in1, *in2;
67  FFTComplex *z = (FFTComplex *)output;
68 
69  /* pre rotation */
70  in1 = input;
71  in2 = input + n2 - 1;
72 #ifdef EMULATE_3DNOWEXT
73  __asm__ volatile("movd %0, %%mm7" ::"r"(1U<<31));
74 #endif
75  for(k = 0; k < n4; k++) {
76  // FIXME a single block is faster, but gcc 2.95 and 3.4.x on 32bit can't compile it
77  __asm__ volatile(
78  "movd %0, %%mm0 \n"
79  "movd %2, %%mm1 \n"
80  "punpckldq %1, %%mm0 \n"
81  "punpckldq %3, %%mm1 \n"
82  "movq %%mm0, %%mm2 \n"
83  PSWAPD( %%mm1, %%mm3 )
84  "pfmul %%mm1, %%mm0 \n"
85  "pfmul %%mm3, %%mm2 \n"
86 #ifdef EMULATE_3DNOWEXT
87  "movq %%mm0, %%mm1 \n"
88  "punpckhdq %%mm2, %%mm0 \n"
89  "punpckldq %%mm2, %%mm1 \n"
90  "pxor %%mm7, %%mm0 \n"
91  "pfadd %%mm1, %%mm0 \n"
92 #else
93  "pfpnacc %%mm2, %%mm0 \n"
94 #endif
95  ::"m"(in2[-2*k]), "m"(in1[2*k]),
96  "m"(tcos[k]), "m"(tsin[k])
97  );
98  __asm__ volatile(
99  "movq %%mm0, %0 \n\t"
100  :"=m"(z[revtab[k]])
101  );
102  }
103 
105 
106 #define CMUL(j,mm0,mm1)\
107  "movq (%2,"#j",2), %%mm6 \n"\
108  "movq 8(%2,"#j",2), "#mm0"\n"\
109  "movq %%mm6, "#mm1"\n"\
110  "movq "#mm0",%%mm7 \n"\
111  "pfmul (%3,"#j"), %%mm6 \n"\
112  "pfmul (%4,"#j"), "#mm0"\n"\
113  "pfmul (%4,"#j"), "#mm1"\n"\
114  "pfmul (%3,"#j"), %%mm7 \n"\
115  "pfsub %%mm6, "#mm0"\n"\
116  "pfadd %%mm7, "#mm1"\n"
117 
118  /* post rotation */
119  j = -n2;
120  k = n2-8;
121  __asm__ volatile(
122  "1: \n"
123  CMUL(%0, %%mm0, %%mm1)
124  CMUL(%1, %%mm2, %%mm3)
125  "movd %%mm0, (%2,%0,2) \n"
126  "movd %%mm1,12(%2,%1,2) \n"
127  "movd %%mm2, (%2,%1,2) \n"
128  "movd %%mm3,12(%2,%0,2) \n"
129  "psrlq $32, %%mm0 \n"
130  "psrlq $32, %%mm1 \n"
131  "psrlq $32, %%mm2 \n"
132  "psrlq $32, %%mm3 \n"
133  "movd %%mm0, 8(%2,%0,2) \n"
134  "movd %%mm1, 4(%2,%1,2) \n"
135  "movd %%mm2, 8(%2,%1,2) \n"
136  "movd %%mm3, 4(%2,%0,2) \n"
137  "sub $8, %1 \n"
138  "add $8, %0 \n"
139  "jl 1b \n"
140  :"+r"(j), "+r"(k)
141  :"r"(z+n8), "r"(tcos+n8), "r"(tsin+n8)
142  :"memory"
143  );
144  __asm__ volatile("femms");
145 }
146 
147 void ff_imdct_calc_3dn2(FFTContext *s, FFTSample *output, const FFTSample *input)
148 {
149  x86_reg j, k;
150  long n = s->mdct_size;
151  long n4 = n >> 2;
152 
153  ff_imdct_half_3dn2(s, output+n4, input);
154 
155  j = -n;
156  k = n-8;
157  __asm__ volatile(
158  "movq %4, %%mm7 \n"
159  "1: \n"
160  PSWAPD((%2,%1), %%mm0)
161  PSWAPD((%3,%0), %%mm1)
162  "pxor %%mm7, %%mm0 \n"
163  "movq %%mm1, (%3,%1) \n"
164  "movq %%mm0, (%2,%0) \n"
165  "sub $8, %1 \n"
166  "add $8, %0 \n"
167  "jl 1b \n"
168  :"+r"(j), "+r"(k)
169  :"r"(output+n4), "r"(output+n4*3),
170  "m"(*m1m1)
171  );
172  __asm__ volatile("femms");
173 }
174