

Code: Select all
unsigned const diff_offset = (0x440 << 21) | (0x207 << 11) | 0x407;
unsigned const diff_mask = (0x380 << 21) | (0x1F0 << 11) | 0x3F0;
// you'd use a table in place of this, and do the rescaling
// of y and v before reducing to 1/256 precision
unsigned to_yuv( unsigned rgb )
{
	int r = rgb >> 16 & 0xF8;
	int g = rgb >> 8 & 0xFC;
	int b = rgb >> 0 & 0xF8;
	int y = (r + g + b) >> 2;
	int u = ((r - b) >> 2) + 128;
	int v = ((g * 2 - r - b) >> 3) + 128;
	// these are the changes
	y = y * 0x3F / 0x30;
	v = v * 7 / 6;
	return (y << 21) + (u << 11) + v;
}

// non-zero if pixels differ enough
unsigned diff( unsigned x, unsigned y )
{
	x = to_yuv( x );
	y = to_yuv( y );
	return (x - y + diff_offset) & diff_mask;
}
// calculation of pattern index inside blitter
// add the offset now instead of 8 times below
unsigned middle = to_yuv( w [5] ) + diff_offset;
int pattern;
// negation of result sets highest bit when pixels differ, which is then shifted to proper position
pattern = -((middle - to_yuv( w [1] )) & diff_mask) >> (31 - 0);
pattern |= -((middle - to_yuv( w [2] )) & diff_mask) >> (30 - 1) & (1 << 1);
pattern |= -((middle - to_yuv( w [3] )) & diff_mask) >> (29 - 2) & (1 << 2);
pattern |= -((middle - to_yuv( w [4] )) & diff_mask) >> (28 - 3) & (1 << 3);
pattern |= -((middle - to_yuv( w [6] )) & diff_mask) >> (27 - 4) & (1 << 4);
pattern |= -((middle - to_yuv( w [7] )) & diff_mask) >> (26 - 5) & (1 << 5);
pattern |= -((middle - to_yuv( w [8] )) & diff_mask) >> (25 - 6) & (1 << 6);
pattern |= -((middle - to_yuv( w [9] )) & diff_mask) >> (24 - 7) & (1 << 7);
switch ( pattern )
...
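The comment above suggests precomputing all of this; here is a rough sketch of what such a table build might look like, assuming 15-bit RGB555 source pixels and the same packed (y << 21) | (u << 11) | v layout. The table name matches the later snippet, but the pixel format and the omission of the y/v rescaling step are my assumptions, not the poster's actual code.
Code: Select all
// hypothetical precomputed table; assumes RGB555 pixels and omits the
// y/v rescaling step mentioned in the comment above
static unsigned rgbtoyuv [1 << 15];

static void init_rgbtoyuv( void )
{
	for ( unsigned rgb = 0; rgb < (1 << 15); rgb++ )
	{
		// expand the 5-bit fields to an 8-bit range
		int r = (rgb >> 10 & 0x1F) << 3;
		int g = (rgb >>  5 & 0x1F) << 3;
		int b = (rgb       & 0x1F) << 3;

		int y = (r + g + b) >> 2;
		int u = ((r - b) >> 2) + 128;
		int v = ((g * 2 - r - b) >> 3) + 128;

		rgbtoyuv [rgb] = (y << 21) + (u << 11) + v;
	}
}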
Code: Select all
int pattern;
uint32 yx = rgbtoyuv[w[5]] + diff_offset;
pattern = ydiff(yx, w[1]);
pattern |= ydiff(yx, w[2]) << 1;
pattern |= ydiff(yx, w[3]) << 2;
pattern |= ydiff(yx, w[4]) << 3;
pattern |= ydiff(yx, w[6]) << 4;
pattern |= ydiff(yx, w[7]) << 5;
pattern |= ydiff(yx, w[8]) << 6;
pattern |= ydiff(yx, w[9]) << 7;
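ydiff itself isn't shown in the post; presumably it just wraps the subtract-and-mask test from the first listing so it yields 0 or 1. A guess at a compatible definition (an assumption, not the poster's actual code):
Code: Select all
// assumed definition: the masked difference is non-zero when the pixels
// differ enough; !! collapses it to 0 or 1 for use as a pattern bit
#define ydiff( yx, pix ) (!!(((yx) - rgbtoyuv[(pix)]) & diff_mask))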
Code: Select all
double kr = 0.2126, kb = 0.0722, kg = (1.0 - kr - kb);
y = double(r) * kr + double(g) * kg + double(b) * kb;
cb = 128.0 + (double(b) - y) / (2.0 - 2.0 * kb);
cr = 128.0 + (double(r) - y) / (2.0 - 2.0 * kr);
Code: Select all
X      + Adjust - Y      = Raw    & Mask   = Result
----------------------------------------------------
0x40   + 0x210  - 0x30   = 0x220  & 0x1E0  = non-zero
0x40   + 0x210  - 0x31   = 0x21F  & 0x1E0  = zero
0x40   + 0x210  - 0x50   = 0x200  & 0x1E0  = zero
0x40   + 0x210  - 0x51   = 0x1FF  & 0x1E0  = non-zero
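Those rows are easy to check mechanically; a throwaway test of mine using the same single-field constants (offset 0x210, mask 0x1E0, X fixed at 0x40):
Code: Select all
#include <stdio.h>

int main( void )
{
	unsigned const adjust = 0x210;
	unsigned const mask   = 0x1E0;
	unsigned const x      = 0x40;
	unsigned const ys [4] = { 0x30, 0x31, 0x50, 0x51 };

	// prints the Raw, Mask, and Result columns of the table above
	for ( int i = 0; i < 4; i++ )
	{
		unsigned raw = x + adjust - ys [i];
		printf( "0x%03X & 0x%03X = %s\n", raw, mask,
		        (raw & mask) ? "non-zero" : "zero" );
	}
	return 0;
}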
Code: Select all
static inline unsigned blend1(unsigned c1, unsigned c2) {
	// (c1*3 + c2) / 4; lowbits removes the per-field bits that would be shifted out
	const unsigned lowbits=(((c1<<1)&0x0842)+(c1&0x0C63)+(c2&0x0C63))&0x0C63;
	return ((c1*3+c2) - lowbits) >> 2;
}
static inline unsigned blend2(unsigned c1, unsigned c2, unsigned c3) {
	// (c1*2 + c2 + c3) / 4
	c1<<=1;
	const unsigned lowbits=((c1&0x0842)+(c2&0x0C63)+(c3&0x0C63))&0x0C63;
	return ((c1+c2+c3) - lowbits) >> 2;
}
static inline unsigned blend5(unsigned c1, unsigned c2) {
	// (c1 + c2) / 2
	return ( c1+c2 - ((c1^c2)&0x421) ) >> 1;
}
static inline unsigned blend6(unsigned c1, unsigned c2, unsigned c3) {
	// (c1*5 + c2*2 + c3) / 8
	c2<<=1;
	const unsigned lowbits=( ((c1<<2)&0x1084)+(c1&0x1CE7)+(c2&0x18C6)+(c3&0x1CE7) ) & 0x1CE7;
	return ((c1*5+c2+c3) - lowbits) >> 3;
}
static inline unsigned blend7(unsigned c1, unsigned c2, unsigned c3) {
	// (c1*6 + c2 + c3) / 8
	const unsigned lowbits=(((((c1<<1)&0x0842)+(c1&0x0C63))<<1)+(c2&0x1CE7)+(c3&0x1CE7))&0x1CE7;
	return ((c1*6+c2+c3) - lowbits) >> 3;
}
static inline unsigned blend9(unsigned c1, unsigned c2, unsigned c3) {
	// (c1*2 + (c2 + c3)*3) / 8
	c1<<=1;
	const unsigned rb=(c1&0xF83E)+((c2&0x7C1F)+(c3&0x7C1F))*3;
	const unsigned g=(c1&0x07C0)+((c2&0x03E0)+(c3&0x03E0))*3;
	return ((rb&0x3E0F8)|(g&0x01F00))>>3;
}
static inline unsigned blend10(unsigned c1, unsigned c2, unsigned c3) {
	// (c1*14 + c2 + c3) / 16
	const unsigned rb=(c1&0x7C1F)*14+(c2&0x7C1F)+(c3&0x7C1F);
	const unsigned g=(c1&0x03E0)*14+(c2&0x03E0)+(c3&0x03E0);
	return ((rb&0x7C1F0)|(g&0x03E00))>>4;
}
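As a quick sanity check, the simplest of these (blend5, the plain average) can be compared against a per-channel reference over random RGB555 pairs. This harness is mine, not part of the post, and assumes the functions above are in the same file:
Code: Select all
#include <assert.h>
#include <stdlib.h>

// per-channel reference average for RGB555, rounding down like blend5
static unsigned avg555( unsigned a, unsigned b )
{
	unsigned r = ((a >> 10 & 0x1F) + (b >> 10 & 0x1F)) >> 1;
	unsigned g = ((a >>  5 & 0x1F) + (b >>  5 & 0x1F)) >> 1;
	unsigned c = ((a       & 0x1F) + (b       & 0x1F)) >> 1;
	return (r << 10) | (g << 5) | c;
}

int main( void )
{
	srand( 1 );
	for ( int i = 0; i < 1000000; i++ )
	{
		unsigned a = (unsigned) rand() & 0x7FFF;
		unsigned b = (unsigned) rand() & 0x7FFF;
		assert( blend5( a, b ) == avg555( a, b ) );
	}
	return 0;
}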
Code: Select all
static inline unsigned blend1(unsigned c1, unsigned c2) {
	// (c1*3 + c2) / 4, with c1*3 formed as (c1<<1) + c1
	const unsigned tmp=c1<<1;
	const unsigned lowbits=((tmp&0x0842)+(c1&0x0C63)+(c2&0x0C63))&0x0C63;
	return ((tmp+c1+c2) - lowbits) >> 2;
}
Code: Select all
A-B-C-D
|X|X|X|
E-F-G-H
|X|X|X|
I-J-K-L
|X|X|X|
M-N-O-P
123
4.5
678
Code: Select all
// reuse only the horizontal neighbor from the previous pixel
pattern = (pattern & 0x10) >> 1;
pattern |= -((middle - to_yuv( w [1] )) & diff_mask) >> (31 - 0);
pattern |= -((middle - to_yuv( w [2] )) & diff_mask) >> (30 - 1) & (1 << 1);
pattern |= -((middle - to_yuv( w [3] )) & diff_mask) >> (29 - 2) & (1 << 2);
//pattern |= -((middle - to_yuv( w [4] )) & diff_mask) >> (28 - 3) & (1 << 3);
pattern |= -((middle - to_yuv( w [6] )) & diff_mask) >> (27 - 4) & (1 << 4);
pattern |= -((middle - to_yuv( w [7] )) & diff_mask) >> (26 - 5) & (1 << 5);
pattern |= -((middle - to_yuv( w [8] )) & diff_mask) >> (25 - 6) & (1 << 6);
pattern |= -((middle - to_yuv( w [9] )) & diff_mask) >> (24 - 7) & (1 << 7);

// reuse the horizontal neighbor plus the vertical neighbors cached in ptable from the row above
pattern = (pattern & 0x10) >> 1;
//omit -width as well?
pattern |= ptable[-width-1] >> 7; //no need to mask
pattern |= (ptable[-width] & 0x40) >> 5;
//pattern = -((middle - to_yuv( w [1] )) & diff_mask) >> (31 - 0);
//pattern |= -((middle - to_yuv( w [2] )) & diff_mask) >> (30 - 1) & (1 << 1);
pattern |= -((middle - to_yuv( w [3] )) & diff_mask) >> (29 - 2) & (1 << 2);
//pattern |= -((middle - to_yuv( w [4] )) & diff_mask) >> (28 - 3) & (1 << 3);
pattern |= -((middle - to_yuv( w [6] )) & diff_mask) >> (27 - 4) & (1 << 4);
pattern |= -((middle - to_yuv( w [7] )) & diff_mask) >> (26 - 5) & (1 << 5);
pattern |= -((middle - to_yuv( w [8] )) & diff_mask) >> (25 - 6) & (1 << 6);
pattern |= -((middle - to_yuv( w [9] )) & diff_mask) >> (24 - 7) & (1 << 7);
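For reference, the bit correspondences that make this reuse work; ptable is assumed to be a per-pixel cache of the previous row's pattern bytes, indexed relative to the current pixel as in the snippet:
Code: Select all
/* pattern bit layout used above:
     bit0=w[1]  bit1=w[2]  bit2=w[3]  bit3=w[4]
     bit4=w[6]  bit5=w[7]  bit6=w[8]  bit7=w[9]

   horizontal: this pixel's w[4] (left, bit 3) is the previous pixel's
   w[6] (right, bit 4), hence (pattern & 0x10) >> 1

   vertical:   this pixel's w[1] (up-left, bit 0) was w[9] (bit 7) of the
   pixel above-left, and w[2] (up, bit 1) was w[8] (bit 6) of the pixel
   directly above, hence ptable[-width-1] >> 7 and
   (ptable[-width] & 0x40) >> 5 */
Note that the offset/mask diff isn't perfectly symmetric right at the threshold (the worked table above shows +0x10 counting as different while -0x10 counts as same), so a reused bit can occasionally disagree with a freshly computed one by a single threshold step.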
Code: Select all
avg = (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F);
Code: Select all
ls = (x & 0x7F7F7F7F) + (y & 0x7F7F7F7F);
hs = (x ^ y) & 0x80808080;
hc = (x & y) & 0x80808080;
s = ls ^ hs;
c = (ls & hs) | hc;
mask = ((c >> 7) + 0x7F7F7F7F) ^ 0x7F7F7F7F;
dest = s | mask;
Nifty. X & Y yields the carries from each bit and X ^ Y yields the individual sums of each bit (without carries).
Code: Select all
avg = (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F);
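A quick way to convince yourself the masked-average trick is exact: compare it against a per-byte reference over random inputs (this check is mine, not from the thread):
Code: Select all
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

// reference: average each byte independently, rounding down
static uint32_t avg_bytes( uint32_t x, uint32_t y )
{
	uint32_t out = 0;
	for ( int i = 0; i < 32; i += 8 )
	{
		uint32_t a = x >> i & 0xFF;
		uint32_t b = y >> i & 0xFF;
		out |= ((a + b) >> 1) << i;
	}
	return out;
}

int main( void )
{
	srand( 1 );
	for ( int n = 0; n < 1000000; n++ )
	{
		uint32_t x = ((uint32_t) rand() << 16) ^ (uint32_t) rand();
		uint32_t y = ((uint32_t) rand() << 16) ^ (uint32_t) rand();
		uint32_t avg = (x & y) + (((x ^ y) >> 1) & 0x7F7F7F7F);
		assert( avg == avg_bytes( x, y ) );
	}
	return 0;
}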
The saturated-add code above is exactly the same as the MMX op "paddusb". It can also be adjusted to work with field sizes other than 8 bits.

The mask can be generated without the large constants, which might generate less code on some machines and allow more parallelism (the two shifts could be done simultaneously). On machines without three-operand instructions (a = b OP c) it might be worse though.
Code: Select all
mask = (c << 1) - (c >> 7);
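And a throwaway check (mine) that the small-constant form matches the original mask for every possible combination of carry bits, since c can only ever have bits 7, 15, 23 and 31 set:
Code: Select all
#include <assert.h>
#include <stdint.h>

int main( void )
{
	// c only ever has bits 7, 15, 23, 31 set, so 16 cases cover everything
	for ( unsigned i = 0; i < 16; i++ )
	{
		uint32_t c = 0;
		if ( i & 1 ) c |= 0x00000080;
		if ( i & 2 ) c |= 0x00008000;
		if ( i & 4 ) c |= 0x00800000;
		if ( i & 8 ) c |= 0x80000000;

		uint32_t mask1 = ((c >> 7) + 0x7F7F7F7F) ^ 0x7F7F7F7F;
		uint32_t mask2 = (c << 1) - (c >> 7);
		assert( mask1 == mask2 );
	}
	return 0;
}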
blargg wrote:
I was browsing the hq2x author's website and found the filter quite impressive for the example images. I looked at the optimized assembly versions and have a feeling that my optimized C version will outperform it at this point.
I don't think so.
DMV27 wrote:
This code is exactly the same as the MMX op "paddusb". It can also be adjusted to work with field sizes other than 8 bits.
Code: Select all
ls = (x & 0x7F7F7F7F) + (y & 0x7F7F7F7F);
hs = (x ^ y) & 0x80808080;
hc = (x & y) & 0x80808080;
s = ls ^ hs;
c = (ls & hs) | hc;
mask = ((c >> 7) + 0x7F7F7F7F) ^ 0x7F7F7F7F;
dest = s | mask;
Here is a faster one that can be used when adding 24-bit values (note: upper 8 bits = 0):
Code: Select all
dest=(x&0xfefeff)+(y&0xfefeff);
tmp=dest&0x1010100;
tmp=tmp-(tmp>>8);
dest|=tmp;
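A small usage sketch for that 24-bit variant (mine, not from the thread). Note that the masks deliberately drop the low bit of the middle and top bytes, and the stray carry bits above each byte are assumed to be ignored or masked off downstream:
Code: Select all
#include <assert.h>
#include <stdint.h>

// saturating per-byte add of two 0x00RRGGBB values, as above
static uint32_t add24_sat( uint32_t x, uint32_t y )
{
	uint32_t dest = (x & 0xFEFEFF) + (y & 0xFEFEFF);
	uint32_t tmp  = dest & 0x1010100;
	tmp = tmp - (tmp >> 8);
	return dest | tmp;
}

int main( void )
{
	// no overflow: exact result as long as the green/red low bits are clear
	assert( (add24_sat( 0x102030, 0x020406 ) & 0xFFFFFF) == 0x122436 );

	// green and red overflow and clamp to 0xFF; blue stays exact
	assert( (add24_sat( 0x808010, 0x909020 ) & 0xFFFFFF) == 0xFFFF30 );
	return 0;
}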
Code: Select all
%macro TestDiff 2
xor ecx,ecx
mov edx,[%1]
cmp edx,[%2]
je %%fin
mov ecx,_RGBtoYUV
movd mm1,[ecx+edx*4]
movq mm5,mm1
mov edx,[%2]
movd mm2,[ecx+edx*4]
psubusb mm1,mm2
psubusb mm2,mm5
por mm1,mm2
psubusb mm1,[threshold]
movd ecx,mm1
%%fin:
%endmacro
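For readers who don't read MMX, the macro amounts to a per-channel "differs by more than a threshold" test built from saturating byte subtraction. A rough C rendering of what it computes (my reading of the assembly, assuming RGBtoYUV packs one byte per channel and threshold holds per-channel limits in the same layout):
Code: Select all
#include <stdint.h>

extern uint32_t RGBtoYUV [];     /* packed Y/U/V, one byte per channel  */
extern uint32_t threshold;       /* per-channel limits, same byte order */

/* non-zero when any channel of the two pixels differs by more than its
   threshold; px1 == px2 takes the same early exit as the 'je %%fin' */
static uint32_t test_diff( uint32_t px1, uint32_t px2 )
{
	if ( px1 == px2 )
		return 0;

	uint32_t a = RGBtoYUV [px1];
	uint32_t b = RGBtoYUV [px2];
	uint32_t result = 0;

	for ( int i = 0; i < 32; i += 8 )
	{
		uint32_t ca = a >> i & 0xFF;
		uint32_t cb = b >> i & 0xFF;
		uint32_t th = threshold >> i & 0xFF;
		uint32_t d  = ca > cb ? ca - cb : cb - ca;   /* |a - b|, like psubusb + por */
		if ( d > th )
			result |= (d - th) << i;                 /* psubusb against threshold   */
	}
	return result;
}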
Still, I wonder if the C code "(x_yuv - y_yuv + offset) & mask" beats this, since it's also pretty damn simple.
byuu, any reason you aren't using MaxST's MMX-optimized version for your x86 builds?
1) His MMX code is GPL, which I cannot use, and his C code is LGPL, which I hopefully can.