/************************************************************************** ** ** svd3 ** ** Quick singular value decomposition as described by: ** A. McAdams, A. Selle, R. Tamstorf, J. Teran and E. Sifakis, ** "Computing the Singular Value Decomposition of 3x3 matrices ** with minimal branching and elementary floating point operations", ** University of Wisconsin - Madison technical report TR1690, May 2011 ** ** OPTIMIZED CPU VERSION ** Implementation by: Eric Jang ** ** 13 Apr 2014 ** **************************************************************************/ #ifndef SVD3_H #define SVD3_H #define _gamma 5.828427124 // FOUR_GAMMA_SQUARED = sqrt(8)+3; #define _cstar 0.923879532 // cos(pi/8) #define _sstar 0.3826834323 // sin(p/8) #define EPSILON 1e-6 #include /* This is a novel and fast routine for the reciprocal square root of an IEEE float (single precision). http://www.lomont.org/Math/Papers/2003/InvSqrt.pdf http://playstation2-linux.com/download/p2lsd/fastrsqrt.pdf http://www.beyond3d.com/content/articles/8/ */ inline float rsqrt(float x) { // int ihalf = *(int *)&x - 0x00800000; // Alternative to next line, // float xhalf = *(float *)&ihalf; // for sufficiently large nos. float xhalf = 0.5f*x; int i = *(int *)&x; // View x as an int. // i = 0x5f3759df - (i >> 1); // Initial guess (traditional). i = 0x5f375a82 - (i >> 1); // Initial guess (slightly better). x = *(float *)&i; // View i as float. x = x*(1.5f - xhalf*x*x); // Newton step. // x = x*(1.5008908 - xhalf*x*x); // Newton step for a balanced error. return x; } /* This is rsqrt with an additional step of the Newton iteration, for increased accuracy. The constant 0x5f37599e makes the relative error range from 0 to -0.00000463. You can't balance the error by adjusting the constant. */ inline float rsqrt1(float x) { float xhalf = 0.5f*x; int i = *(int *)&x; // View x as an int. i = 0x5f37599e - (i >> 1); // Initial guess. x = *(float *)&i; // View i as float. x = x*(1.5f - xhalf*x*x); // Newton step. x = x*(1.5f - xhalf*x*x); // Newton step again. return x; } inline float accurateSqrt(float x) { return x * rsqrt1(x); } inline void condSwap(bool c, float &X, float &Y) { // used in step 2 float Z = X; X = c ? Y : X; Y = c ? Z : Y; } inline void condNegSwap(bool c, float &X, float &Y) { // used in step 2 and 3 float Z = -X; X = c ? Y : X; Y = c ? Z : Y; } // matrix multiplication M = A * B inline void multAB(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // float b11, float b12, float b13, float b21, float b22, float b23, float b31, float b32, float b33, // float &m11, float &m12, float &m13, float &m21, float &m22, float &m23, float &m31, float &m32, float &m33) { m11=a11*b11 + a12*b21 + a13*b31; m12=a11*b12 + a12*b22 + a13*b32; m13=a11*b13 + a12*b23 + a13*b33; m21=a21*b11 + a22*b21 + a23*b31; m22=a21*b12 + a22*b22 + a23*b32; m23=a21*b13 + a22*b23 + a23*b33; m31=a31*b11 + a32*b21 + a33*b31; m32=a31*b12 + a32*b22 + a33*b32; m33=a31*b13 + a32*b23 + a33*b33; } // matrix multiplication M = Transpose[A] * B inline void multAtB(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // float b11, float b12, float b13, float b21, float b22, float b23, float b31, float b32, float b33, // float &m11, float &m12, float &m13, float &m21, float &m22, float &m23, float &m31, float &m32, float &m33) { m11=a11*b11 + a21*b21 + a31*b31; m12=a11*b12 + a21*b22 + a31*b32; m13=a11*b13 + a21*b23 + a31*b33; m21=a12*b11 + a22*b21 + a32*b31; m22=a12*b12 + a22*b22 + a32*b32; m23=a12*b13 + a22*b23 + a32*b33; m31=a13*b11 + a23*b21 + a33*b31; m32=a13*b12 + a23*b22 + a33*b32; m33=a13*b13 + a23*b23 + a33*b33; } inline void quatToMat3(const float * qV, float &m11, float &m12, float &m13, float &m21, float &m22, float &m23, float &m31, float &m32, float &m33 ) { float w = qV[3]; float x = qV[0]; float y = qV[1]; float z = qV[2]; float qxx = x*x; float qyy = y*y; float qzz = z*z; float qxz = x*z; float qxy = x*y; float qyz = y*z; float qwx = w*x; float qwy = w*y; float qwz = w*z; m11=1 - 2*(qyy + qzz); m12=2*(qxy - qwz); m13=2*(qxz + qwy); m21=2*(qxy + qwz); m22=1 - 2*(qxx + qzz); m23=2*(qyz - qwx); m31=2*(qxz - qwy); m32=2*(qyz + qwx); m33=1 - 2*(qxx + qyy); } inline void approximateGivensQuaternion(float a11, float a12, float a22, float &ch, float &sh) { /* * Given givens angle computed by approximateGivensAngles, * compute the corresponding rotation quaternion. */ ch = 2*(a11-a22); sh = a12; bool b = _gamma*sh*sh < ch*ch; // fast rsqrt function suffices // rsqrt2 (https://code.google.com/p/lppython/source/browse/algorithm/HDcode/newCode/rsqrt.c?r=26) // is even faster but results in too much error float w = rsqrt(ch*ch+sh*sh); ch=b?w*ch:(float)_cstar; sh=b?w*sh:(float)_sstar; } inline void jacobiConjugation( const int x, const int y, const int z, float &s11, float &s21, float &s22, float &s31, float &s32, float &s33, float * qV) { float ch,sh; approximateGivensQuaternion(s11,s21,s22,ch,sh); float scale = ch*ch+sh*sh; float a = (ch*ch-sh*sh)/scale; float b = (2*sh*ch)/scale; // make temp copy of S float _s11 = s11; float _s21 = s21; float _s22 = s22; float _s31 = s31; float _s32 = s32; float _s33 = s33; // perform conjugation S = Q'*S*Q // Q already implicitly solved from a, b s11 =a*(a*_s11 + b*_s21) + b*(a*_s21 + b*_s22); s21 =a*(-b*_s11 + a*_s21) + b*(-b*_s21 + a*_s22); s22=-b*(-b*_s11 + a*_s21) + a*(-b*_s21 + a*_s22); s31 =a*_s31 + b*_s32; s32=-b*_s31 + a*_s32; s33=_s33; // update cumulative rotation qV float tmp[3]; tmp[0]=qV[0]*sh; tmp[1]=qV[1]*sh; tmp[2]=qV[2]*sh; sh *= qV[3]; qV[0] *= ch; qV[1] *= ch; qV[2] *= ch; qV[3] *= ch; // (x,y,z) corresponds to ((0,1,2),(1,2,0),(2,0,1)) // for (p,q) = ((0,1),(1,2),(0,2)) qV[z] += sh; qV[3] -= tmp[z]; // w qV[x] += tmp[y]; qV[y] -= tmp[x]; // re-arrange matrix for next iteration _s11 = s22; _s21 = s32; _s22 = s33; _s31 = s21; _s32 = s31; _s33 = s11; s11 = _s11; s21 = _s21; s22 = _s22; s31 = _s31; s32 = _s32; s33 = _s33; } inline float dist2(float x, float y, float z) { return x*x+y*y+z*z; } // finds transformation that diagonalizes a symmetric matrix inline void jacobiEigenanlysis( // symmetric matrix float &s11, float &s21, float &s22, float &s31, float &s32, float &s33, // quaternion representation of V float * qV) { qV[3]=1; qV[0]=0;qV[1]=0;qV[2]=0; // follow same indexing convention as GLM for (int i=0;i<4;i++) { // we wish to eliminate the maximum off-diagonal element // on every iteration, but cycling over all 3 possible rotations // in fixed order (p,q) = (1,2) , (2,3), (1,3) still retains // asymptotic convergence jacobiConjugation(0,1,2,s11,s21,s22,s31,s32,s33,qV); // p,q = 0,1 jacobiConjugation(1,2,0,s11,s21,s22,s31,s32,s33,qV); // p,q = 1,2 jacobiConjugation(2,0,1,s11,s21,s22,s31,s32,s33,qV); // p,q = 0,2 } } inline void sortSingularValues(// matrix that we want to decompose float &b11, float &b12, float &b13, float &b21, float &b22, float &b23, float &b31, float &b32, float &b33, // sort V simultaneously float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33) { float rho1 = dist2(b11,b21,b31); float rho2 = dist2(b12,b22,b32); float rho3 = dist2(b13,b23,b33); bool c; c = rho1 < rho2; condNegSwap(c,b11,b12); condNegSwap(c,v11,v12); condNegSwap(c,b21,b22); condNegSwap(c,v21,v22); condNegSwap(c,b31,b32); condNegSwap(c,v31,v32); condSwap(c,rho1,rho2); c = rho1 < rho3; condNegSwap(c,b11,b13); condNegSwap(c,v11,v13); condNegSwap(c,b21,b23); condNegSwap(c,v21,v23); condNegSwap(c,b31,b33); condNegSwap(c,v31,v33); condSwap(c,rho1,rho3); c = rho2 < rho3; condNegSwap(c,b12,b13); condNegSwap(c,v12,v13); condNegSwap(c,b22,b23); condNegSwap(c,v22,v23); condNegSwap(c,b32,b33); condNegSwap(c,v32,v33); } void QRGivensQuaternion(float a1, float a2, float &ch, float &sh) { // a1 = pivot point on diagonal // a2 = lower triangular entry we want to annihilate float epsilon = (float)EPSILON; float rho = accurateSqrt(a1*a1 + a2*a2); sh = rho > epsilon ? a2 : 0; ch = fabsf(a1) + fmaxf(rho,epsilon); bool b = a1 < 0; condSwap(b,sh,ch); float w = rsqrt(ch*ch+sh*sh); ch *= w; sh *= w; } inline void QRDecomposition(// matrix that we want to decompose float b11, float b12, float b13, float b21, float b22, float b23, float b31, float b32, float b33, // output Q float &q11, float &q12, float &q13, float &q21, float &q22, float &q23, float &q31, float &q32, float &q33, // output R float &r11, float &r12, float &r13, float &r21, float &r22, float &r23, float &r31, float &r32, float &r33) { float ch1,sh1,ch2,sh2,ch3,sh3; float a,b; // first givens rotation (ch,0,0,sh) QRGivensQuaternion(b11,b21,ch1,sh1); a=1-2*sh1*sh1; b=2*ch1*sh1; // apply B = Q' * B r11=a*b11+b*b21; r12=a*b12+b*b22; r13=a*b13+b*b23; r21=-b*b11+a*b21; r22=-b*b12+a*b22; r23=-b*b13+a*b23; r31=b31; r32=b32; r33=b33; // second givens rotation (ch,0,-sh,0) QRGivensQuaternion(r11,r31,ch2,sh2); a=1-2*sh2*sh2; b=2*ch2*sh2; // apply B = Q' * B; b11=a*r11+b*r31; b12=a*r12+b*r32; b13=a*r13+b*r33; b21=r21; b22=r22; b23=r23; b31=-b*r11+a*r31; b32=-b*r12+a*r32; b33=-b*r13+a*r33; // third givens rotation (ch,sh,0,0) QRGivensQuaternion(b22,b32,ch3,sh3); a=1-2*sh3*sh3; b=2*ch3*sh3; // R is now set to desired value r11=b11; r12=b12; r13=b13; r21=a*b21+b*b31; r22=a*b22+b*b32; r23=a*b23+b*b33; r31=-b*b21+a*b31; r32=-b*b22+a*b32; r33=-b*b23+a*b33; // construct the cumulative rotation Q=Q1 * Q2 * Q3 // the number of floating point operations for three quaternion multiplications // is more or less comparable to the explicit form of the joined matrix. // certainly more memory-efficient! float sh12=sh1*sh1; float sh22=sh2*sh2; float sh32=sh3*sh3; q11=(-1+2*sh12)*(-1+2*sh22); q12=4*ch2*ch3*(-1+2*sh12)*sh2*sh3+2*ch1*sh1*(-1+2*sh32); q13=4*ch1*ch3*sh1*sh3-2*ch2*(-1+2*sh12)*sh2*(-1+2*sh32); q21=2*ch1*sh1*(1-2*sh22); q22=-8*ch1*ch2*ch3*sh1*sh2*sh3+(-1+2*sh12)*(-1+2*sh32); q23=-2*ch3*sh3+4*sh1*(ch3*sh1*sh3+ch1*ch2*sh2*(-1+2*sh32)); q31=2*ch2*sh2; q32=2*ch3*(1-2*sh22)*sh3; q33=(-1+2*sh22)*(-1+2*sh32); } void svd(// input A float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // output U float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, // output S float &s11, float &s12, float &s13, float &s21, float &s22, float &s23, float &s31, float &s32, float &s33, // output V float &v11, float &v12, float &v13, float &v21, float &v22, float &v23, float &v31, float &v32, float &v33) { // normal equations matrix float ATA11, ATA12, ATA13; float ATA21, ATA22, ATA23; float ATA31, ATA32, ATA33; multAtB(a11,a12,a13,a21,a22,a23,a31,a32,a33, a11,a12,a13,a21,a22,a23,a31,a32,a33, ATA11,ATA12,ATA13,ATA21,ATA22,ATA23,ATA31,ATA32,ATA33); // symmetric eigenalysis float qV[4]; jacobiEigenanlysis( ATA11,ATA21,ATA22, ATA31,ATA32,ATA33,qV); quatToMat3(qV,v11,v12,v13,v21,v22,v23,v31,v32,v33); float b11, b12, b13; float b21, b22, b23; float b31, b32, b33; multAB(a11,a12,a13,a21,a22,a23,a31,a32,a33, v11,v12,v13,v21,v22,v23,v31,v32,v33, b11, b12, b13, b21, b22, b23, b31, b32, b33); // sort singular values and find V sortSingularValues(b11, b12, b13, b21, b22, b23, b31, b32, b33, v11,v12,v13,v21,v22,v23,v31,v32,v33); // QR decomposition QRDecomposition(b11, b12, b13, b21, b22, b23, b31, b32, b33, u11, u12, u13, u21, u22, u23, u31, u32, u33, s11, s12, s13, s21, s22, s23, s31, s32, s33 ); } /// polar decomposition can be reconstructed trivially from SVD result // A = UP void pd(float a11, float a12, float a13, float a21, float a22, float a23, float a31, float a32, float a33, // output U float &u11, float &u12, float &u13, float &u21, float &u22, float &u23, float &u31, float &u32, float &u33, // output P float &p11, float &p12, float &p13, float &p21, float &p22, float &p23, float &p31, float &p32, float &p33) { float w11, w12, w13, w21, w22, w23, w31, w32, w33; float s11, s12, s13, s21, s22, s23, s31, s32, s33; float v11, v12, v13, v21, v22, v23, v31, v32, v33; svd(a11, a12, a13, a21, a22, a23, a31, a32, a33, w11, w12, w13, w21, w22, w23, w31, w32, w33, s11, s12, s13, s21, s22, s23, s31, s32, s33, v11, v12, v13, v21, v22, v23, v31, v32, v33); // P = VSV' float t11, t12, t13, t21, t22, t23, t31, t32, t33; multAB(v11, v12, v13, v21, v22, v23, v31, v32, v33, s11, s12, s13, s21, s22, s23, s31, s32, s33, t11, t12, t13, t21, t22, t23, t31, t32, t33); multAB(t11, t12, t13, t21, t22, t23, t31, t32, t33, v11, v21, v31, v12, v22, v32, v13, v23, v33, p11, p12, p13, p21, p22, p23, p31, p32, p33); // U = WV' multAB(w11, w12, w13, w21, w22, w23, w31, w32, w33, v11, v21, v31, v12, v22, v32, v13, v23, v33, u11, u12, u13, u21, u22, u23, u31, u32, u33); } #endif