diff --git a/.clang-format b/.clang-format index fe3619d35..898668930 100644 --- a/.clang-format +++ b/.clang-format @@ -25,6 +25,7 @@ WhitespaceSensitiveMacros: DECOMP_FORCEACTIVE, DECOMP_FORCELITERAL, OS_ASSERT, + OS_DEBUG_ASSERT, OS_ERROR, EGG_ASSERT, EGG_ASSERT_MSG, diff --git a/config/RSPE01_01/splits.txt b/config/RSPE01_01/splits.txt index cf8095dcc..f3961755e 100644 --- a/config/RSPE01_01/splits.txt +++ b/config/RSPE01_01/splits.txt @@ -2028,10 +2028,16 @@ revolution/MTX/mtx.c: revolution/MTX/mtxvec.c: .text start:0x800E8260 end:0x800E82B4 +revolution/MTX/mtxstack.c: + .text start:0x800E82B4 end:0x800E82B4 + revolution/MTX/mtx44.c: .text start:0x800E82B4 end:0x800E84D4 .sdata2 start:0x804C11E0 end:0x804C11F8 +revolution/MTX/mtx44vec.c: + .text start:0x800E84D4 end:0x800E84D4 + revolution/MTX/vec.c: .text start:0x800E84D4 end:0x800E86F8 .sdata2 start:0x804C11F8 end:0x804C1208 @@ -2041,6 +2047,9 @@ revolution/MTX/quat.c: .rodata start:0x8037C4B0 end:0x8037C4C0 .sdata2 start:0x804C1208 end:0x804C1220 +revolution/MTX/psmtx.c: + .text start:0x800E8B10 end:0x800E8B10 + revolution/NAND/nand.c: .text start:0x800E8B10 end:0x800E9C00 .sdata start:0x804BD668 end:0x804BD670 diff --git a/config/RSPE01_01/symbols.txt b/config/RSPE01_01/symbols.txt index 6a6f969a1..2b6d39627 100644 --- a/config/RSPE01_01/symbols.txt +++ b/config/RSPE01_01/symbols.txt @@ -4401,7 +4401,7 @@ PSMTXInverse = .text:0x800E790C; // type:function size:0xF8 scope:global PSMTXInvXpose = .text:0x800E7A04; // type:function size:0xC8 scope:global PSMTXRotRad = .text:0x800E7ACC; // type:function size:0x7C scope:global PSMTXRotTrig = .text:0x800E7B48; // type:function size:0xB0 scope:global -__PSMTXRotAxisRadInternal = .text:0x800E7BF8; // type:function size:0xB0 scope:global +__PSMTXRotAxisRadInternal = .text:0x800E7BF8; // type:function size:0xB0 scope:local PSMTXRotAxisRad = .text:0x800E7CA8; // type:function size:0x7C scope:global PSMTXTrans = .text:0x800E7D24; // type:function size:0x34 scope:global 
PSMTXTransApply = .text:0x800E7D58; // type:function size:0x4C scope:global @@ -23284,7 +23284,7 @@ lbl_804C11C8 = .sdata2:0x804C11C8; // type:object size:0x4 align:4 data:float lbl_804C11CC = .sdata2:0x804C11CC; // type:object size:0x4 align:4 data:float lbl_804C11D0 = .sdata2:0x804C11D0; // type:object size:0x4 align:4 data:float lbl_804C11D4 = .sdata2:0x804C11D4; // type:object size:0x4 align:4 data:float -lbl_804C11D8 = .sdata2:0x804C11D8; // type:object size:0x8 align:4 data:float +lbl_804C11D8 = .sdata2:0x804C11D8; // type:object size:0x4 align:4 data:float @176 = .sdata2:0x804C11E0; // type:object size:0x4 scope:local align:4 data:float @177 = .sdata2:0x804C11E4; // type:object size:0x4 scope:local align:4 data:float @178 = .sdata2:0x804C11E8; // type:object size:0x4 scope:local align:4 data:float diff --git a/configure.py b/configure.py index 25ef57e83..a724b130d 100755 --- a/configure.py +++ b/configure.py @@ -944,11 +944,14 @@ def MatchingFor(*versions): Object(Matching, "revolution/MEM/mem_frameHeap.c"), Object(Matching, "revolution/MEM/mem_allocator.c"), Object(Matching, "revolution/MEM/mem_list.c"), - Object(NonMatching, "revolution/MTX/mtx.c"), + Object(Matching, "revolution/MTX/mtx.c"), Object(Matching, "revolution/MTX/mtxvec.c"), + Object(Matching, "revolution/MTX/mtxstack.c"), Object(Matching, "revolution/MTX/mtx44.c"), + Object(Matching, "revolution/MTX/mtx44vec.c"), Object(Matching, "revolution/MTX/vec.c"), Object(Matching, "revolution/MTX/quat.c"), + Object(Matching, "revolution/MTX/psmtx.c"), Object(Matching, "revolution/NAND/nand.c"), Object(Matching, "revolution/NAND/NANDOpenClose.c"), Object(Matching, "revolution/NAND/NANDCore.c"), diff --git a/include/revolution/MTX.h b/include/revolution/MTX.h index 23dc1585a..ba9cd1e2e 100644 --- a/include/revolution/MTX.h +++ b/include/revolution/MTX.h @@ -4,12 +4,9 @@ extern "C" { #endif +#include #include -#include -#include -#include -#include -#include +#include #ifdef __cplusplus } diff --git 
a/include/revolution/MTX/GeoTypes.h b/include/revolution/MTX/GeoTypes.h new file mode 100644 index 000000000..c99a7f279 --- /dev/null +++ b/include/revolution/MTX/GeoTypes.h @@ -0,0 +1,61 @@ +#ifndef RVL_SDK_MTX_GEO_TYPES_H +#define RVL_SDK_MTX_GEO_TYPES_H +#include +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + f32 x, y, z; +} Vec; +typedef Vec* VecPtr; +typedef const Vec* CVecPtr; +typedef Vec Point3d; +typedef Vec* Point3dPtr; +typedef const Vec* CPoint3dPtr; + +typedef struct { + s16 x, y, z; +} S16Vec; +typedef S16Vec* S16VecPtr; +typedef const S16Vec* CS16VecPtr; + +typedef struct { + f32 x, y; +} Vec2; +typedef Vec2* Vec2Ptr; +typedef const Vec2* CVec2Ptr; + +typedef struct { + f32 x, y, z, w; +} Quaternion; +typedef Quaternion* QuaternionPtr; +typedef const Quaternion* CQuaternionPtr; +typedef Quaternion Qtrn; +typedef Quaternion* QtrnPtr; +typedef const Quaternion* CQtrnPtr; + +typedef f32 Mtx[3][4]; +typedef f32 (*MtxPtr)[4]; +typedef const f32 (*CMtxPtr)[4]; + +typedef f32 ROMtx[4][3]; +typedef f32 (*ROMtxPtr)[3]; +typedef const f32 (*CROMtxPtr)[3]; + +typedef f32 Mtx44[4][4]; +typedef f32 (*Mtx44Ptr)[4]; +typedef const f32 (*CMtx44Ptr)[4]; + +typedef struct { + u32 numMtx; // at 0x0 + MtxPtr stackBase; // at 0x4 + MtxPtr stackPtr; // at 0x8 +} MtxStack; +typedef MtxStack* MtxStackPtr; +typedef const MtxStack* CMtxStackPtr; + +#ifdef __cplusplus +} +#endif +#endif diff --git a/include/revolution/MTX/internal/mtx44extAssert.h b/include/revolution/MTX/internal/mtx44extAssert.h new file mode 100644 index 000000000..f2c17ea5f --- /dev/null +++ b/include/revolution/MTX/internal/mtx44extAssert.h @@ -0,0 +1,17 @@ +#ifndef RVL_SDK_MTX44_EXT_ASSERT_H +#define RVL_SDK_MTX44_EXT_ASSERT_H + +#include + +/****************************************************************************** + * + * Mtx44 + * + ******************************************************************************/ + +#define Mtx_30 0x30 +#define Mtx_31 0x34 +#define Mtx_32 0x38 
+#define Mtx_33 0x3C + +#endif diff --git a/include/revolution/MTX/internal/mtxAssert.h b/include/revolution/MTX/internal/mtxAssert.h new file mode 100644 index 000000000..670239fa9 --- /dev/null +++ b/include/revolution/MTX/internal/mtxAssert.h @@ -0,0 +1,65 @@ +#ifndef RVL_SDK_MTX_ASSERT_H +#define RVL_SDK_MTX_ASSERT_H + +/****************************************************************************** + * + * Mtx + * + ******************************************************************************/ + +#define Mtx_00 0x0 +#define Mtx_01 0x4 +#define Mtx_02 0x8 +#define Mtx_03 0xC +#define Mtx_10 0x10 +#define Mtx_11 0x14 +#define Mtx_12 0x18 +#define Mtx_13 0x1C +#define Mtx_20 0x20 +#define Mtx_21 0x24 +#define Mtx_22 0x28 +#define Mtx_23 0x2C + +/****************************************************************************** + * + * ROMtx + * + ******************************************************************************/ + +#define ROMtx_00 0x0 +#define ROMtx_01 0x4 +#define ROMtx_02 0x8 +#define ROMtx_10 0xC +#define ROMtx_11 0x10 +#define ROMtx_12 0x14 +#define ROMtx_20 0x18 +#define ROMtx_21 0x1C +#define ROMtx_22 0x20 +#define ROMtx_30 0x24 +#define ROMtx_31 0x28 +#define ROMtx_32 0x2C + +/****************************************************************************** + * + * General use + * + ******************************************************************************/ + +#define DEG2RAD 0.017453292f + +/****************************************************************************** + * + * Probably not defined here + * + ******************************************************************************/ + +#define qr0 0 +#define qr1 1 +#define qr2 2 +#define qr3 3 +#define qr4 4 +#define qr5 5 +#define qr6 6 +#define qr7 7 + +#endif diff --git a/include/revolution/MTX/mtx.h b/include/revolution/MTX/mtx.h index dc040fb47..40ea30def 100644 --- a/include/revolution/MTX/mtx.h +++ b/include/revolution/MTX/mtx.h @@ -2,32 +2,218 @@ #define RVL_SDK_MTX_MTX_H #include 
-#include +#include #ifdef __cplusplus extern "C" { #endif -// TODO(kiwi) Create macros for PS/C_ functions -void PSMTXIdentity(Mtx); -void PSMTXCopy(const Mtx, Mtx); -void PSMTXConcat(const Mtx, const Mtx, Mtx); -void PSMTXConcatArray(const Mtx, const Mtx, Mtx, u32); -void PSMTXTranspose(const Mtx, Mtx); -u32 PSMTXInverse(const Mtx, Mtx); -u32 PSMTXInvXpose(const Mtx, Mtx); -void PSMTXRotRad(Mtx, char, f32); -void PSMTXRotTrig(Mtx, f32, f32, char); -void PSMTXRotAxisRad(Mtx, const Vec*, f32); -void PSMTXTrans(Mtx, f32, f32, f32); -void PSMTXTransApply(const Mtx, Mtx, f32, f32, f32); -void PSMTXScale(Mtx, f32, f32, f32); -void PSMTXScaleApply(const Mtx, Mtx, f32, f32, f32); -void PSMTXQuat(Mtx, const Quaternion*); - -void C_MTXLookAt(Mtx, const Vec*, const Vec*, const Vec*); -void C_MTXLightFrustum(Mtx, f32, f32, f32, f32, f32, f32, f32, f32, f32); -void C_MTXLightPerspective(Mtx, f32, f32, f32, f32, f32, f32); -void C_MTXLightOrtho(Mtx, f32, f32, f32, f32, f32, f32, f32, f32); +/****************************************************************************** + * + * mtx.c + * + ******************************************************************************/ + +void C_MTXIdentity(MtxPtr m); +void PSMTXIdentity(MtxPtr m); +void C_MTXCopy(CMtxPtr src, MtxPtr dst); +void PSMTXCopy(CMtxPtr src, MtxPtr dst); +void C_MTXConcat(CMtxPtr a, CMtxPtr b, MtxPtr ab); +void PSMTXConcat(CMtxPtr mA, CMtxPtr mB, MtxPtr mAB); +void C_MTXConcatArray(CMtxPtr a, CMtxPtr srcBase, MtxPtr dstBase, u32 count); +void PSMTXConcatArray(CMtxPtr a, CMtxPtr srcBase, MtxPtr dstBase, u32 count); +void C_MTXTranspose(CMtxPtr src, MtxPtr xPose); +void PSMTXTranspose(CMtxPtr src, MtxPtr xPose); +u32 C_MTXInverse(CMtxPtr src, MtxPtr inv); // DWARF reveals it's not BOOL +u32 PSMTXInverse(CMtxPtr src, MtxPtr inv); +u32 C_MTXInvXpose(CMtxPtr src, MtxPtr invX); // DWARF reveals it's not BOOL +u32 PSMTXInvXpose(CMtxPtr src, MtxPtr invX); +void C_MTXRotRad(MtxPtr m, char axis, f32 rad); +void 
PSMTXRotRad(MtxPtr m, char axis, f32 rad); +void C_MTXRotTrig(MtxPtr m, char axis, f32 sinA, f32 cosA); +void PSMTXRotTrig(MtxPtr m, char axis, f32 sinA, f32 cosA); +void C_MTXRotAxisRad(MtxPtr m, CVecPtr axis, f32 rad); +void PSMTXRotAxisRad(MtxPtr m, CVecPtr axis, f32 rad); +void C_MTXTrans(MtxPtr m, f32 xT, f32 yT, f32 zT); +void PSMTXTrans(MtxPtr m, f32 xT, f32 yT, f32 zT); +void C_MTXTransApply(CMtxPtr src, MtxPtr dst, f32 xT, f32 yT, f32 zT); +void PSMTXTransApply(CMtxPtr src, MtxPtr dst, f32 xT, f32 yT, f32 zT); +void C_MTXScale(MtxPtr m, f32 xS, f32 yS, f32 zS); +void PSMTXScale(MtxPtr m, f32 xS, f32 yS, f32 zS); +void C_MTXScaleApply(CMtxPtr src, MtxPtr dst, f32 xS, f32 yS, f32 zS); +void PSMTXScaleApply(CMtxPtr src, MtxPtr dst, f32 xS, f32 yS, f32 zS); +void C_MTXQuat(MtxPtr m, CQuaternionPtr q); +void PSMTXQuat(MtxPtr m, CQuaternionPtr q); +void C_MTXReflect(MtxPtr m, CVecPtr point, CVecPtr normal); +void PSMTXReflect(MtxPtr m, CVecPtr point, CVecPtr normal); +void C_MTXLookAt(MtxPtr m, CVecPtr camPos, CVecPtr camUp, CPoint3dPtr target); +void C_MTXLightFrustum(MtxPtr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS, + f32 scaleT, f32 transS, f32 transT); +void C_MTXLightPerspective(MtxPtr m, f32 fovY, f32 aspect, f32 scaleS, + f32 scaleT, f32 transS, f32 transT); +void C_MTXLightOrtho(MtxPtr m, f32 t, f32 b, f32 l, f32 r, f32 scaleS, + f32 scaleT, f32 transS, f32 transT); + +#define MTXIdentity PSMTXIdentity +#define MTXCopy PSMTXCopy +#define MTXConcat PSMTXConcat +#define MTXConcatArray PSMTXConcatArray +#define MTXTranspose PSMTXTranspose +#define MTXInverse PSMTXInverse +#define MTXInvXpose PSMTXInvXpose +#define MTXRotRad PSMTXRotRad +#define MTXRotTrig PSMTXRotTrig +#define MTXRotAxisRad PSMTXRotAxisRad +#define MTXTrans PSMTXTrans +#define MTXTransApply PSMTXTransApply +#define MTXScale PSMTXScale +#define MTXScaleApply PSMTXScaleApply +#define MTXQuat PSMTXQuat +#define MTXReflect PSMTXReflect +#define MTXLookAt C_MTXLookAt +#define 
MTXLightFrustum C_MTXLightFrustum +#define MTXLightPerspective C_MTXLightPerspective +#define MTXLightOrtho C_MTXLightOrtho + +/****************************************************************************** + * + * mtxvec.c + * + ******************************************************************************/ + +void C_MTXMultVec(CMtxPtr m, CVecPtr src, VecPtr dst); +void PSMTXMultVec(CMtxPtr m, CVecPtr src, VecPtr dst); +void C_MTXMultVecArray(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, u32 count); +void PSMTXMultVecArray(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, u32 count); +void C_MTXMultVecSR(CMtxPtr m, CVecPtr src, VecPtr dst); +void PSMTXMultVecSR(CMtxPtr m, CVecPtr src, VecPtr dst); +void C_MTXMultVecArraySR(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, u32 count); +void PSMTXMultVecArraySR(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, u32 count); + +#define MTXMultVec PSMTXMultVec +#define MTXMultVecArray PSMTXMultVecArray +#define MTXMultVecSR PSMTXMultVecSR +#define MTXMultVecArraySR PSMTXMultVecArraySR + +/****************************************************************************** + * + * mtxstack.c + * + ******************************************************************************/ + +void MTXInitStack(MtxStackPtr sPtr, u32 numMtx); +MtxPtr MTXPush(MtxStackPtr sPtr, CMtxPtr m); +MtxPtr MTXPushFwd(MtxStackPtr sPtr, CMtxPtr m); +MtxPtr MTXPushInv(MtxStackPtr sPtr, CMtxPtr m); +MtxPtr MTXPushInvXpose(MtxStackPtr sPtr, CMtxPtr m); +MtxPtr MTXPop(MtxStackPtr sPtr); +MtxPtr MTXGetStackPtr(MtxStackPtr sPtr); + +/****************************************************************************** + * + * vec.c + * + ******************************************************************************/ + +void C_VECAdd(CVecPtr a, CVecPtr b, VecPtr ab); +void PSVECAdd(CVecPtr vec1, CVecPtr vec2, VecPtr dst); +void C_VECSubtract(CVecPtr a, CVecPtr b, VecPtr a_b); +void PSVECSubtract(CVecPtr vec1, CVecPtr vec2, VecPtr dst); +void C_VECScale(CVecPtr src, VecPtr dst, f32 scale); 
+void PSVECScale(CVecPtr src, VecPtr dst, f32 mult); +void C_VECNormalize(CVecPtr src, VecPtr unit); +void PSVECNormalize(CVecPtr src, VecPtr unit); +f32 C_VECSquareMag(CVecPtr v); +f32 PSVECSquareMag(CVecPtr vec1); +f32 C_VECMag(CVecPtr v); +f32 PSVECMag(CVecPtr v); +f32 C_VECDotProduct(CVecPtr a, CVecPtr b); +f32 PSVECDotProduct(CVecPtr vec1, CVecPtr vec2); +void C_VECCrossProduct(CVecPtr a, CVecPtr b, VecPtr axb); +void PSVECCrossProduct(CVecPtr vec1, CVecPtr vec2, VecPtr dst); +void C_VECHalfAngle(CVecPtr a, CVecPtr b, VecPtr half); +void C_VECReflect(CVecPtr src, CVecPtr normal, VecPtr dst); +f32 C_VECSquareDistance(CVecPtr a, CVecPtr b); +f32 PSVECSquareDistance(CVecPtr a, CVecPtr b); +f32 C_VECDistance(CVecPtr a, CVecPtr b); +f32 PSVECDistance(CVecPtr a, CVecPtr b); + +#define VECAdd PSVECAdd +#define VECSubtract PSVECSubtract +#define VECScale PSVECScale +#define VECNormalize PSVECNormalize +#define VECSquareMag PSVECSquareMag +#define VECMag PSVECMag +#define VECDotProduct PSVECDotProduct +#define VECCrossProduct PSVECCrossProduct +#define VECHalfAngle C_VECHalfAngle +#define VECReflect C_VECReflect +#define VECSquareDistance PSVECSquareDistance +#define VECDistance PSVECDistance + +/****************************************************************************** + * + * quat.c + * + ******************************************************************************/ + +void C_QUATAdd(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void PSQUATAdd(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void C_QUATSubtract(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void PSQUATSubtract(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void C_QUATMultiply(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr pq); +void PSQUATMultiply(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr pq); +void C_QUATScale(CQuaternionPtr q, QuaternionPtr r, f32 scale); +void PSQUATScale(CQuaternionPtr q, QuaternionPtr r, f32 mult); +f32 
C_QUATDotProduct(CQuaternionPtr p, CQuaternionPtr q); +f32 PSQUATDotProduct(CQuaternionPtr p, CQuaternionPtr q); +void C_QUATNormalize(CQuaternionPtr src, QuaternionPtr unit); +void PSQUATNormalize(CQuaternionPtr src, QuaternionPtr unit); +void C_QUATInverse(CQuaternionPtr src, QuaternionPtr inv); +void PSQUATInverse(CQuaternionPtr src, QuaternionPtr inv); +void C_QUATDivide(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void PSQUATDivide(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r); +void C_QUATExp(CQuaternionPtr q, QuaternionPtr r); +void C_QUATLogN(CQuaternionPtr q, QuaternionPtr r); +void C_QUATMakeClosest(CQuaternionPtr q, CQuaternionPtr qto, QuaternionPtr r); +void C_QUATRotAxisRad(QuaternionPtr r, CVecPtr axis, f32 rad); +void C_QUATMtx(QuaternionPtr r, CMtxPtr m); +void C_QUATLerp(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r, f32 t); +void C_QUATSlerp(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r, f32 t); +void C_QUATSquad(CQuaternionPtr p, CQuaternionPtr a, CQuaternionPtr b, + CQuaternionPtr q, QuaternionPtr r, f32 t); +void C_QUATCompA(CQuaternionPtr qprev, CQuaternionPtr q, CQuaternionPtr qnext, + QuaternionPtr a); + +#define QUATAdd PSQUATAdd +#define QUATSubtract PSQUATSubtract +#define QUATMultiply PSQUATMultiply +#define QUATScale PSQUATScale +#define QUATDotProduct PSQUATDotProduct +#define QUATNormalize PSQUATNormalize +#define QUATInverse PSQUATInverse +#define QUATDivide PSQUATDivide +#define QUATExp C_QUATExp +#define QUATLogN C_QUATLogN +#define QUATMakeClosest C_QUATMakeClosest +#define QUATRotAxisRad C_QUATRotAxisRad +#define QUATMtx C_QUATMtx +#define QUATLerp C_QUATLerp +#define QUATSlerp C_QUATSlerp +#define QUATSquad C_QUATSquad +#define QUATCompA C_QUATCompA + +/****************************************************************************** + * + * psmtx.c + * + ******************************************************************************/ + +void PSMTXReorder(CMtxPtr src, ROMtxPtr dst); +void 
PSMTXROMultVecArray(CROMtxPtr m, CVecPtr srcBase, VecPtr dstBase, + u32 count); + +#define MTXReorder PSMTXReorder +#define MTXROMultVecArray PSMTXROMultVecArray #ifdef __cplusplus } diff --git a/include/revolution/MTX/mtx44.h b/include/revolution/MTX/mtx44.h deleted file mode 100644 index e51067d04..000000000 --- a/include/revolution/MTX/mtx44.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef RVL_SDK_MTX_MTX44_H -#define RVL_SDK_MTX_MTX44_H -#include - -#include -#ifdef __cplusplus -extern "C" { -#endif - -void C_MTXFrustum(Mtx44 mtx, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); -void C_MTXPerspective(Mtx44 mtx, f32 fovy, f32 aspect, f32 n, f32 f); -void C_MTXOrtho(Mtx44 mtx, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/include/revolution/MTX/mtx44ext.h b/include/revolution/MTX/mtx44ext.h new file mode 100644 index 000000000..77da45d73 --- /dev/null +++ b/include/revolution/MTX/mtx44ext.h @@ -0,0 +1,86 @@ +#ifndef RVL_SDK_MTX_MTX44_EXT_H +#define RVL_SDK_MTX_MTX44_EXT_H +#include + +#include +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * + * mtx44.c + * + ******************************************************************************/ + +void C_MTXFrustum(Mtx44Ptr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); +void C_MTXPerspective(Mtx44Ptr m, f32 fovY, f32 aspect, f32 n, f32 f); +void C_MTXOrtho(Mtx44Ptr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f); +void C_MTX44Identity(Mtx44Ptr m); +void PSMTX44Identity(Mtx44Ptr m); +void C_MTX44Copy(CMtx44Ptr src, Mtx44Ptr dst); +void PSMTX44Copy(CMtx44Ptr src, Mtx44Ptr dst); +void C_MTX44Concat(CMtx44Ptr a, CMtx44Ptr b, Mtx44Ptr ab); +void PSMTX44Concat(CMtx44Ptr a, CMtx44Ptr b, Mtx44Ptr ab); +void C_MTX44Transpose(CMtx44Ptr src, Mtx44Ptr xPose); +void PSMTX44Transpose(CMtx44Ptr src, Mtx44Ptr xPose); +u32 C_MTX44Inverse(CMtx44Ptr src, Mtx44Ptr inv); +void C_MTX44Trans(Mtx44Ptr m, f32 xT, f32 yT, f32 zT); 
+void PSMTX44Trans(Mtx44Ptr m, f32 xT, f32 yT, f32 zT); +void C_MTX44TransApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xT, f32 yT, f32 zT); +void PSMTX44TransApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xT, f32 yT, f32 zT); +void C_MTX44Scale(Mtx44Ptr m, f32 xS, f32 yS, f32 zS); +void PSMTX44Scale(Mtx44Ptr m, f32 xS, f32 yS, f32 zS); +void C_MTX44ScaleApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xS, f32 yS, f32 zS); +void PSMTX44ScaleApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xS, f32 yS, f32 zS); +void C_MTX44RotRad(Mtx44Ptr m, char axis, f32 rad); +void PSMTX44RotRad(Mtx44Ptr m, char axis, f32 rad); +void C_MTX44RotTrig(Mtx44Ptr m, char axis, f32 sinA, f32 cosA); +void PSMTX44RotTrig(Mtx44Ptr m, char axis, f32 sinA, f32 cosA); +void C_MTX44RotAxisRad(Mtx44Ptr m, CVecPtr axis, f32 rad); +void PSMTX44RotAxisRad(Mtx44Ptr m, CVecPtr axis, f32 rad); + +#define MTXFrustum C_MTXFrustum +#define MTXPerspective C_MTXPerspective +#define MTXOrtho C_MTXOrtho +#define MTX44Identity PSMTX44Identity +#define MTX44Copy PSMTX44Copy +#define MTX44Concat PSMTX44Concat +#define MTX44Transpose PSMTX44Transpose +#define MTX44Inverse C_MTX44Inverse +#define MTX44Trans PSMTX44Trans +#define MTX44TransApply PSMTX44TransApply +#define MTX44Scale PSMTX44Scale +#define MTX44ScaleApply PSMTX44ScaleApply +#define MTX44RotRad PSMTX44RotRad +#define MTX44RotTrig PSMTX44RotTrig +#define MTX44RotAxisRad PSMTX44RotAxisRad + +/****************************************************************************** + * + * mtx44vec.c + * + ******************************************************************************/ + +void C_MTX44MultVec(CMtx44Ptr m, CVecPtr src, VecPtr dst); +void PSMTX44MultVec(CMtx44Ptr m, CVecPtr src, VecPtr dst); +void C_MTX44MultVecArray(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count); +void PSMTX44MultVecArray(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count); +void C_MTX44MultVecSR(CMtx44Ptr m, CVecPtr src, VecPtr dst); +void PSMTX44MultVecSR(CMtx44Ptr m, CVecPtr src, VecPtr dst); +void 
C_MTX44MultVecArraySR(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count); +void PSMTX44MultVecArraySR(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count); + +#define MTX44MultVec PSMTX44MultVec +#define MTX44MultVecArray PSMTX44MultVecArray +#define MTX44MultVecSR PSMTX44MultVecSR +#define MTX44MultVecArraySR PSMTX44MultVecArraySR + +#ifdef __cplusplus +} +#endif +#endif diff --git a/include/revolution/MTX/mtxtypes.h b/include/revolution/MTX/mtxtypes.h deleted file mode 100644 index 6188ca44b..000000000 --- a/include/revolution/MTX/mtxtypes.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef RVL_SDK_MTX_TYPES_H -#define RVL_SDK_MTX_TYPES_H -#include -#ifdef __cplusplus -extern "C" { -#endif - -typedef f32 Mtx[3][4]; -typedef f32 Mtx44[4][4]; - -typedef struct Vec { - f32 x, y, z; -} Vec; - -typedef struct Vec2 { - f32 x, y; -} Vec2; - -typedef struct Quaternion { - f32 x, y, z, w; -} Quaternion; - -#ifdef __cplusplus -} -#endif -#endif diff --git a/include/revolution/MTX/mtxvec.h b/include/revolution/MTX/mtxvec.h deleted file mode 100644 index 8bd063454..000000000 --- a/include/revolution/MTX/mtxvec.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef RVL_SDK_MTX_MTXVEC_H -#define RVL_SDK_MTX_MTXVEC_H -#include - -#include -#ifdef __cplusplus -extern "C" { -#endif - -void PSMTXMultVec(const Mtx mtx, const Vec* vec, Vec* out); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/include/revolution/MTX/quat.h b/include/revolution/MTX/quat.h deleted file mode 100644 index 732e8b4eb..000000000 --- a/include/revolution/MTX/quat.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef RVL_SDK_MTX_QUAT_H -#define RVL_SDK_MTX_QUAT_H -#include - -#include -#ifdef __cplusplus -extern "C" { -#endif - -void PSQUATMultiply(const Quaternion* a, const Quaternion* b, Quaternion* prod); -void PSQUATNormalize(const Quaternion* in, Quaternion* out); -void C_QUATMtx(Quaternion* quat, const Mtx mtx); -void C_QUATSlerp(const Quaternion* a, const Quaternion* b, Quaternion* out, - f32 t); - -#ifdef 
__cplusplus -} -#endif -#endif diff --git a/include/revolution/MTX/vec.h b/include/revolution/MTX/vec.h deleted file mode 100644 index 38642c77f..000000000 --- a/include/revolution/MTX/vec.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef RVL_SDK_MTX_VEC_H -#define RVL_SDK_MTX_VEC_H -#include - -#include -#ifdef __cplusplus -extern "C" { -#endif - -void PSVECAdd(const Vec* a, const Vec* b, Vec* sum); -void PSVECScale(const Vec* in, Vec* out, f32 scale); -void PSVECNormalize(const Vec* in, Vec* out); -f32 PSVECMag(const Vec* v); -f32 PSVECDotProduct(const Vec* a, const Vec* b); -void PSVECCrossProduct(const Vec* a, const Vec* b, Vec* prod); -f32 PSVECSquareDistance(const Vec* a, const Vec* b); -void C_VECHalfAngle(const Vec* a, const Vec* b, Vec* half); - -#ifdef __cplusplus -} -#endif -#endif diff --git a/include/revolution/OS/OSError.h b/include/revolution/OS/OSError.h index e566a7b5f..3d4dba356 100644 --- a/include/revolution/OS/OSError.h +++ b/include/revolution/OS/OSError.h @@ -14,6 +14,14 @@ typedef struct OSContext OSContext; if (!(exp)) \ OSPanic(__FILE__, __LINE__, __VA_ARGS__) +#ifndef NDEBUG +#define OS_DEBUG_ASSERT(exp, ...) \ + if (!(exp)) \ + OSPanic(__FILE__, __LINE__, __VA_ARGS__) +#else +#define OS_DEBUG_ASSERT(...) 
+#endif + typedef enum { OS_ERR_SYSTEM_RESET, OS_ERR_MACHINE_CHECK, diff --git a/src/revolution/MTX/mtx.c b/src/revolution/MTX/mtx.c new file mode 100644 index 000000000..d20829bbb --- /dev/null +++ b/src/revolution/MTX/mtx.c @@ -0,0 +1,1299 @@ +#include +#include +#include + +#include + +static f32 Unit01[] = {0.0f, 1.0f}; + +void C_MTXIdentity(MtxPtr m) { + OS_DEBUG_ASSERT(m, "MtxIdentity(): NULL Mtx 'm' "); + + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = 0.0f; +} + +void PSMTXIdentity(register MtxPtr m) { + register f32 c_zero = 0.0f; + register f32 c_one = 1.0f; + register f32 c_01; + register f32 c_10; + + ASM ( + psq_st c_zero, Mtx_02(m), 0, qr0; + ps_merge01 c_01, c_zero, c_one; + psq_st c_zero, Mtx_12(m), 0, qr0; + ps_merge10 c_10, c_one, c_zero; + psq_st c_zero, Mtx_20(m), 0, qr0; + + psq_st c_01, Mtx_10(m), 0, qr0; + psq_st c_10, Mtx_00(m), 0, qr0; + psq_st c_10, Mtx_22(m), 0, qr0; + ) +} + +void C_MTXCopy(CMtxPtr src, MtxPtr dst) { + OS_DEBUG_ASSERT(src, "MTXCopy(): NULL MtxPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTXCopy(): NULL MtxPtr 'dst' "); + + if (src == dst) { + return; + } + + dst[0][0] = src[0][0]; + dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[0][3] = src[0][3]; + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[1][3] = src[1][3]; + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + dst[2][3] = src[2][3]; +} + +asm void PSMTXCopy(register CMtxPtr src, register MtxPtr dst) { + nofralloc; + + psq_l f0, Mtx_00(src), 0, qr0; + psq_st f0, Mtx_00(dst), 0, qr0; + psq_l f1, Mtx_02(src), 0, qr0; + psq_st f1, Mtx_02(dst), 0, qr0; + psq_l f2, Mtx_10(src), 0, qr0; + psq_st f2, Mtx_10(dst), 0, qr0; + psq_l f3, Mtx_12(src), 0, qr0; + psq_st f3, Mtx_12(dst), 0, qr0; + psq_l f4, Mtx_20(src), 0, qr0; + psq_st f4, Mtx_20(dst), 0, qr0; + 
psq_l f5, Mtx_22(src), 0, qr0; + psq_st f5, Mtx_22(dst), 0, qr0; + + blr; +} + +void C_MTXConcat(CMtxPtr a, CMtxPtr b, MtxPtr ab) { + Mtx mTmp; + MtxPtr m; + + OS_DEBUG_ASSERT(a, "MTXConcat(): NULL MtxPtr 'a' "); + OS_DEBUG_ASSERT(b, "MTXConcat(): NULL MtxPtr 'b' "); + OS_DEBUG_ASSERT(ab, "MTXConcat(): NULL MtxPtr 'ab' "); + + if (ab == a || ab == b) { + m = mTmp; + } else { + m = ab; + } + + m[0][0] = a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0]; + m[0][1] = a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1]; + m[0][2] = a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2]; + m[0][3] = + a[0][0] * b[0][3] + a[0][1] * b[1][3] + a[0][2] * b[2][3] + a[0][3]; + m[1][0] = a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0]; + m[1][1] = a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1]; + m[1][2] = a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2]; + m[1][3] = + a[1][0] * b[0][3] + a[1][1] * b[1][3] + a[1][2] * b[2][3] + a[1][3]; + m[2][0] = a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0]; + m[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1]; + m[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2]; + m[2][3] = + a[2][0] * b[0][3] + a[2][1] * b[1][3] + a[2][2] * b[2][3] + a[2][3]; + + if (m == mTmp) { + C_MTXCopy(mTmp, ab); + } +} + +asm void PSMTXConcat(register CMtxPtr mA, register CMtxPtr mB, + register MtxPtr mAB) { + nofralloc; + + stwu sp, -0x40(sp); + psq_l f0, Mtx_00(mA), 0, qr0; + stfd f14, 0x8(sp); + psq_l f6, Mtx_00(mB), 0, qr0; + + // It wants to format "Unit01@ha" with whitespace + // clang-format off + lis r6, Unit01@ha; + // clang-format on + + psq_l f7, Mtx_02(mB), 0, qr0; + stfd f15, 0x10(sp); + + // It wants to do the same with "Unit01@l" + // clang-format off + addi r6, r6, Unit01@l; + // clang-format on + + stfd f31, 0x28(sp); + psq_l f8, Mtx_10(mB), 0, qr0; + ps_muls0 f12, f6, f0; + psq_l f2, Mtx_10(mA), 0, qr0; + ps_muls0 f13, f7, f0; + psq_l f31, 0x0(r6), 0, 
qr0; + ps_muls0 f14, f6, f2; + psq_l f9, Mtx_12(mB), 0, qr0; + ps_muls0 f15, f7, f2; + psq_l f1, Mtx_02(mA), 0, qr0; + ps_madds1 f12, f8, f0, f12; + psq_l f3, Mtx_12(mA), 0, qr0; + ps_madds1 f14, f8, f2, f14; + psq_l f10, Mtx_20(mB), 0, qr0; + ps_madds1 f13, f9, f0, f13; + psq_l f11, Mtx_22(mB), 0, qr0; + ps_madds1 f15, f9, f2, f15; + psq_l f4, Mtx_20(mA), 0, qr0; + psq_l f5, Mtx_22(mA), 0, qr0; + ps_madds0 f12, f10, f1, f12; + ps_madds0 f13, f11, f1, f13; + ps_madds0 f14, f10, f3, f14; + ps_madds0 f15, f11, f3, f15; + psq_st f12, Mtx_00(mAB), 0, qr0; + ps_muls0 f2, f6, f4; + ps_madds1 f13, f31, f1, f13; + ps_muls0 f0, f7, f4; + psq_st f14, Mtx_10(mAB), 0, qr0; + ps_madds1 f15, f31, f3, f15; + psq_st f13, Mtx_02(mAB), 0, qr0; + ps_madds1 f2, f8, f4, f2; + ps_madds1 f0, f9, f4, f0; + ps_madds0 f2, f10, f5, f2; + lfd f14, 0x8(sp); + psq_st f15, Mtx_12(mAB), 0, qr0; + ps_madds0 f0, f11, f5, f0; + psq_st f2, Mtx_20(mAB), 0, qr0; + ps_madds1 f0, f31, f5, f0; + lfd f15, 0x10(sp); + psq_st f0, Mtx_22(mAB), 0, qr0; + lfd f31, 0x28(sp); + addi sp, sp, 0x40; + + blr; +} + +void C_MTXConcatArray(CMtxPtr a, CMtxPtr srcBase, MtxPtr dstBase, u32 count) { + u32 i; + + OS_DEBUG_ASSERT(a, "MTXConcatArray(): NULL MtxPtr 'a' "); + OS_DEBUG_ASSERT(srcBase, "MTXConcatArray(): NULL MtxPtr 'srcBase' "); + OS_DEBUG_ASSERT(dstBase, "MTXConcatArray(): NULL MtxPtr 'dstBase' "); + OS_DEBUG_ASSERT(count > 1, "MTXConcatArray(): count must be greater than 1."); + + for (i = 0; i < count; ++i) { + C_MTXConcat(a, srcBase, dstBase); + srcBase += 3; + dstBase += 3; + } +} + +// NOTE (vabold): This appears to be required, but I don't know why +#ifndef NDEBUG +#pragma optimization_level 1 +#endif + +void PSMTXConcatArray(register CMtxPtr a, register CMtxPtr srcBase, + register MtxPtr dstBase, register u32 count) { + register f32 a00; + register f32 a02; + register f32 a10; + register f32 a12; + register f32 a20; + register f32 a22; + register f32 s00; + register f32 s02; + register f32 s10; + register 
f32 s12; + register f32 s20; + register f32 s22; + register f32 d00; + register f32 d02; + register f32 d10; + register f32 d12; + register f32 d20; + register f32 d22; + register f32 unit; + register f32* pUnit = Unit01; + + ASM ( + psq_l a00, Mtx_00(a), 0, qr0; + psq_l a02, Mtx_02(a), 0, qr0; + psq_l a10, Mtx_10(a), 0, qr0; + psq_l a12, Mtx_12(a), 0, qr0; + subi count, count, 0x1; + psq_l a20, Mtx_20(a), 0, qr0; + psq_l a22, Mtx_22(a), 0, qr0; + mtctr count; + psq_l unit, 0x0(pUnit), 0, qr0; + psq_l s00, Mtx_00(srcBase), 0, qr0; + psq_l s10, Mtx_10(srcBase), 0, qr0; + ps_muls0 d00, s00, a00; + ps_muls0 d10, s00, a10; + ps_muls0 d20, s00, a20; + psq_l s20, Mtx_20(srcBase), 0, qr0; + ps_madds1 d00, s10, a00, d00; + ps_madds1 d10, s10, a10, d10; + ps_madds1 d20, s10, a20, d20; + psq_l s02, Mtx_02(srcBase), 0, qr0; + ps_madds0 d00, s20, a02, d00; + ps_madds0 d10, s20, a12, d10; + ps_madds0 d20, s20, a22, d20; + psq_l s12, Mtx_12(srcBase), 0, qr0; + psq_st d00, Mtx_00(dstBase), 0, qr0; + ps_muls0 d02, s02, a00; + ps_muls0 d12, s02, a10; + ps_muls0 d22, s02, a20; + psq_l s22, Mtx_22(srcBase), 0, qr0; + psq_st d10, Mtx_10(dstBase), 0, qr0; + ps_madds1 d02, s12, a00, d02; + ps_madds1 d12, s12, a10, d12; + ps_madds1 d22, s12, a20, d22; + loop: + addi srcBase, srcBase, sizeof(Mtx); + ps_madds0 d02, s22, a02, d02; + ps_madds0 d12, s22, a12, d12; + ps_madds0 d22, s22, a22, d22; + psq_l s00, Mtx_00(srcBase), 0, qr0; + psq_st d20, Mtx_20(dstBase), 0, qr0; + ps_madd d02, unit, a02, d02; + ps_madd d12, unit, a12, d12; + ps_madd d22, unit, a22, d22; + psq_l s10, Mtx_10(srcBase), 0, qr0; + psq_st d02, Mtx_02(dstBase), 0, qr0; + ps_muls0 d00, s00, a00; + ps_muls0 d10, s00, a10; + ps_muls0 d20, s00, a20; + psq_l s20, Mtx_20(srcBase), 0, qr0; + psq_st d12, Mtx_12(dstBase), 0, qr0; + ps_madds1 d00, s10, a00, d00; + ps_madds1 d10, s10, a10, d10; + ps_madds1 d20, s10, a20, d20; + psq_l s02, Mtx_02(srcBase), 0, qr0; + psq_st d22, Mtx_22(dstBase), 0, qr0; + addi dstBase, dstBase, 
sizeof(Mtx); + ps_madds0 d00, s20, a02, d00; + ps_madds0 d10, s20, a12, d10; + ps_madds0 d20, s20, a22, d20; + psq_l s12, Mtx_12(srcBase), 0, qr0; + psq_st d00, Mtx_00(dstBase), 0, qr0; + ps_muls0 d02, s02, a00; + ps_muls0 d12, s02, a10; + ps_muls0 d22, s02, a20; + psq_l s22, Mtx_22(srcBase), 0, qr0; + psq_st d10, Mtx_10(dstBase), 0, qr0; + ps_madds1 d02, s12, a00, d02; + ps_madds1 d12, s12, a10, d12; + ps_madds1 d22, s12, a20, d22; + bdnz loop; + psq_st d20, Mtx_20(dstBase), 0, qr0; + ps_madds0 d02, s22, a02, d02; + ps_madds0 d12, s22, a12, d12; + ps_madds0 d22, s22, a22, d22; + ps_madd d02, unit, a02, d02; + ps_madd d12, unit, a12, d12; + ps_madd d22, unit, a22, d22; + psq_st d02, Mtx_02(dstBase), 0, qr0; + psq_st d12, Mtx_12(dstBase), 0, qr0; + psq_st d22, Mtx_22(dstBase), 0, qr0; + ) +} + +#ifndef NDEBUG +#pragma optimization_level 0 +#endif + +void C_MTXTranspose(CMtxPtr src, MtxPtr xPose) { + Mtx mTmp; + MtxPtr m; + + OS_DEBUG_ASSERT(src, "MTXTranspose(): NULL MtxPtr 'src' "); + OS_DEBUG_ASSERT(xPose, "MTXTranspose(): NULL MtxPtr 'xPose' "); + + if (src == xPose) { + m = mTmp; + } else { + m = xPose; + } + + m[0][0] = src[0][0]; + m[0][1] = src[1][0]; + m[0][2] = src[2][0]; + m[0][3] = 0.0f; + m[1][0] = src[0][1]; + m[1][1] = src[1][1]; + m[1][2] = src[2][1]; + m[1][3] = 0.0f; + m[2][0] = src[0][2]; + m[2][1] = src[1][2]; + m[2][2] = src[2][2]; + m[2][3] = 0.0f; + + if (m == mTmp) { + C_MTXCopy(mTmp, xPose); + } +} + +void PSMTXTranspose(register CMtxPtr src, register MtxPtr xPose) { + register f32 c_zero = 0.0f; + register f32 row0a; + register f32 row1a; + register f32 row0b; + register f32 row1b; + register f32 trns0; + register f32 trns1; + register f32 trns2; + + ASM ( + psq_l row0a, Mtx_00(src), 0, qr0; + stfs c_zero, Mtx_23(xPose); + psq_l row1a, Mtx_10(src), 0, qr0; + ps_merge00 trns0, row0a, row1a; + psq_l row0b, Mtx_02(src), 1, qr0; + ps_merge11 trns1, row0a, row1a; + psq_l row1b, Mtx_12(src), 1, qr0; + psq_st trns0, Mtx_00(xPose), 0, qr0; + psq_l 
row0a, Mtx_20(src), 0, qr0; + ps_merge00 trns2, row0b, row1b; + psq_st trns1, Mtx_10(xPose), 0, qr0; + ps_merge00 trns0, row0a, c_zero; + psq_st trns2, Mtx_20(xPose), 0, qr0; + ps_merge10 trns1, row0a, c_zero; + psq_st trns0, Mtx_02(xPose), 0, qr0; + lfs row0b, Mtx_22(src); + psq_st trns1, Mtx_12(xPose), 0, qr0; + stfs row0b, Mtx_22(xPose); + ) +} + +u32 C_MTXInverse(CMtxPtr src, MtxPtr inv) { + Mtx mTmp; + MtxPtr m; + f32 det; + + if (src == inv) { + m = mTmp; + } else { + m = inv; + } + + det = + src[0][0] * src[1][1] * src[2][2] + src[0][1] * src[1][2] * src[2][0] + + src[0][2] * src[1][0] * src[2][1] - src[2][0] * src[1][1] * src[0][2] - + src[1][0] * src[0][1] * src[2][2] - src[0][0] * src[2][1] * src[1][2]; + + if (det == 0.0f) { + return FALSE; + } + + det = 1.0f / det; + + m[0][0] = (src[1][1] * src[2][2] - src[2][1] * src[1][2]) * det; + m[0][1] = -(src[0][1] * src[2][2] - src[2][1] * src[0][2]) * det; + m[0][2] = (src[0][1] * src[1][2] - src[1][1] * src[0][2]) * det; + m[1][0] = -(src[1][0] * src[2][2] - src[2][0] * src[1][2]) * det; + m[1][1] = (src[0][0] * src[2][2] - src[2][0] * src[0][2]) * det; + m[1][2] = -(src[0][0] * src[1][2] - src[1][0] * src[0][2]) * det; + m[2][0] = (src[1][0] * src[2][1] - src[2][0] * src[1][1]) * det; + m[2][1] = -(src[0][0] * src[2][1] - src[2][0] * src[0][1]) * det; + m[2][2] = (src[0][0] * src[1][1] - src[1][0] * src[0][1]) * det; + + m[0][3] = -m[0][0] * src[0][3] - m[0][1] * src[1][3] - m[0][2] * src[2][3]; + m[1][3] = -m[1][0] * src[0][3] - m[1][1] * src[1][3] - m[1][2] * src[2][3]; + m[2][3] = -m[2][0] * src[0][3] - m[2][1] * src[1][3] - m[2][2] * src[2][3]; + + if (m == mTmp) { + C_MTXCopy(mTmp, inv); + } + + return TRUE; +} + +asm u32 PSMTXInverse(register CMtxPtr src, register MtxPtr inv) { + nofralloc; + + psq_l f0, Mtx_00(src), 1, qr0; + psq_l f1, Mtx_01(src), 0, qr0; + psq_l f2, Mtx_10(src), 1, qr0; + ps_merge10 f6, f1, f0; + psq_l f3, Mtx_11(src), 0, qr0; + psq_l f4, Mtx_20(src), 1, qr0; + ps_merge10 f7, f3, 
f2; + psq_l f5, Mtx_21(src), 0, qr0; + ps_mul f11, f3, f6; + ps_mul f13, f5, f7; + ps_merge10 f8, f5, f4; + ps_msub f11, f1, f7, f11; + ps_mul f12, f1, f8; + ps_msub f13, f3, f8, f13; + ps_mul f10, f3, f4; + ps_msub f12, f5, f6, f12; + ps_mul f9, f0, f5; + ps_mul f8, f1, f2; + ps_sub f6, f6, f6; + ps_msub f10, f2, f5, f10; + ps_mul f7, f0, f13; + ps_msub f9, f1, f4, f9; + ps_madd f7, f2, f12, f7; + ps_msub f8, f0, f3, f8; + ps_madd f7, f4, f11, f7; + ps_cmpo0 cr0, f7, f6; + bne nonsingular; + + li r3, 0x0; + blr; + +nonsingular: + fres f0, f7; + ps_add f6, f0, f0; + ps_mul f5, f0, f0; + ps_nmsub f0, f7, f5, f6; + lfs f1, Mtx_03(src); + ps_muls0 f13, f13, f0; + lfs f2, Mtx_13(src); + ps_muls0 f12, f12, f0; + lfs f3, Mtx_23(src); + ps_muls0 f11, f11, f0; + ps_merge00 f5, f13, f12; + ps_muls0 f10, f10, f0; + ps_merge11 f4, f13, f12; + ps_muls0 f9, f9, f0; + psq_st f5, Mtx_00(inv), 0, qr0; + ps_mul f6, f13, f1; + psq_st f4, Mtx_10(inv), 0, qr0; + ps_muls0 f8, f8, f0; + ps_madd f6, f12, f2, f6; + psq_st f10, Mtx_20(inv), 1, qr0; + ps_nmadd f6, f11, f3, f6; + psq_st f9, Mtx_21(inv), 1, qr0; + ps_mul f7, f10, f1; + ps_merge00 f5, f11, f6; + psq_st f8, Mtx_22(inv), 1, qr0; + ps_merge11 f4, f11, f6; + psq_st f5, Mtx_02(inv), 0, qr0; + ps_madd f7, f9, f2, f7; + psq_st f4, Mtx_12(inv), 0, qr0; + ps_nmadd f7, f8, f3, f7; + li r3, 0x1; + psq_st f7, Mtx_23(inv), 1, qr0; + blr; +} + +/* + * Writes the inverse-transpose of the upper 3x3 of src into invX and zeroes + * the fourth column of the result. Returns FALSE before writing anything + * when the determinant is exactly 0.0f, TRUE otherwise. src and invX may be + * the same matrix; the result is then built in mTmp and copied out with + * C_MTXCopy. The asserts must name invX: this function has no inv parameter. + */ +u32 C_MTXInvXpose(CMtxPtr src, MtxPtr invX) { + Mtx mTmp; + MtxPtr m; + f32 det; + + OS_DEBUG_ASSERT(src, "MTXInvXpose(): NULL MtxPtr 'src' "); + OS_DEBUG_ASSERT(invX, "MTXInvXpose(): NULL MtxPtr 'invX' "); + + if (src == invX) { + m = mTmp; + } else { + m = invX; + } + + det = + src[0][0] * src[1][1] * src[2][2] + src[0][1] * src[1][2] * src[2][0] + + src[0][2] * src[1][0] * src[2][1] - src[2][0] * src[1][1] * src[0][2] - + src[1][0] * src[0][1] * src[2][2] - src[0][0] * src[2][1] * src[1][2]; + + if (det == 0.0f) { + return FALSE; + } + + det = 1.0f / det; + + m[0][0] = (src[1][1] * src[2][2] -
src[2][1] * src[1][2]) * det; + m[0][1] = -(src[1][0] * src[2][2] - src[2][0] * src[1][2]) * det; + m[0][2] = (src[1][0] * src[2][1] - src[2][0] * src[1][1]) * det; + m[1][0] = -(src[0][1] * src[2][2] - src[2][1] * src[0][2]) * det; + m[1][1] = (src[0][0] * src[2][2] - src[2][0] * src[0][2]) * det; + m[1][2] = -(src[0][0] * src[2][1] - src[2][0] * src[0][1]) * det; + m[2][0] = (src[0][1] * src[1][2] - src[1][1] * src[0][2]) * det; + m[2][1] = -(src[0][0] * src[1][2] - src[1][0] * src[0][2]) * det; + m[2][2] = (src[0][0] * src[1][1] - src[1][0] * src[0][1]) * det; + + m[0][3] = 0.0f; + m[1][3] = 0.0f; + m[2][3] = 0.0f; + + if (m == mTmp) { + C_MTXCopy(mTmp, invX); + } + + return TRUE; +} + +asm u32 PSMTXInvXpose(register CMtxPtr src, register MtxPtr invX) { + nofralloc; + + psq_l f0, Mtx_00(src), 1, qr0; + psq_l f1, Mtx_01(src), 0, qr0; + psq_l f2, Mtx_10(src), 1, qr0; + ps_merge10 f6, f1, f0; + psq_l f3, Mtx_11(src), 0, qr0; + psq_l f4, Mtx_20(src), 1, qr0; + ps_merge10 f7, f3, f2; + psq_l f5, Mtx_21(src), 0, qr0; + ps_mul f11, f3, f6; + ps_merge10 f8, f5, f4; + ps_mul f13, f5, f7; + ps_msub f11, f1, f7, f11; + ps_mul f12, f1, f8; + ps_msub f13, f3, f8, f13; + ps_msub f12, f5, f6, f12; + ps_mul f10, f3, f4; + ps_mul f9, f0, f5; + ps_mul f8, f1, f2; + ps_msub f10, f2, f5, f10; + ps_msub f9, f1, f4, f9; + ps_msub f8, f0, f3, f8; + ps_mul f7, f0, f13; + ps_sub f1, f1, f1; + ps_madd f7, f2, f12, f7; + ps_madd f7, f4, f11, f7; + ps_cmpo0 cr0, f7, f1; + bne nonsingular; + + li r3, 0x0; + blr; + +nonsingular: + fres f0, f7; + psq_st f1, Mtx_03(invX), 1, qr0; + ps_add f6, f0, f0; + ps_mul f5, f0, f0; + psq_st f1, Mtx_13(invX), 1, qr0; + ps_nmsub f0, f7, f5, f6; + psq_st f1, Mtx_23(invX), 1, qr0; + ps_muls0 f13, f13, f0; + ps_muls0 f12, f12, f0; + ps_muls0 f11, f11, f0; + psq_st f13, Mtx_00(invX), 0, qr0; + psq_st f12, Mtx_10(invX), 0, qr0; + ps_muls0 f10, f10, f0; + ps_muls0 f9, f9, f0; + psq_st f11, Mtx_20(invX), 0, qr0; + psq_st f10, Mtx_02(invX), 1, qr0; + ps_muls0 f8, 
f8, f0; + li r3, 0x1; + psq_st f9, Mtx_12(invX), 1, qr0; + psq_st f8, Mtx_22(invX), 1, qr0; + blr; +} + +void C_MTXRotRad(MtxPtr m, char axis, f32 rad) { + f32 sinA, cosA; + + OS_DEBUG_ASSERT(m, "MTXRotRad(): NULL MtxPtr 'm' "); + + sinA = sinf(rad); + cosA = cosf(rad); + C_MTXRotTrig(m, axis, sinA, cosA); +} + +void PSMTXRotRad(MtxPtr m, char axis, f32 rad) { + f32 sinA, cosA; + + sinA = sinf(rad); + cosA = cosf(rad); + PSMTXRotTrig(m, axis, sinA, cosA); +} + +void C_MTXRotTrig(MtxPtr m, char axis, f32 sinA, f32 cosA) { + OS_DEBUG_ASSERT(m, "MTXRotTrig(): NULL MtxPtr 'm' "); + + switch (axis) { + case 'x': + case 'X': + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = cosA; + m[1][2] = -sinA; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = sinA; + m[2][2] = cosA; + m[2][3] = 0.0f; + break; + case 'y': + case 'Y': + m[0][0] = cosA; + m[0][1] = 0.0f; + m[0][2] = sinA; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = -sinA; + m[2][1] = 0.0f; + m[2][2] = cosA; + m[2][3] = 0.0f; + break; + case 'z': + case 'Z': + m[0][0] = cosA; + m[0][1] = -sinA; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = sinA; + m[1][1] = cosA; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = 0.0f; + break; + default: + OS_DEBUG_ASSERT(FALSE, "MTXRotTrig(): invalid 'axis' value "); + break; + } +} + +void PSMTXRotTrig(register MtxPtr m, register char axis, register f32 sinA, + register f32 cosA) { + register f32 fc0; + register f32 fc1; + register f32 nsinA; + register f32 fw0; + register f32 fw1; + register f32 fw2; + register f32 fw3; + + ASM ( + frsp sinA, sinA; + frsp cosA, cosA; + ) + + fc0 = 0.0f; + fc1 = 1.0f; + + ASM ( + ori axis, axis, 0x20; + ps_neg nsinA, sinA; + cmplwi axis, 'x'; + beq axis_x; + cmplwi axis, 'y'; + beq axis_y; + cmplwi axis, 'z'; + beq axis_z; + b epilogue; + axis_x: + psq_st fc1, Mtx_00(m), 1, qr0; + psq_st fc0, 
Mtx_01(m), 0, qr0; + ps_merge00 fw0, sinA, cosA; + psq_st fc0, Mtx_03(m), 0, qr0; + ps_merge00 fw1, cosA, nsinA; + psq_st fc0, Mtx_13(m), 0, qr0; + psq_st fc0, Mtx_23(m), 1, qr0; + psq_st fw0, Mtx_21(m), 0, qr0; + psq_st fw1, Mtx_11(m), 0, qr0; + b epilogue; + axis_y: + ps_merge00 fw0, cosA, fc0; + ps_merge00 fw1, fc0, fc1; + psq_st fc0, Mtx_12(m), 0, qr0; + psq_st fw0, Mtx_00(m), 0, qr0; + ps_merge00 fw2, nsinA, fc0; + ps_merge00 fw3, sinA, fc0; + psq_st fw0, Mtx_22(m), 0, qr0; + psq_st fw1, Mtx_10(m), 0, qr0; + psq_st fw3, Mtx_02(m), 0, qr0; + psq_st fw2, Mtx_20(m), 0, qr0; + b epilogue; + axis_z: + psq_st fc0, Mtx_02(m), 0, qr0; + ps_merge00 fw0, sinA, cosA; + ps_merge00 fw2, cosA, nsinA; + psq_st fc0, Mtx_12(m), 0, qr0; + psq_st fc0, Mtx_20(m), 0, qr0; + ps_merge00 fw1, fc1, fc0; + psq_st fw0, Mtx_10(m), 0, qr0; + psq_st fw2, Mtx_00(m), 0, qr0; + psq_st fw1, Mtx_22(m), 0, qr0; + epilogue: + ) +} + +void C_MTXRotAxisRad(MtxPtr m, CVecPtr axis, f32 rad) { + Vec vN; + f32 s; + f32 c; + f32 t; + f32 x; + f32 y; + f32 z; + // The DWARF build we have is outdated, these names are added + f32 sqx; + f32 sqy; + f32 sqz; + + OS_DEBUG_ASSERT(m, "MTXRotAxisRad(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(axis, "MTXRotAxisRad(): NULL VecPtr 'axis' "); + + s = sinf(rad); + c = cosf(rad); + t = 1.0f - c; + + C_VECNormalize(axis, &vN); + + x = vN.x; + y = vN.y; + z = vN.z; + sqx = x * x; + sqy = y * y; + sqz = z * z; + + m[0][0] = t * sqx + c; + m[0][1] = t * x * y - s * z; + m[0][2] = t * x * z + s * y; + m[0][3] = 0.0f; + m[1][0] = t * x * y + s * z; + m[1][1] = t * sqy + c; + m[1][2] = t * y * z - s * x; + m[1][3] = 0.0f; + m[2][0] = t * x * z - s * y; + m[2][1] = t * y * z + s * x; + m[2][2] = t * sqz + c; + m[2][3] = 0.0f; +} + +// Subroutine for PSMTXRotAxisRad +// Paired singles don't have good ways of doing trig so we rely on a C caller +// DWARF merges PSMTXRotAxisRad with this so it's not perfect +static void __PSMTXRotAxisRadInternal(register MtxPtr m, register CVecPtr 
axis, + register f32 sT, register f32 cT) { + register f32 tT; + register f32 fc0; + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 tmp9; + + tmp9 = 0.5f; + tmp8 = 3.0f; + + ASM ( + frsp cT, cT; + psq_l tmp0, Vec.x(axis), 0, qr0; + frsp sT, sT; + lfs tmp1, Vec.z(axis); + ps_mul tmp2, tmp0, tmp0; + fadds tmp7, tmp9, tmp9; + ps_madd tmp3, tmp1, tmp1, tmp2; + fsubs fc0, tmp9, tmp9; + ps_sum0 tmp4, tmp3, tmp1, tmp2; + fsubs tT, tmp7, cT; + frsqrte tmp5, tmp4; + fmuls tmp2, tmp5, tmp5; + fmuls tmp3, tmp5, tmp9; + fnmsubs tmp2, tmp2, tmp4, tmp8; + fmuls tmp5, tmp2, tmp3; + ps_merge00 cT, cT, cT; + ps_muls0 tmp0, tmp0, tmp5; + ps_muls0 tmp1, tmp1, tmp5; + ps_muls0 tmp4, tmp0, tT; + ps_muls0 tmp9, tmp0, sT; + ps_muls0 tmp5, tmp1, tT; + ps_muls1 tmp3, tmp4, tmp0; + ps_muls0 tmp2, tmp4, tmp0; + ps_muls0 tmp4, tmp4, tmp1; + fnmsubs tmp6, tmp1, sT, tmp3; + fmadds tmp7, tmp1, sT, tmp3; + ps_neg tmp0, tmp9; + ps_sum0 tmp8, tmp4, fc0, tmp9; + ps_sum0 tmp2, tmp2, tmp6, cT; + ps_sum1 tmp3, cT, tmp7, tmp3; + ps_sum0 tmp6, tmp0, fc0, tmp4; + psq_st tmp8, Mtx_02(m), 0, qr0; + ps_sum0 tmp0, tmp4, tmp4, tmp0; + psq_st tmp2, Mtx_00(m), 0, qr0; + ps_muls0 tmp5, tmp5, tmp1; + psq_st tmp3, Mtx_10(m), 0, qr0; + ps_sum1 tmp4, tmp9, tmp0, tmp4; + psq_st tmp6, Mtx_12(m), 0, qr0; + ps_sum0 tmp5, tmp5, fc0, cT; + psq_st tmp4, Mtx_20(m), 0, qr0; + psq_st tmp5, Mtx_22(m), 0, qr0; + ) +} + +void PSMTXRotAxisRad(MtxPtr m, CVecPtr axis, f32 rad) { + f32 s; + f32 c; + + s = sinf(rad); + c = cosf(rad); + __PSMTXRotAxisRadInternal(m, axis, s, c); +} + +void C_MTXTrans(MtxPtr m, f32 xT, f32 yT, f32 zT) { + OS_DEBUG_ASSERT(m, "MTXTrans(): NULL MtxPtr 'm' "); + + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = xT; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = yT; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + 
m[2][3] = zT; +} + +void PSMTXTrans(register MtxPtr m, register f32 xT, register f32 yT, + register f32 zT) { + register f32 c0 = 0.0f; + register f32 c1 = 1.0f; + + ASM ( + stfs xT, Mtx_03(m); + stfs yT, Mtx_13(m); + psq_st c0, Mtx_01(m), 0, qr0; + psq_st c0, Mtx_20(m), 0, qr0; + stfs c0, Mtx_10(m); + stfs c1, Mtx_11(m); + stfs c0, Mtx_12(m); + stfs c1, Mtx_22(m); + stfs zT, Mtx_23(m); + stfs c1, Mtx_00(m); + ) +} + +void C_MTXTransApply(CMtxPtr src, MtxPtr dst, f32 xT, f32 yT, f32 zT) { + OS_DEBUG_ASSERT(src, "MTXTransApply(): NULL MtxPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTXTransApply(): NULL MtxPtr 'src' "); + + if (src != dst) { + dst[0][0] = src[0][0]; + dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + } + + dst[0][3] = src[0][3] + xT; + dst[1][3] = src[1][3] + yT; + dst[2][3] = src[2][3] + zT; +} + +asm void PSMTXTransApply(register CMtxPtr src, register MtxPtr dst, + register f32 xT, register f32 yT, register f32 zT) { + nofralloc; + + psq_l f4, Mtx_00(src), 0, qr0; + frsp xT, xT; + psq_l f5, Mtx_02(src), 0, qr0; + frsp yT, yT; + psq_l f7, Mtx_12(src), 0, qr0; + frsp zT, zT; + psq_l f8, Mtx_22(src), 0, qr0; + psq_st f4, Mtx_00(dst), 0, qr0; + ps_sum1 f5, xT, f5, f5; + psq_l f6, Mtx_10(src), 0, qr0; + psq_st f5, Mtx_02(dst), 0, qr0; + ps_sum1 f7, yT, f7, f7; + psq_l f9, Mtx_20(src), 0, qr0; + psq_st f6, Mtx_10(dst), 0, qr0; + ps_sum1 f8, zT, f8, f8; + psq_st f7, Mtx_12(dst), 0, qr0; + psq_st f9, Mtx_20(dst), 0, qr0; + psq_st f8, Mtx_22(dst), 0, qr0; + + blr; +} + +void C_MTXScale(MtxPtr m, f32 xS, f32 yS, f32 zS) { + OS_DEBUG_ASSERT(m, "MTXScale(): NULL MtxPtr 'm' "); + + m[0][0] = xS; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = yS; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = zS; + m[2][3] = 0.0f; +} + +void PSMTXScale(register 
MtxPtr m, register f32 xS, register f32 yS, + register f32 zS) { + register f32 c0 = 0.0f; + + ASM ( + stfs xS, Mtx_00(m); + psq_st c0, Mtx_01(m), 0, qr0; + psq_st c0, Mtx_03(m), 0, qr0; + stfs yS, Mtx_11(m); + psq_st c0, Mtx_12(m), 0, qr0; + psq_st c0, Mtx_20(m), 0, qr0; + stfs zS, Mtx_22(m); + stfs c0, Mtx_23(m); + ) +} + +void C_MTXScaleApply(CMtxPtr src, MtxPtr dst, f32 xS, f32 yS, f32 zS) { + OS_DEBUG_ASSERT(src, "MTXScaleApply(): NULL MtxPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTXScaleApply(): NULL MtxPtr 'dst' "); + + dst[0][0] = src[0][0] * xS; + dst[0][1] = src[0][1] * xS; + dst[0][2] = src[0][2] * xS; + dst[0][3] = src[0][3] * xS; + dst[1][0] = src[1][0] * yS; + dst[1][1] = src[1][1] * yS; + dst[1][2] = src[1][2] * yS; + dst[1][3] = src[1][3] * yS; + dst[2][0] = src[2][0] * zS; + dst[2][1] = src[2][1] * zS; + dst[2][2] = src[2][2] * zS; + dst[2][3] = src[2][3] * zS; +} + +asm void PSMTXScaleApply(register CMtxPtr src, register MtxPtr dst, + register f32 xS, register f32 yS, register f32 zS) { + nofralloc; + + frsp xS, xS; + psq_l f4, Mtx_00(src), 0, qr0; + frsp yS, yS; + psq_l f5, Mtx_02(src), 0, qr0; + frsp zS, zS; + ps_muls0 f4, f4, xS; + psq_l f6, Mtx_10(src), 0, qr0; + ps_muls0 f5, f5, xS; + psq_l f7, Mtx_12(src), 0, qr0; + ps_muls0 f6, f6, yS; + psq_l f8, Mtx_20(src), 0, qr0; + psq_st f4, Mtx_00(dst), 0, qr0; + ps_muls0 f7, f7, yS; + psq_l f2, Mtx_22(src), 0, qr0; + psq_st f5, Mtx_02(dst), 0, qr0; + ps_muls0 f8, f8, zS; + psq_st f6, Mtx_10(dst), 0, qr0; + ps_muls0 f2, f2, zS; + psq_st f7, Mtx_12(dst), 0, qr0; + psq_st f8, Mtx_20(dst), 0, qr0; + psq_st f2, Mtx_22(dst), 0, qr0; + + blr; +} + +void C_MTXQuat(MtxPtr m, CQuaternionPtr q) { + f32 s; + f32 xs; + f32 ys; + f32 zs; + f32 wx; + f32 wy; + f32 wz; + f32 xx; + f32 xy; + f32 xz; + f32 yy; + f32 yz; + f32 zz; + + OS_DEBUG_ASSERT(m, "MTXQuat(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(q, "MTXQuat(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(q->x || q->y || q->z || q->w, "MTXQuat(): zero-value 
quaternion "); + + s = 2.0f / (q->x * q->x + q->y * q->y + q->z * q->z + q->w * q->w); + xs = q->x * s; + ys = q->y * s; + zs = q->z * s; + wx = q->w * xs; + wy = q->w * ys; + wz = q->w * zs; + xx = q->x * xs; + xy = q->x * ys; + xz = q->x * zs; + yy = q->y * ys; + yz = q->y * zs; + zz = q->z * zs; + + m[0][0] = 1.0f - (yy + zz); + m[0][1] = xy - wz; + m[0][2] = xz + wy; + m[0][3] = 0.0f; + m[1][0] = xy + wz; + m[1][1] = 1.0f - (xx + zz); + m[1][2] = yz - wx; + m[1][3] = 0.0f; + m[2][0] = xz - wy; + m[2][1] = yz + wx; + m[2][2] = 1.0f - (xx + yy); + m[2][3] = 0.0f; +} + +void PSMTXQuat(register MtxPtr m, register CQuaternionPtr q) { + register f32 c_zero; + register f32 c_one = 1.0f; + register f32 c_two; + register f32 scale; + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 tmp9; + + ASM ( + psq_l tmp0, Quaternion.x(q), 0, qr0; + psq_l tmp1, Quaternion.z(q), 0, qr0; + fsubs c_zero, c_one, c_one; + fadds c_two, c_one, c_one; + ps_mul tmp2, tmp0, tmp0; + ps_merge10 tmp5, tmp0, tmp0; + ps_madd tmp4, tmp1, tmp1, tmp2; + ps_mul tmp3, tmp1, tmp1; + ps_sum0 scale, tmp4, tmp4, tmp4; + ps_muls1 tmp7, tmp5, tmp1; + fres tmp9, scale; + ps_sum1 tmp4, tmp3, tmp4, tmp2; + ps_nmsub scale, scale, tmp9, c_two; + ps_muls1 tmp6, tmp1, tmp1; + ps_mul scale, tmp9, scale; + ps_sum0 tmp2, tmp2, tmp2, tmp2; + fmuls scale, scale, c_two; + ps_madd tmp8, tmp0, tmp5, tmp6; + ps_msub tmp6, tmp0, tmp5, tmp6; + psq_st c_zero, Mtx_03(m), 1, qr0; + ps_nmsub tmp2, tmp2, scale, c_one; + ps_nmsub tmp4, tmp4, scale, c_one; + psq_st c_zero, Mtx_23(m), 1, qr0; + ps_mul tmp8, tmp8, scale; + ps_mul tmp6, tmp6, scale; + psq_st tmp2, Mtx_22(m), 1, qr0; + ps_madds0 tmp5, tmp0, tmp1, tmp7; + ps_merge00 tmp1, tmp8, tmp4; + ps_nmsub tmp7, tmp7, c_two, tmp5; + ps_merge10 tmp0, tmp4, tmp6; + psq_st tmp1, Mtx_10(m), 0, qr0; + ps_mul tmp5, tmp5, scale; + ps_mul 
tmp7, tmp7, scale; + psq_st tmp0, Mtx_00(m), 0, qr0; + psq_st tmp5, Mtx_02(m), 1, qr0; + ps_merge10 tmp3, tmp7, c_zero; + ps_merge01 tmp9, tmp7, tmp5; + psq_st tmp3, Mtx_12(m), 0, qr0; + psq_st tmp9, Mtx_20(m), 0, qr0; + ) +} + +void C_MTXReflect(MtxPtr m, CVecPtr p, CVecPtr n) { + f32 vxy = -2.0f * n->x * n->y; + f32 vxz = -2.0f * n->x * n->z; + f32 vyz = -2.0f * n->y * n->z; + f32 pdotn = 2.0f * C_VECDotProduct(p, n); + + m[0][0] = 1.0f - 2.0f * n->x * n->x; + m[0][1] = vxy; + m[0][2] = vxz; + m[0][3] = pdotn * n->x; + m[1][0] = vxy; + m[1][1] = 1.0f - 2.0f * n->y * n->y; + m[1][2] = vyz; + m[1][3] = pdotn * n->y; + m[2][0] = vxz; + m[2][1] = vyz; + m[2][2] = 1.0f - 2.0f * n->z * n->z; + m[2][3] = pdotn * n->z; +} + +void PSMTXReflect(register MtxPtr m, register CVecPtr point, + register CVecPtr normal) { + register f32 c_one = 1.0f; + register f32 vn_xy; + register f32 vn_z1; + register f32 n2vn_xy; + register f32 n2vn_z1; + register f32 pdotn; + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + + ASM ( + psq_l vn_z1, Vec.z(normal), 1, qr0; + psq_l vn_xy, Vec.x(normal), 0, qr0; + psq_l pdotn, Vec.x(point), 0, qr0; + ps_nmadd n2vn_z1, vn_z1, c_one, vn_z1; + psq_l tmp1, Vec.z(point), 1, qr0; + ps_nmadd n2vn_xy, vn_xy, c_one, vn_xy; + ps_muls0 tmp4, vn_xy, n2vn_z1; + ps_mul pdotn, n2vn_xy, pdotn; + ps_muls0 tmp2, vn_xy, n2vn_xy; + ps_sum0 pdotn, pdotn, pdotn, pdotn; + ps_muls1 n2vn_xy, vn_xy, n2vn_xy; + psq_st tmp4, Mtx_20(m), 0, qr0; + ps_sum0 tmp2, tmp2, tmp2, c_one; + ps_nmadd tmp0, n2vn_z1, tmp1, pdotn; + ps_sum1 tmp3, c_one, n2vn_xy, n2vn_xy; + psq_st tmp2, Mtx_00(m), 0, qr0; + ps_muls0 vn_xy, vn_xy, tmp0; + ps_merge00 tmp6, n2vn_z1, tmp0; + psq_st tmp3, Mtx_10(m), 0, qr0; + ps_merge00 tmp7, tmp4, vn_xy; + ps_muls0 tmp6, tmp6, vn_z1; + ps_merge11 tmp5, tmp4, vn_xy; + psq_st tmp7, Mtx_02(m), 0, qr0; + ps_sum0 tmp6, tmp6, tmp6, c_one; + psq_st 
tmp5, Mtx_12(m), 0, qr0; + psq_st tmp6, Mtx_22(m), 0, qr0; + ) +} + +void C_MTXLookAt(MtxPtr m, CVecPtr camPos, CVecPtr camUp, CPoint3dPtr target) { + Vec vLook; + Vec vRight; + Vec vUp; + + OS_DEBUG_ASSERT(m, "MTXLookAt(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(camPos, "MTXLookAt(): NULL VecPtr 'camPos' "); + OS_DEBUG_ASSERT(camUp, "MTXLookAt(): NULL VecPtr 'camUp' "); + OS_DEBUG_ASSERT(target, "MTXLookAt(): NULL Point3dPtr 'target' "); + + vLook.x = camPos->x - target->x; + vLook.y = camPos->y - target->y; + vLook.z = camPos->z - target->z; + VECNormalize(&vLook, &vLook); + VECCrossProduct(camUp, &vLook, &vRight); + VECNormalize(&vRight, &vRight); + VECCrossProduct(&vLook, &vRight, &vUp); + + m[0][0] = vRight.x; + m[0][1] = vRight.y; + m[0][2] = vRight.z; + m[0][3] = + -(camPos->x * vRight.x + camPos->y * vRight.y + camPos->z * vRight.z); + m[1][0] = vUp.x; + m[1][1] = vUp.y; + m[1][2] = vUp.z; + m[1][3] = -(camPos->x * vUp.x + camPos->y * vUp.y + camPos->z * vUp.z); + m[2][0] = vLook.x; + m[2][1] = vLook.y; + m[2][2] = vLook.z; + m[2][3] = + -(camPos->x * vLook.x + camPos->y * vLook.y + camPos->z * vLook.z); +} + +void C_MTXLightFrustum(MtxPtr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 scaleS, + f32 scaleT, f32 transS, f32 transT) { + f32 tmp; + + OS_DEBUG_ASSERT(m, "MTXLightFrustum(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(t != b, "MTXLightFrustum(): 't' and 'b' clipping planes are equal "); + OS_DEBUG_ASSERT(l != r, "MTXLightFrustum(): 'l' and 'r' clipping planes are equal "); + + tmp = 1.0f / (r - l); + m[0][0] = 2.0f * n * tmp * scaleS; + m[0][1] = 0.0f; + m[0][2] = (r + l) * tmp * scaleS - transS; + m[0][3] = 0.0f; + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = 2.0f * n * tmp * scaleT; + m[1][2] = (t + b) * tmp * scaleT - transT; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = -1.0f; + m[2][3] = 0.0f; +} + +void C_MTXLightPerspective(MtxPtr m, f32 fovY, f32 aspect, f32 scaleS, + f32 scaleT, f32 transS, f32 transT) { + f32 angle; + f32 
cot; + + OS_DEBUG_ASSERT(m, "MTXLightPerspective(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(fovY > 0.0 && fovY < 180.0, "MTXLightPerspective(): 'fovY' out of range "); + OS_DEBUG_ASSERT(aspect != 0.0f, "MTXLightPerspective(): 'aspect' is 0 "); + + // Matching with C_MTXPerspective, debug compliance + angle = fovY * 0.5f; + angle = angle * DEG2RAD; + cot = 1.0f / tanf(angle); + + m[0][0] = cot / aspect * scaleS; + m[0][1] = 0.0f; + m[0][2] = -transS; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = cot * scaleT; + m[1][2] = -transT; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = -1.0f; + m[2][3] = 0.0f; +} + +void C_MTXLightOrtho(MtxPtr m, f32 t, f32 b, f32 l, f32 r, f32 scaleS, + f32 scaleT, f32 transS, f32 transT) { + f32 tmp; + + OS_DEBUG_ASSERT(m, "MTXLightOrtho(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(t != b, "MTXLightOrtho(): 't' and 'b' clipping planes are equal "); + OS_DEBUG_ASSERT(l != r, "MTXLightOrtho(): 'l' and 'r' clipping planes are equal "); + + tmp = 1.0f / (r - l); + m[0][0] = 2.0f * tmp * scaleS; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = -(r + l) * tmp * scaleS + transS; + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = 2.0f * tmp * scaleT; + m[1][2] = 0.0f; + m[1][3] = -(t + b) * tmp * scaleT + transT; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 0.0f; + m[2][3] = 1.0f; +} diff --git a/src/revolution/MTX/mtx44.c b/src/revolution/MTX/mtx44.c index c345a7168..91a45baec 100644 --- a/src/revolution/MTX/mtx44.c +++ b/src/revolution/MTX/mtx44.c @@ -1,87 +1,917 @@ -#include #include +#include +#include + +#include + +static f32 mtxUnit[] = {0.0f, 1.0f, 0.5f, 3.0f}; + +void C_MTXFrustum(Mtx44Ptr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f) { + f32 tmp; + + OS_DEBUG_ASSERT(m, "MTXFrustum(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(t != b, "MTXFrustum(): 't' and 'b' clipping planes are equal "); + OS_DEBUG_ASSERT(l != r, "MTXFrustum(): 'l' and 'r' clipping planes are equal "); + OS_DEBUG_ASSERT(n != f, "MTXFrustum(): 'n' and 'f' 
clipping planes are equal "); + + tmp = 1.0f / (r - l); + m[0][0] = 2.0f * n * tmp; + m[0][1] = 0.0f; + m[0][2] = (r + l) * tmp; + m[0][3] = 0.0f; + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = 2.0f * n * tmp; + m[1][2] = (t + b) * tmp; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + tmp = 1.0f / (f - n); + m[2][2] = -n * tmp; + m[2][3] = -(f * n) * tmp; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = -1.0f; + m[3][3] = 0.0f; +} + +void C_MTXPerspective(Mtx44Ptr m, f32 fovY, f32 aspect, f32 n, f32 f) { + f32 angle; + f32 cot; + f32 tmp; + + OS_DEBUG_ASSERT(m, "MTXPerspective(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(fovY > 0.0 && fovY < 180.0, "MTXPerspective(): 'fovY' out of range "); + OS_DEBUG_ASSERT(aspect != 0.0f, "MTXPerspective(): 'aspect' is 0 "); + + // Float ordering, debug compliance + angle = fovY * 0.5f; + angle = angle * DEG2RAD; + cot = 1.0f / tanf(angle); + + m[0][0] = cot / aspect; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = cot; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + tmp = 1.0f / (f - n); + m[2][2] = -n * tmp; + m[2][3] = -(f * n) * tmp; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = -1.0f; + m[3][3] = 0.0f; +} + +void C_MTXOrtho(Mtx44Ptr m, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f) { + f32 tmp; + + OS_DEBUG_ASSERT(m, "MTXOrtho(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(t != b, "MTXOrtho(): 't' and 'b' clipping planes are equal "); + OS_DEBUG_ASSERT(l != r, "MTXOrtho(): 'l' and 'r' clipping planes are equal "); + OS_DEBUG_ASSERT(n != f, "MTXOrtho(): 'n' and 'f' clipping planes are equal "); + + tmp = 1.0f / (r - l); + m[0][0] = 2.0f * tmp; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = -(r + l) * tmp; + tmp = 1.0f / (t - b); + m[1][0] = 0.0f; + m[1][1] = 2.0f * tmp; + m[1][2] = 0.0f; + m[1][3] = -(t + b) * tmp; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + tmp = 1.0f / (f - n); + m[2][2] = -1.0f * tmp; + m[2][3] = -f * tmp; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + 
m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +void C_MTX44Identity(Mtx44Ptr m) { + OS_DEBUG_ASSERT(m, "MTX44Identity(): NULL Mtx44 'm' "); + + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +void PSMTX44Identity(register Mtx44Ptr m) { + register f32 c1 = 1.0f; + register f32 c0 = 0.0f; + + ASM ( + stfs c1, Mtx_00(m); + psq_st c0, Mtx_01(m), 0, qr0; + psq_st c0, Mtx_03(m), 0, qr0; + stfs c1, Mtx_11(m); + psq_st c0, Mtx_12(m), 0, qr0; + psq_st c0, Mtx_20(m), 0, qr0; + stfs c1, Mtx_22(m); + psq_st c0, Mtx_23(m), 0, qr0; + psq_st c0, Mtx_31(m), 0, qr0; + stfs c1, Mtx_33(m); + ) +} + +void C_MTX44Copy(CMtx44Ptr src, Mtx44Ptr dst) { + OS_DEBUG_ASSERT(src, "MTX44Copy(): NULL Mtx44Ptr 'src' "); + OS_DEBUG_ASSERT(dst, "MTX44Copy(): NULL Mtx44Ptr 'dst' "); + + if (src == dst) { + return; + } + + dst[0][0] = src[0][0]; + dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[0][3] = src[0][3]; + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[1][3] = src[1][3]; + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + dst[2][3] = src[2][3]; + dst[3][0] = src[3][0]; + dst[3][1] = src[3][1]; + dst[3][2] = src[3][2]; + dst[3][3] = src[3][3]; +} + +asm void PSMTX44Copy(register CMtx44Ptr src, register Mtx44Ptr dst) { + nofralloc; + + psq_l f1, Mtx_00(src), 0, qr0; + psq_st f1, Mtx_00(dst), 0, qr0; + psq_l f1, Mtx_02(src), 0, qr0; + psq_st f1, Mtx_02(dst), 0, qr0; + psq_l f1, Mtx_10(src), 0, qr0; + psq_st f1, Mtx_10(dst), 0, qr0; + psq_l f1, Mtx_12(src), 0, qr0; + psq_st f1, Mtx_12(dst), 0, qr0; + psq_l f1, Mtx_20(src), 0, qr0; + psq_st f1, Mtx_20(dst), 0, qr0; + psq_l f1, Mtx_22(src), 0, qr0; + psq_st f1, Mtx_22(dst), 0, qr0; + psq_l f1, Mtx_30(src), 0, qr0; + psq_st f1, Mtx_30(dst), 0, qr0; + 
psq_l f1, Mtx_32(src), 0, qr0; + psq_st f1, Mtx_32(dst), 0, qr0; + + blr; +} + +void C_MTX44Concat(CMtx44Ptr a, CMtx44Ptr b, Mtx44Ptr ab) { + Mtx44 mTmp; + Mtx44Ptr m; + + OS_DEBUG_ASSERT(a, "MTX44Concat(): NULL Mtx44Ptr 'a' "); + OS_DEBUG_ASSERT(b, "MTX44Concat(): NULL Mtx44Ptr 'b' "); + OS_DEBUG_ASSERT(ab, "MTX44Concat(): NULL Mtx44Ptr 'ab' "); + + if (ab == a || ab == b) { + m = mTmp; + } else { + m = ab; + } + + m[0][0] = a[0][0] * b[0][0] + a[0][1] * b[1][0] + a[0][2] * b[2][0] + + a[0][3] * b[3][0]; + m[0][1] = a[0][0] * b[0][1] + a[0][1] * b[1][1] + a[0][2] * b[2][1] + + a[0][3] * b[3][1]; + m[0][2] = a[0][0] * b[0][2] + a[0][1] * b[1][2] + a[0][2] * b[2][2] + + a[0][3] * b[3][2]; + m[0][3] = a[0][0] * b[0][3] + a[0][1] * b[1][3] + a[0][2] * b[2][3] + + a[0][3] * b[3][3]; + m[1][0] = a[1][0] * b[0][0] + a[1][1] * b[1][0] + a[1][2] * b[2][0] + + a[1][3] * b[3][0]; + m[1][1] = a[1][0] * b[0][1] + a[1][1] * b[1][1] + a[1][2] * b[2][1] + + a[1][3] * b[3][1]; + m[1][2] = a[1][0] * b[0][2] + a[1][1] * b[1][2] + a[1][2] * b[2][2] + + a[1][3] * b[3][2]; + m[1][3] = a[1][0] * b[0][3] + a[1][1] * b[1][3] + a[1][2] * b[2][3] + + a[1][3] * b[3][3]; + m[2][0] = a[2][0] * b[0][0] + a[2][1] * b[1][0] + a[2][2] * b[2][0] + + a[2][3] * b[3][0]; + m[2][1] = a[2][0] * b[0][1] + a[2][1] * b[1][1] + a[2][2] * b[2][1] + + a[2][3] * b[3][1]; + m[2][2] = a[2][0] * b[0][2] + a[2][1] * b[1][2] + a[2][2] * b[2][2] + + a[2][3] * b[3][2]; + m[2][3] = a[2][0] * b[0][3] + a[2][1] * b[1][3] + a[2][2] * b[2][3] + + a[2][3] * b[3][3]; + m[3][0] = a[3][0] * b[0][0] + a[3][1] * b[1][0] + a[3][2] * b[2][0] + + a[3][3] * b[3][0]; + m[3][1] = a[3][0] * b[0][1] + a[3][1] * b[1][1] + a[3][2] * b[2][1] + + a[3][3] * b[3][1]; + m[3][2] = a[3][0] * b[0][2] + a[3][1] * b[1][2] + a[3][2] * b[2][2] + + a[3][3] * b[3][2]; + m[3][3] = a[3][0] * b[0][3] + a[3][1] * b[1][3] + a[3][2] * b[2][3] + + a[3][3] * b[3][3]; + + if (m == mTmp) { + C_MTX44Copy(mTmp, ab); + } +} + +asm void PSMTX44Concat(register 
CMtx44Ptr mA, register CMtx44Ptr mB, + register Mtx44Ptr mAB) { + nofralloc; + + psq_l f0, Mtx_00(mA), 0, qr0; + psq_l f2, Mtx_00(mB), 0, qr0; + ps_muls0 f6, f2, f0; + psq_l f3, Mtx_10(mB), 0, qr0; + psq_l f4, Mtx_20(mB), 0, qr0; + ps_madds1 f6, f3, f0, f6; + psq_l f1, Mtx_02(mA), 0, qr0; + psq_l f5, Mtx_30(mB), 0, qr0; + ps_madds0 f6, f4, f1, f6; + psq_l f0, Mtx_10(mA), 0, qr0; + ps_madds1 f6, f5, f1, f6; + psq_l f1, Mtx_12(mA), 0, qr0; + ps_muls0 f8, f2, f0; + ps_madds1 f8, f3, f0, f8; + psq_l f0, Mtx_20(mA), 0, qr0; + ps_madds0 f8, f4, f1, f8; + ps_madds1 f8, f5, f1, f8; + psq_l f1, Mtx_22(mA), 0, qr0; + ps_muls0 f10, f2, f0; + ps_madds1 f10, f3, f0, f10; + psq_l f0, Mtx_30(mA), 0, qr0; + ps_madds0 f10, f4, f1, f10; + ps_madds1 f10, f5, f1, f10; + psq_l f1, Mtx_32(mA), 0, qr0; + ps_muls0 f12, f2, f0; + psq_l f2, Mtx_02(mB), 0, qr0; + ps_madds1 f12, f3, f0, f12; + psq_l f0, Mtx_00(mA), 0, qr0; + ps_madds0 f12, f4, f1, f12; + psq_l f3, Mtx_12(mB), 0, qr0; + ps_madds1 f12, f5, f1, f12; + psq_l f1, Mtx_02(mA), 0, qr0; + ps_muls0 f7, f2, f0; + psq_l f4, Mtx_22(mB), 0, qr0; + ps_madds1 f7, f3, f0, f7; + psq_l f5, Mtx_32(mB), 0, qr0; + ps_madds0 f7, f4, f1, f7; + psq_l f0, Mtx_10(mA), 0, qr0; + ps_madds1 f7, f5, f1, f7; + psq_l f1, Mtx_12(mA), 0, qr0; + ps_muls0 f9, f2, f0; + psq_st f6, Mtx_00(mAB), 0, qr0; + ps_madds1 f9, f3, f0, f9; + psq_l f0, Mtx_20(mA), 0, qr0; + ps_madds0 f9, f4, f1, f9; + psq_st f8, Mtx_10(mAB), 0, qr0; + ps_madds1 f9, f5, f1, f9; + psq_l f1, Mtx_22(mA), 0, qr0; + ps_muls0 f11, f2, f0; + psq_st f10, Mtx_20(mAB), 0, qr0; + ps_madds1 f11, f3, f0, f11; + psq_l f0, Mtx_30(mA), 0, qr0; + ps_madds0 f11, f4, f1, f11; + psq_st f12, Mtx_30(mAB), 0, qr0; + ps_madds1 f11, f5, f1, f11; + psq_l f1, Mtx_32(mA), 0, qr0; + ps_muls0 f13, f2, f0; + psq_st f7, Mtx_02(mAB), 0, qr0; + ps_madds1 f13, f3, f0, f13; + psq_st f9, Mtx_12(mAB), 0, qr0; + ps_madds0 f13, f4, f1, f13; + psq_st f11, Mtx_22(mAB), 0, qr0; + ps_madds1 f13, f5, f1, f13; + psq_st f13, Mtx_32(mAB), 
0, qr0; + + blr; +} + +void C_MTX44Transpose(CMtx44Ptr src, Mtx44Ptr xPose) { + Mtx44 mTmp; + Mtx44Ptr m; + + OS_DEBUG_ASSERT(src, "MTX44Transpose(): NULL Mtx44Ptr 'src' "); + OS_DEBUG_ASSERT(xPose, "MTX44Transpose(): NULL Mtx44Ptr 'xPose' "); + + if (src == xPose) { + m = mTmp; + } else { + m = xPose; + } + + m[0][0] = src[0][0]; + m[0][1] = src[1][0]; + m[0][2] = src[2][0]; + m[0][3] = src[3][0]; + m[1][0] = src[0][1]; + m[1][1] = src[1][1]; + m[1][2] = src[2][1]; + m[1][3] = src[3][1]; + m[2][0] = src[0][2]; + m[2][1] = src[1][2]; + m[2][2] = src[2][2]; + m[2][3] = src[3][2]; + m[3][0] = src[0][3]; + m[3][1] = src[1][3]; + m[3][2] = src[2][3]; + m[3][3] = src[3][3]; + + if (m == mTmp) { + MTX44Copy(mTmp, xPose); + } +} + +asm void PSMTX44Transpose(register CMtx44Ptr src, register Mtx44Ptr xPose) { + nofralloc; + + psq_l f0, Mtx_00(src), 0, qr0; + psq_l f1, Mtx_10(src), 0, qr0; + ps_merge00 f4, f0, f1; + psq_l f2, Mtx_02(src), 0, qr0; + psq_st f4, Mtx_00(xPose), 0, qr0; + ps_merge11 f5, f0, f1; + psq_l f3, Mtx_12(src), 0, qr0; + psq_st f5, Mtx_10(xPose), 0, qr0; + ps_merge00 f4, f2, f3; + psq_l f0, Mtx_20(src), 0, qr0; + psq_st f4, Mtx_20(xPose), 0, qr0; + ps_merge11 f5, f2, f3; + psq_l f1, Mtx_30(src), 0, qr0; + psq_st f5, Mtx_30(xPose), 0, qr0; + ps_merge00 f4, f0, f1; + psq_l f2, Mtx_22(src), 0, qr0; + psq_st f4, Mtx_02(xPose), 0, qr0; + ps_merge11 f5, f0, f1; + psq_l f3, Mtx_32(src), 0, qr0; + psq_st f5, Mtx_12(xPose), 0, qr0; + ps_merge00 f4, f2, f3; + psq_st f4, Mtx_22(xPose), 0, qr0; + ps_merge11 f5, f2, f3; + psq_st f5, Mtx_32(xPose), 0, qr0; + + blr; +} + +u32 C_MTX44Inverse(CMtx44Ptr src, Mtx44Ptr inv) { + Mtx44 gjm; + s32 i; + s32 j; + s32 k; + f32 w; + f32 max; + s32 swp; + f32 ftmp; + + OS_DEBUG_ASSERT(src, "MTX44Inverse(): NULL Mtx44Ptr 'src' "); + OS_DEBUG_ASSERT(inv, "MTX44Inverse(): NULL Mtx44Ptr 'inv' "); + + MTX44Copy(src, gjm); + MTX44Identity(inv); + + for (i = 0; i < 4; ++i) { + f32 max = 0.0f; + s32 swp = i; + + for (j = i; j < 4; ++j) { + 
ftmp = fabsf(gjm[j][i]); + if (ftmp > max) { + max = ftmp; + swp = j; + } + } + + if (max == 0.0f) { + // Missing pivot, matrix is singular + return FALSE; + } + + if (swp != i) { + for (j = 0; j < 4; ++j) { + // DWARF has two "tmp" variables + // Placing one here and another in the bottom loop doesn't work + { + f32 tmp = gjm[i][j]; + gjm[i][j] = gjm[swp][j]; + gjm[swp][j] = tmp; + } + { + f32 tmp = inv[i][j]; + inv[i][j] = inv[swp][j]; + inv[swp][j] = tmp; + } + } + } + + w = 1.0f / gjm[i][i]; + + for (k = 0; k < 4; ++k) { + gjm[i][k] *= w; + inv[i][k] *= w; + } + + for (j = 0; j < 4; ++j) { + if (j != i) { + w = gjm[j][i]; + for (k = 0; k < 4; ++k) { + gjm[j][k] -= gjm[i][k] * w; + inv[j][k] -= inv[i][k] * w; + } + } + } + } + + return TRUE; +} + +void C_MTX44Trans(Mtx44Ptr m, f32 xT, f32 yT, f32 zT) { + OS_DEBUG_ASSERT(m, "MTX44Trans(): NULL Mtx44Ptr 'm' "); + + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = xT; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = yT; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = zT; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +void PSMTX44Trans(register Mtx44Ptr m, register f32 xT, register f32 yT, + register f32 zT) { + register f32 c_zero = 0.0f; + register f32 c_one = 1.0f; + register f32 c_01; + + ASM ( + stfs xT, Mtx_03(m); + stfs yT, Mtx_13(m); + ps_merge00 c_01, c_zero, c_one; + stfs zT, Mtx_23(m); + psq_st c_one, Mtx_00(m), 1, qr0; + psq_st c_zero, Mtx_01(m), 0, qr0; + psq_st c_01, Mtx_10(m), 0, qr0; + psq_st c_zero, Mtx_12(m), 1, qr0; + psq_st c_zero, Mtx_20(m), 0, qr0; + psq_st c_one, Mtx_22(m), 1, qr0; + psq_st c_zero, Mtx_30(m), 0, qr0; + psq_st c_01, Mtx_32(m), 0, qr0; + ) +} + +void C_MTX44TransApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xT, f32 yT, f32 zT) { + OS_DEBUG_ASSERT(src, "MTX44TransApply(): NULL Mtx44Ptr 'src' "); + OS_DEBUG_ASSERT(dst, "MTX44TransApply(): NULL Mtx44Ptr 'src' "); + + if (src != dst) { + dst[0][0] = src[0][0]; 
+ dst[0][1] = src[0][1]; + dst[0][2] = src[0][2]; + dst[1][0] = src[1][0]; + dst[1][1] = src[1][1]; + dst[1][2] = src[1][2]; + dst[2][0] = src[2][0]; + dst[2][1] = src[2][1]; + dst[2][2] = src[2][2]; + dst[3][0] = src[3][0]; + dst[3][1] = src[3][1]; + dst[3][2] = src[3][2]; + dst[3][3] = src[3][3]; + } + + dst[0][3] = src[0][3] + xT; + dst[1][3] = src[1][3] + yT; + dst[2][3] = src[2][3] + zT; +} + +asm void PSMTX44TransApply(register CMtx44Ptr src, register Mtx44Ptr dst, + register f32 xT, register f32 yT, register f32 zT) { + nofralloc; + + psq_l f4, Mtx_00(src), 0, qr0; + frsp xT, xT; + psq_l f5, Mtx_02(src), 0, qr0; + frsp yT, yT; + psq_l f6, Mtx_10(src), 0, qr0; + frsp zT, zT; + psq_l f7, Mtx_12(src), 0, qr0; + psq_st f4, Mtx_00(dst), 0, qr0; + ps_sum1 f5, f1, f5, f5; + psq_l f4, Mtx_22(src), 0, qr0; + psq_st f6, Mtx_10(dst), 0, qr0; + ps_sum1 f7, f2, f7, f7; + psq_l f8, Mtx_20(src), 0, qr0; + psq_st f5, Mtx_02(dst), 0, qr0; + ps_sum1 f4, f3, f4, f4; + psq_st f7, Mtx_12(dst), 0, qr0; + psq_st f8, Mtx_20(dst), 0, qr0; + psq_l f5, Mtx_30(src), 0, qr0; + psq_l f6, Mtx_32(src), 0, qr0; + psq_st f4, Mtx_22(dst), 0, qr0; + psq_st f5, Mtx_30(dst), 0, qr0; + psq_st f6, Mtx_32(dst), 0, qr0; + + blr; +} + +void C_MTX44Scale(Mtx44Ptr m, f32 xS, f32 yS, f32 zS) { + OS_DEBUG_ASSERT(m, "MTX44Scale(): NULL Mtx44Ptr 'm' "); + + m[0][0] = xS; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = yS; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = zS; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +void PSMTX44Scale(register Mtx44Ptr m, register f32 xS, register f32 yS, + register f32 zS) { + register f32 c_zero = 0.0f; + register f32 c_one = 1.0f; + + ASM ( + stfs xS, Mtx_00(m); + psq_st c_zero, Mtx_01(m), 0, qr0; + psq_st c_zero, Mtx_03(m), 0, qr0; + stfs yS, Mtx_11(m); + psq_st c_zero, Mtx_12(m), 0, qr0; + psq_st c_zero, Mtx_20(m), 0, qr0; + stfs zS, Mtx_22(m); + 
psq_st c_zero, Mtx_23(m), 0, qr0; + psq_st c_zero, Mtx_31(m), 0, qr0; + stfs c_one, Mtx_33(m); + ) +} + +void C_MTX44ScaleApply(CMtx44Ptr src, Mtx44Ptr dst, f32 xS, f32 yS, f32 zS) { + OS_DEBUG_ASSERT(src, "MTX44ScaleApply(): NULL Mtx44Ptr 'src' "); + OS_DEBUG_ASSERT(dst, "MTX44ScaleApply(): NULL Mtx44Ptr 'dst' "); + + dst[0][0] = src[0][0] * xS; + dst[0][1] = src[0][1] * xS; + dst[0][2] = src[0][2] * xS; + dst[0][3] = src[0][3] * xS; + dst[1][0] = src[1][0] * yS; + dst[1][1] = src[1][1] * yS; + dst[1][2] = src[1][2] * yS; + dst[1][3] = src[1][3] * yS; + dst[2][0] = src[2][0] * zS; + dst[2][1] = src[2][1] * zS; + dst[2][2] = src[2][2] * zS; + dst[2][3] = src[2][3] * zS; + dst[3][0] = src[3][0]; + dst[3][1] = src[3][1]; + dst[3][2] = src[3][2]; + dst[3][3] = src[3][3]; +} + +asm void PSMTX44ScaleApply(register CMtx44Ptr src, register Mtx44Ptr dst, + register f32 xS, register f32 yS, register f32 zS) { + nofralloc; + + psq_l f4, Mtx_00(src), 0, qr0; + frsp xS, xS; + psq_l f5, Mtx_02(src), 0, qr0; + frsp yS, yS; + psq_l f6, Mtx_10(src), 0, qr0; + ps_muls0 f4, f4, xS; + psq_l f7, Mtx_12(src), 0, qr0; + ps_muls0 f5, f5, xS; + psq_l f8, Mtx_20(src), 0, qr0; + frsp zS, zS; + psq_st f4, Mtx_00(dst), 0, qr0; + ps_muls0 f6, f6, yS; + psq_l f9, Mtx_22(src), 0, qr0; + psq_st f5, Mtx_02(dst), 0, qr0; + ps_muls0 f7, f7, yS; + psq_l f10, Mtx_30(src), 0, qr0; + psq_st f6, Mtx_10(dst), 0, qr0; + ps_muls0 f8, f8, zS; + psq_l f11, Mtx_32(src), 0, qr0; + psq_st f7, Mtx_12(dst), 0, qr0; + ps_muls0 f9, f9, zS; + psq_st f8, Mtx_20(dst), 0, qr0; + psq_st f9, Mtx_22(dst), 0, qr0; + psq_st f10, Mtx_30(dst), 0, qr0; + psq_st f11, Mtx_32(dst), 0, qr0; + + blr; +} + +void C_MTX44RotRad(Mtx44Ptr m, char axis, f32 rad) { + f32 sinA, cosA; + + OS_DEBUG_ASSERT(m, "MTX44RotRad(): NULL Mtx44Ptr 'm' "); + + sinA = sinf(rad); + cosA = cosf(rad); + C_MTX44RotTrig(m, axis, sinA, cosA); +} + +void PSMTX44RotRad(Mtx44Ptr m, char axis, f32 rad) { + f32 sinA, cosA; + + sinA = sinf(rad); + cosA = cosf(rad); 
+ PSMTX44RotTrig(m, axis, sinA, cosA); +} + +void C_MTX44RotTrig(Mtx44Ptr m, char axis, f32 sinA, f32 cosA) { + OS_DEBUG_ASSERT(m, "MTX44RotTrig(): NULL Mtx44Ptr 'm' "); + + axis |= 0x20; + switch (axis) { + case 'x': + m[0][0] = 1.0f; + m[0][1] = 0.0f; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = cosA; + m[1][2] = -sinA; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = sinA; + m[2][2] = cosA; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; + break; + case 'y': + m[0][0] = cosA; + m[0][1] = 0.0f; + m[0][2] = sinA; + m[0][3] = 0.0f; + m[1][0] = 0.0f; + m[1][1] = 1.0f; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = -sinA; + m[2][1] = 0.0f; + m[2][2] = cosA; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; + break; + case 'z': + m[0][0] = cosA; + m[0][1] = -sinA; + m[0][2] = 0.0f; + m[0][3] = 0.0f; + m[1][0] = sinA; + m[1][1] = cosA; + m[1][2] = 0.0f; + m[1][3] = 0.0f; + m[2][0] = 0.0f; + m[2][1] = 0.0f; + m[2][2] = 1.0f; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; + break; + default: + OS_DEBUG_ASSERT(FALSE, "MTX44RotTrig(): invalid 'axis' value "); + break; + } +} + +void PSMTX44RotTrig(register MtxPtr m, register char axis, register f32 sinA, + register f32 cosA) { + register f32 ftmp0; + register f32 ftmp1; + register f32 ftmp2; + register f32 ftmp3; + register f32 ftmp4; + register f32 c_zero = 0.0f; + register f32 c_one = 1.0f; + + ASM ( + frsp sinA, sinA; + ori axis, axis, 0x20; + frsp cosA, cosA; + cmplwi axis, 'x'; + beq axis_x; + cmplwi axis, 'y'; + beq axis_y; + cmplwi axis, 'z'; + beq axis_z; + b epilogue; + axis_x: + psq_st c_one, Mtx_00(m), 1, qr0; + psq_st c_zero, Mtx_01(m), 0, qr0; + ps_neg ftmp0, sinA; + psq_st c_zero, Mtx_03(m), 0, qr0; + ps_merge00 ftmp1, sinA, cosA; + psq_st c_zero, Mtx_13(m), 0, qr0; + ps_merge00 ftmp0, cosA, ftmp0; + psq_st c_zero, Mtx_23(m), 0, qr0; + psq_st c_zero, Mtx_31(m), 0, 
qr0; + psq_st ftmp1, Mtx_21(m), 0, qr0; + psq_st ftmp0, Mtx_11(m), 0, qr0; + psq_st c_one, Mtx_33(m), 1, qr0; + b epilogue; + axis_y: + ps_merge00 ftmp1, cosA, c_zero; + psq_st c_zero, Mtx_30(m), 0, qr0; + ps_neg ftmp0, sinA; + psq_st c_zero, Mtx_12(m), 0, qr0; + ps_merge00 ftmp3, c_zero, c_one; + psq_st ftmp1, Mtx_00(m), 0, qr0; + ps_merge00 ftmp4, ftmp0, c_zero; + ps_merge00 ftmp2, sinA, c_zero; + psq_st ftmp3, Mtx_10(m), 0, qr0; + psq_st ftmp2, Mtx_02(m), 0, qr0; + psq_st ftmp4, Mtx_20(m), 0, qr0; + psq_st ftmp1, Mtx_22(m), 0, qr0; + psq_st ftmp3, Mtx_32(m), 0, qr0; + b epilogue; + axis_z: + psq_st c_zero, Mtx_02(m), 0, qr0; + ps_neg ftmp0, sinA; + psq_st c_zero, Mtx_12(m), 0, qr0; + ps_merge00 ftmp1, sinA, cosA; + psq_st c_zero, Mtx_20(m), 0, qr0; + ps_merge00 ftmp2, c_one, c_zero; + psq_st c_zero, Mtx_30(m), 0, qr0; + ps_merge00 ftmp3, c_zero, c_one; + psq_st ftmp1, Mtx_10(m), 0, qr0; + ps_merge00 ftmp4, cosA, ftmp0; + psq_st ftmp2, Mtx_22(m), 0, qr0; + psq_st ftmp3, Mtx_32(m), 0, qr0; + psq_st ftmp4, Mtx_00(m), 0, qr0; + epilogue: + ) +} + +void C_MTX44RotAxisRad(MtxPtr m, CVecPtr axis, f32 rad) { + Vec vN; + f32 s; + f32 c; + f32 t; + f32 x; + f32 y; + f32 z; + // The DWARF build we have is outdated, these names are added + f32 sqx; + f32 sqy; + f32 sqz; + + OS_DEBUG_ASSERT(m, "MTX44RotAxisRad(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(axis, "MTX44RotAxisRad(): NULL VecPtr 'axis' "); + + s = sinf(rad); + c = cosf(rad); + t = 1.0f - c; + + C_VECNormalize(axis, &vN); + + x = vN.x; + y = vN.y; + z = vN.z; + sqx = x * x; + sqy = y * y; + sqz = z * z; + + m[0][0] = t * sqx + c; + m[0][1] = t * x * y - s * z; + m[0][2] = t * x * z + s * y; + m[0][3] = 0.0f; + m[1][0] = t * x * y + s * z; + m[1][1] = t * sqy + c; + m[1][2] = t * y * z - s * x; + m[1][3] = 0.0f; + m[2][0] = t * x * z - s * y; + m[2][1] = t * y * z + s * x; + m[2][2] = t * sqz + c; + m[2][3] = 0.0f; + m[3][0] = 0.0f; + m[3][1] = 0.0f; + m[3][2] = 0.0f; + m[3][3] = 1.0f; +} + +// Subroutine for 
PSMTX44RotAxisRad +// Paired singles don't have good ways of doing trig so we rely on a C caller +// DWARF doesn't have this, so copied from __PSMTXRotAxisRadInternal +static void __PSMTX44RotAxisRadInternal(register MtxPtr m, + register CVecPtr axis, register f32 sT, + register f32 cT) { + register f32 tT; + register f32 fc0; + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 tmp9; + + tmp9 = 0.5f; + tmp8 = 3.0f; + + ASM ( + frsp cT, cT; + psq_l tmp0, Vec.x(axis), 0, qr0; + frsp sT, sT; + lfs tmp1, Vec.z(axis); + ps_mul tmp2, tmp0, tmp0; + fadds tmp7, tmp9, tmp9; + ps_madd tmp3, tmp1, tmp1, tmp2; + fsubs fc0, tmp9, tmp9; + ps_sum0 tmp4, tmp3, tmp1, tmp2; + fsubs tT, tmp7, cT; + frsqrte tmp5, tmp4; + ps_merge00 tmp7, fc0, tmp7; + fmuls tmp2, tmp5, tmp5; + fmuls tmp3, tmp5, tmp9; + psq_st fc0, Mtx_30(m), 0, qr0; + fnmsubs tmp2, tmp2, tmp4, tmp8; + fmuls tmp5, tmp2, tmp3; + psq_st tmp7, Mtx_32(m), 0, qr0; + ps_merge00 cT, cT, cT; + ps_muls0 tmp0, tmp0, tmp5; + ps_muls0 tmp1, tmp1, tmp5; + ps_muls0 tmp4, tmp0, tT; + ps_muls0 tmp9, tmp0, sT; + ps_muls0 tmp5, tmp1, tT; + ps_muls1 tmp3, tmp4, tmp0; + ps_muls0 tmp2, tmp4, tmp0; + ps_muls0 tmp4, tmp4, tmp1; + fnmsubs tmp6, tmp1, sT, tmp3; + fmadds tmp7, tmp1, sT, tmp3; + ps_neg tmp0, tmp9; + ps_sum0 tmp8, tmp4, fc0, tmp9; + ps_sum0 tmp2, tmp2, tmp6, cT; + ps_sum1 tmp3, cT, tmp7, tmp3; + ps_sum0 tmp6, tmp0, fc0, tmp4; + psq_st tmp8, Mtx_02(m), 0, qr0; + ps_sum0 tmp0, tmp4, tmp4, tmp0; + psq_st tmp2, Mtx_00(m), 0, qr0; + ps_muls0 tmp5, tmp5, tmp1; + psq_st tmp3, Mtx_10(m), 0, qr0; + ps_sum1 tmp4, tmp9, tmp0, tmp4; + psq_st tmp6, Mtx_12(m), 0, qr0; + ps_sum0 tmp5, tmp5, fc0, cT; + psq_st tmp4, Mtx_20(m), 0, qr0; + psq_st tmp5, Mtx_22(m), 0, qr0; + ) +} + +void PSMTX44RotAxisRad(MtxPtr m, CVecPtr axis, f32 rad) { + f32 s; + f32 c; -void C_MTXFrustum(Mtx44 mtx, f32 t, f32 b, 
f32 l, f32 r, f32 n, f32 f) { - f32 invrange; - - invrange = 1.0f / (r - l); - mtx[0][0] = 2.0f * n * invrange; - mtx[0][1] = 0.0f; - mtx[0][2] = invrange * (r + l); - mtx[0][3] = 0.0f; - - invrange = 1.0f / (t - b); - mtx[1][0] = 0.0f; - mtx[1][1] = 2.0f * n * invrange; - mtx[1][2] = invrange * (t + b); - mtx[1][3] = 0.0f; - - invrange = 1.0f / (f - n); - mtx[2][0] = 0.0f; - mtx[2][1] = 0.0f; - mtx[2][2] = -n * invrange; - mtx[2][3] = invrange * -(f * n); - - mtx[3][0] = 0.0f; - mtx[3][1] = 0.0f; - mtx[3][2] = -1.0f; - mtx[3][3] = 0.0f; -} - -DECOMP_FORCELITERAL(mtx44_c, 0.5f); - -void C_MTXPerspective(Mtx44 mtx, f32 fovy, f32 aspect, f32 n, f32 f) { - f32 rad, cot; - f32 invrange; - - rad = M_PI / 180.0f * (0.5f * fovy); - cot = 1.0f / tanf(rad); - - mtx[0][0] = cot / aspect; - mtx[0][1] = 0.0f; - mtx[0][2] = 0.0f; - mtx[0][3] = 0.0f; - - mtx[1][0] = 0.0f; - mtx[1][1] = cot; - mtx[1][2] = 0.0f; - mtx[1][3] = 0.0f; - - invrange = 1.0f / (f - n); - mtx[2][0] = 0.0f; - mtx[2][1] = 0.0f; - mtx[2][2] = -n * invrange; - mtx[2][3] = invrange * -(f * n); - - mtx[3][0] = 0.0f; - mtx[3][1] = 0.0f; - mtx[3][2] = -1.0f; - mtx[3][3] = 0.0f; -} - -void C_MTXOrtho(Mtx44 mtx, f32 t, f32 b, f32 l, f32 r, f32 n, f32 f) { - f32 invrange; - - invrange = 1.0f / (r - l); - mtx[0][0] = 2.0f * invrange; - mtx[0][1] = 0.0f; - mtx[0][2] = 0.0f; - mtx[0][3] = invrange * -(r + l); - - invrange = 1.0f / (t - b); - mtx[1][0] = 0.0f; - mtx[1][1] = 2.0f * invrange; - mtx[1][2] = 0.0f; - mtx[1][3] = invrange * -(t + b); - - invrange = 1.0f / (f - n); - mtx[2][0] = 0.0f; - mtx[2][1] = 0.0f; - mtx[2][2] = -1.0f * invrange; - mtx[2][3] = -f * invrange; - - mtx[3][0] = 0.0f; - mtx[3][1] = 0.0f; - mtx[3][2] = 0.0f; - mtx[3][3] = 1.0f; + s = sinf(rad); + c = cosf(rad); + __PSMTX44RotAxisRadInternal(m, axis, s, c); } diff --git a/src/revolution/MTX/mtx44vec.c b/src/revolution/MTX/mtx44vec.c new file mode 100644 index 000000000..157053a4a --- /dev/null +++ b/src/revolution/MTX/mtx44vec.c @@ -0,0 +1,282 
@@ +#include +#include +#include + +void C_MTX44MultVec(CMtx44Ptr m, CVecPtr src, VecPtr dst) { + Vec vTmp; + f32 w; + + OS_DEBUG_ASSERT(m, "MTX44MultVec(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(src, "MTX44MultVec(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTX44MultVec(): NULL VecPtr 'dst' "); + + vTmp.x = m[0][0] * src->x + m[0][1] * src->y + m[0][2] * src->z + m[0][3]; + vTmp.y = m[1][0] * src->x + m[1][1] * src->y + m[1][2] * src->z + m[1][3]; + vTmp.z = m[2][0] * src->x + m[2][1] * src->y + m[2][2] * src->z + m[2][3]; + w = m[3][0] * src->x + m[3][1] * src->y + m[3][2] * src->z + m[3][3]; + w = 1.0f / w; + + dst->x = vTmp.x * w; + dst->y = vTmp.y * w; + dst->z = vTmp.z * w; +} + +asm void PSMTX44MultVec(register CMtx44Ptr m, register CVecPtr src, + register VecPtr dst) { + nofralloc; + + psq_l f0, Vec.x(src), 0, qr0; + psq_l f2, Mtx_30(m), 0, qr0; + psq_l f1, Vec.z(src), 1, qr0; + ps_mul f4, f0, f2; + psq_l f3, Mtx_32(m), 0, qr0; + ps_madd f5, f1, f3, f4; + ps_merge11 f12, f1, f1; + ps_sum0 f13, f5, f5, f5; + psq_l f4, Mtx_00(m), 0, qr0; + ps_merge00 f13, f13, f13; + psq_l f5, Mtx_02(m), 0, qr0; + ps_div f13, f12, f13; + psq_l f6, Mtx_10(m), 0, qr0; + psq_l f7, Mtx_12(m), 0, qr0; + psq_l f8, Mtx_20(m), 0, qr0; + psq_l f9, Mtx_22(m), 0, qr0; + ps_mul f4, f0, f4; + ps_madd f2, f1, f5, f4; + ps_mul f6, f0, f6; + ps_madd f3, f1, f7, f6; + ps_mul f8, f0, f8; + ps_sum0 f2, f2, f2, f2; + ps_madd f9, f1, f9, f8; + ps_sum1 f2, f3, f2, f3; + ps_sum0 f3, f9, f9, f9; + ps_mul f2, f2, f13; + psq_st f2, Vec.x(dst), 0, qr0; + ps_mul f3, f3, f13; + psq_st f3, Vec.z(dst), 1, qr0; + + blr; +} + +void C_MTX44MultVecArray(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count) { + u32 i; + Vec vTmp; + f32 w; + + OS_DEBUG_ASSERT(m, "MTX44MultVecArray(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(srcBase, "MTX44MultVecArray(): NULL VecPtr 'srcBase' "); + OS_DEBUG_ASSERT(dstBase, "MTX44MultVecArray(): NULL VecPtr 'dstBase' "); + + for (i = 0; i < count; ++i) { + vTmp.x = 
m[0][0] * srcBase->x + m[0][1] * srcBase->y + + m[0][2] * srcBase->z + m[0][3]; + vTmp.y = m[1][0] * srcBase->x + m[1][1] * srcBase->y + + m[1][2] * srcBase->z + m[1][3]; + vTmp.z = m[2][0] * srcBase->x + m[2][1] * srcBase->y + + m[2][2] * srcBase->z + m[2][3]; + w = m[3][0] * srcBase->x + m[3][1] * srcBase->y + m[3][2] * srcBase->z + + m[3][3]; + w = 1.0f / w; + + dstBase->x = vTmp.x * w; + dstBase->y = vTmp.y * w; + dstBase->z = vTmp.z * w; + + ++srcBase; + ++dstBase; + } +} + +asm void PSMTX44MultVecArray(register CMtx44Ptr m, register CVecPtr srcBase, + register VecPtr dstBase, register u32 count) { + nofralloc; + + stwu sp, -0x18(sp); + subi count, count, 0x1; + psq_l f6, Mtx_30(m), 0, qr0; + mtctr count; + psq_l f8, Vec.x(srcBase), 0, qr0; + subi dstBase, dstBase, 0x4; + stfd f14, 0x8(sp); + psq_l f7, Mtx_32(m), 0, qr0; + psq_lu f9, Vec.z(srcBase), 1, qr0; + ps_mul f13, f6, f8; + psq_l f0, Mtx_00(m), 0, qr0; + psq_st f14, 0x10(sp), 0, qr0; + ps_madd f13, f7, f9, f13; + psq_l f2, Mtx_10(m), 0, qr0; + ps_merge11 f14, f9, f9; + ps_mul f10, f0, f8; + psq_l f4, Mtx_20(m), 0, qr0; + ps_mul f11, f2, f8; + psq_l f1, Mtx_02(m), 0, qr0; + ps_mul f12, f4, f8; + psq_l f3, Mtx_12(m), 0, qr0; + ps_sum0 f13, f13, f13, f13; + psq_l f5, Mtx_22(m), 0, qr0; + + // Don't use Vec offsets here, they're not accurate +loop: + ps_madd f10, f1, f9, f10; + ps_madd f11, f3, f9, f11; + ps_madd f12, f5, f9, f12; + ps_sum0 f10, f10, f10, f10; + ps_sum0 f11, f11, f11, f11; + ps_sum0 f12, f12, f12, f12; + ps_div f13, f14, f13; + psq_lu f8, 0x4(srcBase), 0, qr0; + psq_lu f9, 0x8(srcBase), 1, qr0; + ps_mul f10, f10, f13; + psq_stu f10, 0x4(dstBase), 1, qr0; + ps_mul f11, f11, f13; + psq_stu f11, 0x4(dstBase), 1, qr0; + ps_mul f12, f12, f13; + psq_stu f12, 0x4(dstBase), 1, qr0; + ps_mul f13, f6, f8; + ps_mul f10, f0, f8; + ps_mul f11, f2, f8; + ps_madd f13, f7, f9, f13; + ps_mul f12, f4, f8; + ps_sum0 f13, f13, f13, f13; + bdnz loop; + + ps_madd f10, f1, f9, f10; + ps_madd f11, f3, f9, f11; + 
ps_madd f12, f5, f9, f12; + ps_sum0 f10, f10, f10, f10; + ps_sum0 f11, f11, f11, f11; + ps_sum0 f12, f12, f12, f12; + ps_div f13, f14, f13; + ps_mul f10, f10, f13; + psq_st f10, 0x4(dstBase), 1, qr0; + ps_mul f11, f11, f13; + psq_st f11, 0x8(dstBase), 1, qr0; + ps_mul f12, f12, f13; + psq_st f12, 0xc(dstBase), 1, qr0; + + psq_l f14, 0x10(sp), 0, qr0; + lfd f14, 0x8(sp); + addi sp, sp, 0x18; + blr; +} + +void C_MTX44MultVecSR(CMtx44Ptr m, CVecPtr src, VecPtr dst) { + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTX44MultVecSR(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(src, "MTX44MultVecSR(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTX44MultVecSR(): NULL VecPtr 'dst' "); + + vTmp.x = m[0][0] * src->x + m[0][1] * src->y + m[0][2] * src->z; + vTmp.y = m[1][0] * src->x + m[1][1] * src->y + m[1][2] * src->z; + vTmp.z = m[2][0] * src->x + m[2][1] * src->y + m[2][2] * src->z; + + dst->x = vTmp.x; + dst->y = vTmp.y; + dst->z = vTmp.z; +} + +asm void PSMTX44MultVecSR(register CMtx44Ptr m, register CVecPtr src, + register VecPtr dst) { + nofralloc; + + psq_l f0, Mtx_00(m), 0, qr0; + psq_l f6, Vec.x(src), 0, qr0; + psq_l f2, Mtx_10(m), 0, qr0; + ps_mul f8, f0, f6; + psq_l f4, Mtx_20(m), 0, qr0; + ps_mul f10, f2, f6; + psq_l f7, Vec.z(src), 1, qr0; + ps_mul f12, f4, f6; + psq_l f3, Mtx_12(m), 0, qr0; + ps_sum0 f8, f8, f8, f8; + psq_l f5, Mtx_22(m), 0, qr0; + ps_sum0 f10, f10, f10, f10; + psq_l f1, Mtx_02(m), 0, qr0; + ps_sum0 f12, f12, f12, f12; + ps_madd f9, f1, f7, f8; + psq_st f9, Vec.x(dst), 1, qr0; + ps_madd f11, f3, f7, f10; + psq_st f11, Vec.y(dst), 1, qr0; + ps_madd f13, f5, f7, f12; + psq_st f13, Vec.z(dst), 1, qr0; + + blr; +} + +void C_MTX44MultVecArraySR(CMtx44Ptr m, CVecPtr srcBase, VecPtr dstBase, + u32 count) { + u32 i; + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTX44MultVecArraySR(): NULL Mtx44Ptr 'm' "); + OS_DEBUG_ASSERT(srcBase, "MTX44MultVecArraySR(): NULL VecPtr 'srcBase' "); + OS_DEBUG_ASSERT(dstBase, "MTX44MultVecArraySR(): NULL VecPtr 'dstBase' "); + + for (i = 0; i 
< count; ++i) { + vTmp.x = + m[0][0] * srcBase->x + m[0][1] * srcBase->y + m[0][2] * srcBase->z; + vTmp.y = + m[1][0] * srcBase->x + m[1][1] * srcBase->y + m[1][2] * srcBase->z; + vTmp.z = + m[2][0] * srcBase->x + m[2][1] * srcBase->y + m[2][2] * srcBase->z; + + dstBase->x = vTmp.x; + dstBase->y = vTmp.y; + dstBase->z = vTmp.z; + + ++srcBase; + ++dstBase; + } +} + +asm void PSMTX44MultVecArraySR(register CMtx44Ptr m, register CVecPtr srcBase, + register VecPtr dstBase, register u32 count) { + nofralloc; + + psq_l f0, Mtx_00(m), 0, qr0; + subi count, count, 0x1; + psq_l f6, Vec.x(srcBase), 0, qr0; + ps_mul f8, f0, f6; + psq_l f2, Mtx_10(m), 0, qr0; + ps_mul f9, f2, f6; + psq_l f4, Mtx_20(m), 0, qr0; + psq_lu f7, Vec.z(srcBase), 1, qr0; + ps_mul f10, f4, f6; + psq_l f1, Mtx_02(m), 1, qr0; + mtctr count; + psq_l f3, Mtx_12(m), 1, qr0; + subi dstBase, dstBase, 0x4; + psq_l f5, Mtx_22(m), 1, qr0; + + // Don't use Vec offsets here, they're not accurate +loop: + ps_madd f11, f1, f7, f8; + psq_lu f6, 0x4(srcBase), 0, qr0; + ps_madd f12, f3, f7, f9; + ps_madd f13, f5, f7, f10; + psq_lu f7, 0x8(srcBase), 1, qr0; + ps_sum0 f11, f11, f8, f8; + psq_stu f11, 0x4(dstBase), 1, qr0; + ps_sum0 f12, f12, f9, f9; + psq_stu f12, 0x4(dstBase), 1, qr0; + ps_sum0 f13, f13, f10, f10; + psq_stu f13, 0x4(dstBase), 1, qr0; + ps_mul f8, f0, f6; + ps_mul f9, f2, f6; + ps_mul f10, f4, f6; + bdnz loop; + + ps_madd f11, f1, f7, f8; + ps_madd f12, f3, f7, f9; + ps_madd f13, f5, f7, f10; + ps_sum0 f11, f11, f8, f8; + psq_stu f11, 0x4(dstBase), 1, qr0; + ps_sum0 f12, f12, f9, f9; + psq_stu f12, 0x4(dstBase), 1, qr0; + ps_sum0 f13, f13, f10, f10; + psq_stu f13, 0x4(dstBase), 1, qr0; + + blr; +} diff --git a/src/revolution/MTX/mtxstack.c b/src/revolution/MTX/mtxstack.c new file mode 100644 index 000000000..fc5a6a5c4 --- /dev/null +++ b/src/revolution/MTX/mtxstack.c @@ -0,0 +1,127 @@ +#include +#include +#include + +#define STACK_OVERFLOW_CHECK(sPtr) \ + (u32)((sPtr->stackPtr - sPtr->stackBase) / 3) >= 
sPtr->numMtx - 1 + +void MTXInitStack(MtxStackPtr sPtr, u32 numMtx) { + OS_DEBUG_ASSERT(sPtr, "MTXInitStack(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXInitStack(): 'sPtr' contains a NULL ptr to stack memory "); + OS_DEBUG_ASSERT(numMtx != 0, "MTXInitStack(): 'numMtx' is 0 "); + + sPtr->numMtx = numMtx; + sPtr->stackPtr = NULL; +} + +MtxPtr MTXPush(MtxStackPtr sPtr, CMtxPtr m) { + OS_DEBUG_ASSERT(sPtr, "MTXPush(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXPush(): 'sPtr' contains a NULL ptr to stack memory "); + OS_DEBUG_ASSERT(m, "MTXPush(): NULL MtxPtr 'm' "); + + if (sPtr->stackPtr == NULL) { + sPtr->stackPtr = sPtr->stackBase; + MTXCopy(m, sPtr->stackPtr); + } else { + if (STACK_OVERFLOW_CHECK(sPtr)) { + OS_DEBUG_ASSERT(FALSE, "MTXPush(): stack overflow "); + } + + MTXCopy(m, sPtr->stackPtr + 3); + sPtr->stackPtr += 3; + } + + return sPtr->stackPtr; +} + +MtxPtr MTXPushFwd(MtxStackPtr sPtr, CMtxPtr m) { + OS_DEBUG_ASSERT(sPtr, "MTXPushFwd(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXPushFwd(): 'sPtr' contains a NULL ptr to stack memory "); + OS_DEBUG_ASSERT(m, "MTXPushFwd(): NULL MtxPtr 'm' "); + + if (sPtr->stackPtr == NULL) { + sPtr->stackPtr = sPtr->stackBase; + MTXCopy(m, sPtr->stackPtr); + } else { + if (STACK_OVERFLOW_CHECK(sPtr)) { + OS_DEBUG_ASSERT(FALSE, "MTXPushFwd(): stack overflow"); + } + + MTXConcat(sPtr->stackPtr, m, sPtr->stackPtr + 3); + sPtr->stackPtr += 3; + } + + return sPtr->stackPtr; +} + +MtxPtr MTXPushInv(MtxStackPtr sPtr, CMtxPtr m) { + Mtx inv; + + OS_DEBUG_ASSERT(sPtr, "MTXPushInv(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXPushInv(): 'sPtr' contains a NULL ptr to stack memory "); + OS_DEBUG_ASSERT(m, "MTXPushInv(): NULL MtxPtr 'm' "); + + MTXInverse(m, inv); + + if (sPtr->stackPtr == NULL) { + sPtr->stackPtr = sPtr->stackBase; + MTXCopy(inv, sPtr->stackPtr); + } else { + if (STACK_OVERFLOW_CHECK(sPtr)) { + OS_DEBUG_ASSERT(FALSE, 
"MTXPushInv(): stack overflow"); + } + + MTXConcat(inv, sPtr->stackPtr, sPtr->stackPtr + 3); + sPtr->stackPtr += 3; + } + + return sPtr->stackPtr; +} + +MtxPtr MTXPushInvXpose(MtxStackPtr sPtr, CMtxPtr m) { + Mtx invXpose; + + OS_DEBUG_ASSERT(sPtr, "MTXPushInvXpose(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXPushInvXpose(): 'sPtr' contains a NULL ptr to stack memory "); + OS_DEBUG_ASSERT(m, "MTXPushInvXpose(): NULL MtxPtr 'm' "); + + MTXInverse(m, invXpose); + MTXTranspose(invXpose, invXpose); + + if (sPtr->stackPtr == NULL) { + sPtr->stackPtr = sPtr->stackBase; + MTXCopy(invXpose, sPtr->stackPtr); + } else { + if (STACK_OVERFLOW_CHECK(sPtr)) { + OS_DEBUG_ASSERT(FALSE, "MTXPushInvXpose(): stack overflow "); + } + + MTXConcat(sPtr->stackPtr, invXpose, sPtr->stackPtr + 3); + sPtr->stackPtr += 3; + } + + return sPtr->stackPtr; +} + +MtxPtr MTXPop(MtxStackPtr sPtr) { + OS_DEBUG_ASSERT(sPtr, "MTXPop(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXPop(): 'sPtr' contains a NULL ptr to stack memory "); + + if (sPtr->stackPtr == NULL) { + return NULL; + } + + if (sPtr->stackBase == sPtr->stackPtr) { + sPtr->stackPtr = NULL; + return NULL; + } + + sPtr->stackPtr -= 3; + return sPtr->stackPtr; +} + +MtxPtr MTXGetStackPtr(MtxStackPtr sPtr) { + OS_DEBUG_ASSERT(sPtr, "MTXGetStackPtr(): NULL MtxStackPtr 'sPtr' "); + OS_DEBUG_ASSERT(sPtr->stackBase, "MTXGetStackPtr(): 'sPtr' contains a NULL ptr to stack memory "); + return sPtr->stackPtr; +} diff --git a/src/revolution/MTX/mtxvec.c b/src/revolution/MTX/mtxvec.c index 6218aa6f5..a0f09e5b8 100644 --- a/src/revolution/MTX/mtxvec.c +++ b/src/revolution/MTX/mtxvec.c @@ -1,46 +1,235 @@ #include +#include +#include -asm void PSMTXMultVec(register const Mtx mtx, register const Vec* vec, - register Vec* out) { - // clang-format off - nofralloc - - // Calculate X transformation (dot(mtx[0], vec)) - psq_l f0, Vec.x(vec), 0, 0 // VX, VY - psq_l f2, 0(mtx), 0, 0 // M0X, M0Y - psq_l f1, 
Vec.z(vec), 1, 0 // VZ, 1 - ps_mul f4, f2, f0 // M0X*VX, M0Y*VY - psq_l f3, 8(mtx), 0, 0 // M0Z, M0W - ps_madd f5, f3, f1, f4 // M0Z*VZ+M0X*VX, M0W+M0Y*VY - psq_l f8, 16(mtx), 0, 0 // M1X, M1Y - ps_sum0 f6, f5, f6, f5 // M0Z*VZ+M0X*VX+M0W+M0Y*VY, junk - - // Head start on Y transformation - psq_l f9, 24(mtx), 0, 0 // M1Z, M1W - ps_mul f10, f8, f0 // M1X*VX, M1Y*VY - - // tx = M0X*VX + M0Y*VY + M0Z*VZ + M0W - psq_st f6, Vec.x(out), 1, 0 - - // Calculate Y transformation (dot(mtx[1], vec)) - ps_madd f11, f9, f1, f10 // M1Z*VZ+M1X*VX, M1W+M1Y*VY - psq_l f2, 32(mtx), 0, 0 // M2X, M2Y - ps_sum0 f12, f11, f12, f11 // M1Z*VZ+M1X*VX+M1W+M1Y*VY, junk - - // Head start on Z transformation - psq_l f3, 40(mtx), 0, 0 // M2Z, M2W - ps_mul f4, f2, f0 // M2X*VX, M2Y*VY - - // ty = M1X*VX + M1Y*VY + M1Z*VZ + M1W - psq_st f12, Vec.y(out), 1, 0 - - // Calculate Z transformation (dot(mtx[2], vec)) - ps_madd f5, f3, f1, f4 // M2Z*VZ+M2X*VX, M2W+M2Y*VY - ps_sum0 f6, f5, f6, f5 // M2Z*VZ+M2X*VX+M2W+M2Y*VY, junk - - // tz = M2X*VX + M2Y*VY + M2Z*VZ + M2W - psq_st f6, Vec.z(out), 1, 0 - - blr - // clang-format on -} \ No newline at end of file +void C_MTXMultVec(CMtxPtr m, CVecPtr src, VecPtr dst) { + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTXMultVec(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(src, "MTXMultVec(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTXMultVec(): NULL VecPtr 'dst' "); + + vTmp.x = m[0][0] * src->x + m[0][1] * src->y + m[0][2] * src->z + m[0][3]; + vTmp.y = m[1][0] * src->x + m[1][1] * src->y + m[1][2] * src->z + m[1][3]; + vTmp.z = m[2][0] * src->x + m[2][1] * src->y + m[2][2] * src->z + m[2][3]; + + dst->x = vTmp.x; + dst->y = vTmp.y; + dst->z = vTmp.z; +} + +asm void PSMTXMultVec(register CMtxPtr m, register CVecPtr src, + register VecPtr dst) { + nofralloc; + + psq_l f0, Vec.x(src), 0, qr0; + psq_l f2, Mtx_00(m), 0, qr0; + psq_l f1, Vec.z(src), 1, qr0; + ps_mul f4, f2, f0; + psq_l f3, Mtx_02(m), 0, qr0; + ps_madd f5, f3, f1, f4; + psq_l f8, Mtx_10(m), 0, qr0; + ps_sum0 
f6, f5, f6, f5; + psq_l f9, Mtx_12(m), 0, qr0; + ps_mul f10, f8, f0; + psq_st f6, Vec.x(dst), 1, qr0; + ps_madd f11, f9, f1, f10; + psq_l f2, Mtx_20(m), 0, qr0; + ps_sum0 f12, f11, f12, f11; + psq_l f3, Mtx_22(m), 0, qr0; + ps_mul f4, f2, f0; + psq_st f12, Vec.y(dst), 1, qr0; + ps_madd f5, f3, f1, f4; + ps_sum0 f6, f5, f6, f5; + psq_st f6, Vec.z(dst), 1, qr0; + + blr; +} + +void C_MTXMultVecArray(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, u32 count) { + u32 i; + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTXMultVecArray(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(srcBase, "MTXMultVecArray(): NULL VecPtr 'srcBase' "); + OS_DEBUG_ASSERT(dstBase, "MTXMultVecArray(): NULL VecPtr 'dstBase' "); + OS_DEBUG_ASSERT(count > 1, "MTXMultVecArray(): count must be greater than 1."); + + for (i = 0; i < count; ++i) { + vTmp.x = m[0][0] * srcBase->x + m[0][1] * srcBase->y + + m[0][2] * srcBase->z + m[0][3]; + vTmp.y = m[1][0] * srcBase->x + m[1][1] * srcBase->y + + m[1][2] * srcBase->z + m[1][3]; + vTmp.z = m[2][0] * srcBase->x + m[2][1] * srcBase->y + + m[2][2] * srcBase->z + m[2][3]; + + dstBase->x = vTmp.x; + dstBase->y = vTmp.y; + dstBase->z = vTmp.z; + + ++srcBase; + ++dstBase; + } +} + +asm void PSMTXMultVecArray(register CMtxPtr m, register CVecPtr srcBase, + register VecPtr dstBase, register u32 count) { + nofralloc; + + psq_l f13, Mtx_00(m), 0, qr0; + psq_l f12, Mtx_10(m), 0, qr0; + subi count, count, 0x1; + psq_l f11, Mtx_02(m), 0, qr0; + ps_merge00 f0, f13, f12; + subi dstBase, dstBase, 0x4; + psq_l f10, Mtx_12(m), 0, qr0; + ps_merge11 f1, f13, f12; + mtctr count; + psq_l f4, Mtx_20(m), 0, qr0; + ps_merge00 f2, f11, f10; + psq_l f5, Mtx_22(m), 0, qr0; + ps_merge11 f3, f11, f10; + psq_l f6, Vec.x(srcBase), 0, qr0; + psq_lu f7, Vec.z(srcBase), 1, qr0; + ps_madds0 f8, f0, f6, f3; + ps_mul f9, f4, f6; + ps_madds1 f8, f1, f6, f8; + ps_madd f10, f5, f7, f9; + +// Don't use Vec offsets here, they're not accurate +loop: + psq_lu f6, 0x4(srcBase), 0, qr0; + ps_madds0 f12, f2, f7, f8; + 
psq_lu f7, 0x8(srcBase), 1, qr0; + ps_sum0 f13, f10, f9, f10; + ps_madds0 f8, f0, f6, f3; + ps_mul f9, f4, f6; + psq_stu f12, 0x4(dstBase), 0, qr0; + ps_madds1 f8, f1, f6, f8; + psq_stu f13, 0x8(dstBase), 1, qr0; + ps_madd f10, f5, f7, f9; + bdnz loop; + + ps_madds0 f12, f2, f7, f8; + ps_sum0 f13, f10, f9, f10; + psq_stu f12, 0x4(dstBase), 0, qr0; + psq_stu f13, 0x8(dstBase), 1, qr0; + + blr; +} + +/* Computes dst = (3x3 part of m) * src; column 3 of m is never read. The result is built in vTmp and only copied to dst after every read of src. */ void C_MTXMultVecSR(CMtxPtr m, CVecPtr src, VecPtr dst) { + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTXMultVecSR(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(src, "MTXMultVecSR(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(dst, "MTXMultVecSR(): NULL VecPtr 'dst' "); + + vTmp.x = m[0][0] * src->x + m[0][1] * src->y + m[0][2] * src->z; + vTmp.y = m[1][0] * src->x + m[1][1] * src->y + m[1][2] * src->z; + vTmp.z = m[2][0] * src->x + m[2][1] * src->y + m[2][2] * src->z; + + dst->x = vTmp.x; + dst->y = vTmp.y; + dst->z = vTmp.z; +} + +/* Paired-single assembly routine taking the same (m, src, dst) arguments; it ends with three psq_st stores to Vec.x, Vec.y and Vec.z of dst. NOTE(review): presumably equivalent to C_MTXMultVecSR -- confirm. */ asm void PSMTXMultVecSR(register CMtxPtr m, register CVecPtr src, + register VecPtr dst) { + nofralloc; + + psq_l f0, Mtx_00(m), 0, qr0; + psq_l f6, Vec.x(src), 0, qr0; + psq_l f2, Mtx_10(m), 0, qr0; + ps_mul f8, f0, f6; + psq_l f4, Mtx_20(m), 0, qr0; + ps_mul f10, f2, f6; + psq_l f7, Vec.z(src), 1, qr0; + ps_mul f12, f4, f6; + psq_l f3, Mtx_12(m), 0, qr0; + ps_sum0 f8, f8, f8, f8; + psq_l f5, Mtx_22(m), 0, qr0; + ps_sum0 f10, f10, f10, f10; + psq_l f1, Mtx_02(m), 0, qr0; + ps_sum0 f12, f12, f12, f12; + ps_madd f9, f1, f7, f8; + psq_st f9, Vec.x(dst), 1, qr0; + ps_madd f11, f3, f7, f10; + psq_st f11, Vec.y(dst), 1, qr0; + ps_madd f13, f5, f7, f12; + psq_st f13, Vec.z(dst), 1, qr0; + + blr; +} + +/* Applies the same 3x3 multiply to count consecutive Vec elements, advancing srcBase and dstBase by one element per iteration. */ void C_MTXMultVecArraySR(CMtxPtr m, CVecPtr srcBase, VecPtr dstBase, + u32 count) { + u32 i; + Vec vTmp; + + OS_DEBUG_ASSERT(m, "MTXMultVecArraySR(): NULL MtxPtr 'm' "); + OS_DEBUG_ASSERT(srcBase, "MTXMultVecArraySR(): NULL VecPtr 'srcBase' "); + OS_DEBUG_ASSERT(dstBase, "MTXMultVecArraySR(): NULL VecPtr 'dstBase' "); + OS_DEBUG_ASSERT(count > 1, "MTXMultVecArraySR(): 
count must be greater than 1."); + + for (i = 0; i < count; ++i) { + vTmp.x = + m[0][0] * srcBase->x + m[0][1] * srcBase->y + m[0][2] * srcBase->z; + vTmp.y = + m[1][0] * srcBase->x + m[1][1] * srcBase->y + m[1][2] * srcBase->z; + vTmp.z = + m[2][0] * srcBase->x + m[2][1] * srcBase->y + m[2][2] * srcBase->z; + + dstBase->x = vTmp.x; + dstBase->y = vTmp.y; + dstBase->z = vTmp.z; + + ++srcBase; + ++dstBase; + } +} + +/* Paired-single assembly routine with the same arguments: CTR is loaded with count - 1, and the last element is finished and stored by the instructions after the bdnz loop. */ asm void PSMTXMultVecArraySR(register CMtxPtr m, register CVecPtr srcBase, + register VecPtr dstBase, register u32 count) { + nofralloc; + + psq_l f13, Mtx_00(m), 0, qr0; + psq_l f12, Mtx_10(m), 0, qr0; + subi count, count, 0x1; + psq_l f11, Mtx_02(m), 1, qr0; + ps_merge00 f0, f13, f12; + subi dstBase, dstBase, 0x4; + psq_l f10, Mtx_12(m), 1, qr0; + ps_merge11 f1, f13, f12; + mtctr count; + psq_l f3, Mtx_20(m), 0, qr0; + ps_merge00 f2, f11, f10; + psq_l f4, Mtx_22(m), 1, qr0; + psq_l f6, Vec.x(srcBase), 0, qr0; + psq_lu f7, Vec.z(srcBase), 1, qr0; + ps_muls0 f8, f0, f6; + ps_mul f9, f3, f6; + ps_madds1 f8, f1, f6, f8; + ps_madd f10, f4, f7, f9; + +// Don't use Vec offsets here, they're not accurate +loop: + psq_lu f6, 0x4(srcBase), 0, qr0; + ps_madds0 f12, f2, f7, f8; + psq_lu f7, 0x8(srcBase), 1, qr0; + ps_sum0 f13, f10, f9, f9; + ps_muls0 f8, f0, f6; + ps_mul f9, f3, f6; + psq_stu f12, 0x4(dstBase), 0, qr0; + ps_madds1 f8, f1, f6, f8; + psq_stu f13, 0x8(dstBase), 1, qr0; + ps_madd f10, f4, f7, f9; + bdnz loop; + + ps_madds0 f12, f2, f7, f8; + ps_sum0 f13, f10, f9, f9; + psq_stu f12, 0x4(dstBase), 0, qr0; + psq_stu f13, 0x8(dstBase), 1, qr0; + + blr; +} diff --git a/src/revolution/MTX/psmtx.c b/src/revolution/MTX/psmtx.c new file mode 100644 index 000000000..c0142f85f --- /dev/null +++ b/src/revolution/MTX/psmtx.c @@ -0,0 +1,122 @@ +#include +#include + +/* Rearranges the elements of src into dst with ps_merge operations; dst is addressed through the ROMtx_* offsets. */ asm void PSMTXReorder(register CMtxPtr src, register ROMtxPtr dst) { + nofralloc; + + psq_l f0, Mtx_00(src), 0, qr0; + psq_l f2, Mtx_10(src), 0, qr0; + psq_l f4, Mtx_20(src), 0, qr0; + psq_l f1, 
Mtx_02(src), 0, qr0; + ps_merge00 f6, f0, f2; + psq_l f3, Mtx_12(src), 0, qr0; + ps_merge01 f12, f4, f0; + psq_l f5, Mtx_22(src), 0, qr0; + ps_merge11 f7, f2, f4; + psq_st f6, ROMtx_00(dst), 0, qr0; + ps_merge00 f8, f1, f3; + psq_st f12, ROMtx_02(dst), 0, qr0; + ps_merge01 f9, f5, f1; + psq_st f7, ROMtx_11(dst), 0, qr0; + ps_merge11 f10, f3, f5; + psq_st f8, ROMtx_20(dst), 0, qr0; + psq_st f9, ROMtx_22(dst), 0, qr0; + psq_st f10, ROMtx_31(dst), 0, qr0; + + blr; +} + +/* Multiplies an array of Vec by a reordered matrix, producing two results per loop iteration. f14-f18 are saved on a 0x60-byte stack frame with both stfd and psq_st because they are used as accumulators, and CTR is loaded with (count - 1) >> 1. */ asm void PSMTXROMultVecArray(register CROMtxPtr m, register CVecPtr srcBase, + register VecPtr dstBase, register u32 count) { + // Don't use Vec for offsets, they're always updated on load/store + nofralloc; + + stwu sp, -0x60(sp); + stfd f14, 0x8(sp); + psq_st f14, 0x10(sp), 0, qr0; + subi r7, count, 0x1; + stfd f15, 0x18(sp); + psq_st f15, 0x20(sp), 0, qr0; + srwi r7, r7, 0x1; + stfd f16, 0x28(sp); + psq_st f16, 0x30(sp), 0, qr0; + stfd f17, 0x38(sp); + psq_st f17, 0x40(sp), 0, qr0; + stfd f18, 0x48(sp); + psq_st f18, 0x50(sp), 0, qr0; + mtctr r7; + psq_l f0, ROMtx_00(m), 0, qr0; + subi srcBase, srcBase, 0x8; + psq_l f1, ROMtx_02(m), 1, qr0; + subi dstBase, dstBase, 0x4; + psq_l f6, ROMtx_30(m), 0, qr0; + psq_lu f8, 0x8(srcBase), 0, qr0; + psq_l f7, ROMtx_32(m), 1, qr0; + psq_lu f9, 0x8(srcBase), 0, qr0; + ps_madds0 f11, f0, f8, f6; + psq_l f2, ROMtx_10(m), 0, qr0; + ps_madds0 f12, f1, f8, f7; + psq_l f3, ROMtx_12(m), 1, qr0; + ps_madds1 f13, f0, f9, f6; + psq_lu f10, 0x8(srcBase), 0, qr0; + ps_madds1 f14, f1, f9, f7; + psq_l f5, ROMtx_22(m), 1, qr0; + ps_madds1 f11, f2, f8, f11; + ps_madds1 f12, f3, f8, f12; + psq_l f4, ROMtx_20(m), 0, qr0; + ps_madds0 f13, f2, f10, f13; + psq_lu f8, 0x8(srcBase), 0, qr0; + ps_madds0 f14, f3, f10, f14; + ps_madds0 f15, f4, f9, f11; + ps_madds0 f16, f5, f9, f12; + psq_lu f9, 0x8(srcBase), 0, qr0; + ps_madds1 f17, f4, f10, f13; + ps_madds1 f18, f5, f10, f14; + psq_lu f10, 0x8(srcBase), 0, qr0; +loop: + ps_madds0 f11, f0, f8, f6; + psq_stu f15, 0x4(dstBase), 0, qr0; + 
ps_madds0 f12, f1, f8, f7; + psq_stu f16, 0x8(dstBase), 1, qr0; + ps_madds1 f13, f0, f9, f6; + psq_stu f17, 0x4(dstBase), 0, qr0; + ps_madds1 f14, f1, f9, f7; + psq_stu f18, 0x8(dstBase), 1, qr0; + ps_madds1 f11, f2, f8, f11; + ps_madds1 f12, f3, f8, f12; + psq_lu f8, 0x8(srcBase), 0, qr0; + ps_madds0 f13, f2, f10, f13; + ps_madds0 f14, f3, f10, f14; + ps_madds0 f15, f4, f9, f11; + ps_madds0 f16, f5, f9, f12; + psq_lu f9, 0x8(srcBase), 0, qr0; + ps_madds1 f17, f4, f10, f13; + ps_madds1 f18, f5, f10, f14; + psq_lu f10, 0x8(srcBase), 0, qr0; + bdnz loop; + + psq_stu f15, 0x4(dstBase), 0, qr0; + + // It wants to format "clrlwi. r7" with no whitespace + // clang-format off + clrlwi. r7, count, 0x1f; + // clang-format on + + psq_stu f16, 0x8(dstBase), 1, qr0; + /* r7 = count & 1 from the clrlwi. above; when it is non-zero (odd count) the f17/f18 stores below are skipped */ bne epilogue; + psq_stu f17, 0x4(dstBase), 0, qr0; + psq_stu f18, 0x8(dstBase), 1, qr0; +epilogue: + psq_l f14, 0x10(sp), 0, qr0; + lfd f14, 0x8(sp); + psq_l f15, 0x20(sp), 0, qr0; + lfd f15, 0x18(sp); + psq_l f16, 0x30(sp), 0, qr0; + lfd f16, 0x28(sp); + psq_l f17, 0x40(sp), 0, qr0; + lfd f17, 0x38(sp); + psq_l f18, 0x50(sp), 0, qr0; + lfd f18, 0x48(sp); + addi sp, sp, 0x60; + blr; +} diff --git a/src/revolution/MTX/quat.c b/src/revolution/MTX/quat.c index 05bcb13c1..0473d1d35 100644 --- a/src/revolution/MTX/quat.c +++ b/src/revolution/MTX/quat.c @@ -1,175 +1,543 @@ #include +#include +#include #include -#define MY_EPSILON 1e-5f +#define EPSILON 0.00001f -DECOMP_FORCELITERAL(quat_c, MY_EPSILON, 1.0f, 0.0f); +/* Component-wise quaternion sum: r = p + q. */ void C_QUATAdd(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r) { + OS_DEBUG_ASSERT(p, "QUATAdd(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATAdd(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATAdd(): NULL QuaternionPtr 'r' "); -void PSQUATMultiply(register const Quaternion* a, register const Quaternion* b, - register Quaternion* prod) { - register f32 axy, azw; - register f32 bxy, bzw; - register f32 naxay, naxy, nazw; - register f32 work1, work2, work3, work4, work5; + r->x = p->x + 
q->x; + r->y = p->y + q->y; + r->z = p->z + q->z; + r->w = p->w + q->w; +} + +/* Paired-single version of the quaternion sum: adds the pair loaded at Quaternion.x, then the pair loaded at Quaternion.z. */ void PSQUATAdd(register CQuaternionPtr p, register CQuaternionPtr q, + register QuaternionPtr r) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + + ASM ( + psq_l tmp0, Quaternion.x(p), 0, qr0; + psq_l tmp1, Quaternion.x(q), 0, qr0; + ps_add tmp2, tmp0, tmp1; + psq_st tmp2, Quaternion.x(r), 0, qr0; + psq_l tmp3, Quaternion.z(p), 0, qr0; + psq_l tmp4, Quaternion.z(q), 0, qr0; + ps_add tmp5, tmp3, tmp4; + psq_st tmp5, Quaternion.z(r), 0, qr0; + ) +} + +/* Component-wise quaternion difference: r = p - q. */ void C_QUATSubtract(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r) { + OS_DEBUG_ASSERT(p, "QUATSubtract(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATSubtract(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATSubtract(): NULL QuaternionPtr 'r' "); + + r->x = p->x - q->x; + r->y = p->y - q->y; + r->z = p->z - q->z; + r->w = p->w - q->w; +} + +/* Paired-single version of the quaternion difference: same load/store layout as PSQUATAdd, using ps_sub. */ void PSQUATSubtract(register CQuaternionPtr p, register CQuaternionPtr q, + register QuaternionPtr r) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; ASM ( - // Load qA components - psq_l axy, Quaternion.x(a), 0, 0 // AX, AY - psq_l azw, Quaternion.z(a), 0, 0 // AZ, AW - // Load qB components - psq_l bxy, Quaternion.x(b), 0, 0 // BX, BY - psq_l bzw, Quaternion.z(b), 0, 0 // BZ, BW - - // Negate copy of qA components - ps_neg naxy, axy // -AX, -AY - ps_neg nazw, azw // -AZ, -AW - - // Compute parts of product - ps_muls0 work1, azw, bxy // AZ*BX, AW*BX - ps_merge01 naxay, naxy, axy // -AX, AY - ps_merge01 work2, nazw, azw // -AZ, AW - ps_muls0 work3, naxy, bxy // -AX*BX, -AY*BX - ps_muls1 work4, naxay, bxy // -AX*BY, AY*BY - ps_madds0 work1, naxay, bzw, work1 // -AX*BZ+AZ*BX, AY*BZ+AW*BX - ps_muls1 work5, work2, bxy // -AZ*BY, AW*BY - ps_madds0 work3, work2, bzw, work3 // -AZ*BZ-AX*BX, AW*BZ-AY*BX - ps_merge10 work1, work1, 
work1 // AY*BZ+AW*BX, -AX*BZ+AZ*BX - ps_madds1 work5, axy, bzw, work5 // -AX*BW-AZ*BY, -AY*BW+AW*BY - ps_merge10 work3, work3, work3 // AW*BZ-AY*BX, -AZ*BZ-AX*BX - ps_madds1 work4, nazw, bzw, work4 // -AZ*BW-AX*BY, -AW*BW+AY*BY - - // Put everything together - ps_add work1, work1, work5 // AY*BZ+AW*BX-AX*BW-AZ*BY, -AX*BZ+AZ*BX-AY*BW+AW*BY - ps_sub work3, work3, work4 // AW*BZ-AY*BX+AZ*BW-AX*BY, -AZ*BZ-AX*BX+AW*BW+AY*BY - - // Store result - psq_st work1, Quaternion.x(prod), 0, 0 - psq_st work3, Quaternion.z(prod), 0, 0 + psq_l tmp0, Quaternion.x(p), 0, qr0; + psq_l tmp1, Quaternion.x(q), 0, qr0; + ps_sub tmp2, tmp0, tmp1; + psq_st tmp2, Quaternion.x(r), 0, qr0; + psq_l tmp3, Quaternion.z(p), 0, qr0; + psq_l tmp4, Quaternion.z(q), 0, qr0; + ps_sub tmp5, tmp3, tmp4; + psq_st tmp5, Quaternion.z(r), 0, qr0; + ) +} + +/* Quaternion product pq = p * q. When pq aliases p or q the result is built in rTmp and copied out at the end, so the inputs are not overwritten while they are still being read. */ void C_QUATMultiply(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr pq) { + QuaternionPtr r; + Quaternion rTmp; + + OS_DEBUG_ASSERT(p, "QUATMultiply(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATMultiply(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(pq, "QUATMultiply(): NULL QuaternionPtr 'pq' "); + + if (p == pq || q == pq) { + r = &rTmp; + } else { + r = pq; + } + + r->w = p->w * q->w - p->x * q->x - p->y * q->y - p->z * q->z; + r->x = p->w * q->x + p->x * q->w + p->y * q->z - p->z * q->y; + r->y = p->w * q->y + p->y * q->w + p->z * q->x - p->x * q->z; + r->z = p->w * q->z + p->z * q->w + p->x * q->y - p->y * q->x; + + if (r == &rTmp) { + *pq = rTmp; + } +} + +/* Paired-single quaternion product: both halves of p and q are loaded into registers before the two stores to pq. NOTE(review): presumably equivalent to C_QUATMultiply -- confirm. */ void PSQUATMultiply(register CQuaternionPtr p, register CQuaternionPtr q, + register QuaternionPtr pq) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 tmp9; + register f32 tmp10; + + ASM ( + psq_l tmp0, Quaternion.x(p), 0, qr0; + psq_l tmp1, Quaternion.z(p), 0, qr0; + psq_l tmp2, Quaternion.x(q), 0, qr0; + ps_neg tmp5, tmp0; + 
psq_l tmp3, Quaternion.z(q), 0, qr0; + ps_neg tmp6, tmp1; + ps_merge01 tmp4, tmp5, tmp0; + ps_muls0 tmp7, tmp1, tmp2; + ps_muls0 tmp5, tmp5, tmp2; + ps_merge01 tmp8, tmp6, tmp1; + ps_muls1 tmp9, tmp4, tmp2; + ps_madds0 tmp7, tmp4, tmp3, tmp7; + ps_muls1 tmp10, tmp8, tmp2; + ps_madds0 tmp5, tmp8, tmp3, tmp5; + ps_madds1 tmp9, tmp6, tmp3, tmp9; + ps_merge10 tmp7, tmp7, tmp7; + ps_madds1 tmp10, tmp0, tmp3, tmp10; + ps_merge10 tmp5, tmp5, tmp5; + ps_add tmp7, tmp7, tmp10; + psq_st tmp7, Quaternion.x(pq), 0, qr0; + ps_sub tmp5, tmp5, tmp9; + psq_st tmp5, Quaternion.z(pq), 0, qr0; ) } -void PSQUATNormalize(register const Quaternion* in, register Quaternion* out) { - register f32 xy, zw; - register f32 xy2, dot; - register f32 work0, work1, work2, work3; - register f32 c_epsilon, c_half, c_three; +/* Component-wise scaling: r = q * scale. */ void C_QUATScale(CQuaternionPtr q, QuaternionPtr r, f32 scale) { + OS_DEBUG_ASSERT(q, "QUATScale(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATScale(): NULL QuaternionPtr 'r' "); - c_epsilon = MY_EPSILON; - c_half = 0.5f; - c_three = 3.0f; + r->x = q->x * scale; + r->y = q->y * scale; + r->z = q->z * scale; + r->w = q->w * scale; +} + +/* Paired-single scaling: ps_muls0 multiplies each loaded pair of q by mult before it is stored to r. */ void PSQUATScale(register CQuaternionPtr q, register QuaternionPtr r, + register f32 mult) { + register f32 tmp0; + register f32 tmp1; ASM ( - // Load quaternion components - psq_l xy, Quaternion.x(in), 0, 0 - psq_l zw, Quaternion.z(in), 0, 0 - - // Compute dot product with self - ps_mul xy2, xy, xy // X^2, Y^2 - ps_madd dot, zw, zw, xy2 // Z^2+X^2, W^2+Y^2 - ps_sum0 dot, dot, dot, dot // Z^2+X^2+W^2+Y^2, junk - - // Reciprocal square root - frsqrte work0, dot - - // Refine estimate using Newton-Raphson method - // y = 1 / sqrt(x) - fmul work3, work0, work0 // rsqrt^2 - fmul work0, work0, c_half // rsqrt * 0.5 - fnmsub work3, work3, dot, c_three // (3 - x * rsqrt^2) - fmul work0, work3, work0 // (3 - x * rsqrt^2) * (rsqrt * 0.5) - - // Set magnitude to zero if too small - // (dot - epsilon > zero) ? 
rsqrt : zero - ps_sub work1, dot, c_epsilon - ps_sub work2, c_epsilon, c_epsilon - ps_sel work0, work1, work0, work2 - - // Scale components to normalize - ps_muls0 xy, xy, work0 - ps_muls0 zw, zw, work0 - - // Store result - psq_st xy, Quaternion.x(out), 0, 0 - psq_st zw, Quaternion.z(out), 0, 0 + psq_l tmp0, Quaternion.x(q), 0, qr0; + psq_l tmp1, Quaternion.z(q), 0, qr0; + ps_muls0 tmp0, tmp0, mult; + psq_st tmp0, Quaternion.x(r), 0, qr0; + ps_muls0 tmp1, tmp1, mult; + psq_st tmp1, Quaternion.z(r), 0, qr0; ) } -void C_QUATMtx(Quaternion* quat, const Mtx mtx) { - f32 root, trace; - u32 dmax, dnext, dlast; - u32 next[3] = {1, 2, 0}; - f32 temp[3]; +/* Returns the four-component dot product of p and q. */ f32 C_QUATDotProduct(CQuaternionPtr p, CQuaternionPtr q) { + OS_DEBUG_ASSERT(p, "QUATDotProduct(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATDotProduct(): NULL QuaternionPtr 'q' "); + return q->x * p->x + q->y * p->y + q->z * p->z + q->w * p->w; +} + +/* Paired-single dot product: multiplies the pairs, accumulates with ps_madd, and combines the two lanes with ps_sum0 into the returned tmp4. */ f32 PSQUATDotProduct(register CQuaternionPtr p, register CQuaternionPtr q) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; - trace = mtx[0][0] + mtx[1][1] + mtx[2][2]; + ASM ( + psq_l tmp0, Quaternion.x(p), 0, qr0; + psq_l tmp2, Quaternion.x(q), 0, qr0; + ps_mul tmp4, tmp0, tmp2; + psq_l tmp1, Quaternion.z(p), 0, qr0; + psq_l tmp3, Quaternion.z(q), 0, qr0; + ps_madd tmp4, tmp1, tmp3, tmp4; + ps_sum0 tmp4, tmp4, tmp4, tmp4; + ) - if (trace > 0.0f) { - root = sqrtf(1.0f + trace); - quat->w = root * 0.5f; + return tmp4; +} - root = 0.5f / root; - quat->x = root * (mtx[2][1] - mtx[1][2]); - quat->y = root * (mtx[0][2] - mtx[2][0]); - quat->z = root * (mtx[1][0] - mtx[0][1]); +/* Writes src scaled by 1 / sqrt(squared norm) to unit; when the squared norm is below EPSILON, unit is set to all zeros instead. */ void C_QUATNormalize(CQuaternionPtr src, QuaternionPtr unit) { + f32 norm; + + OS_DEBUG_ASSERT(src, "QUATNormalize(): NULL QuaternionPtr 'src' "); + OS_DEBUG_ASSERT(unit, "QUATNormalize(): NULL QuaternionPtr 'unit' "); + + norm = + src->x * src->x + src->y * src->y + src->z * src->z + src->w * src->w; + if (norm >= EPSILON) { + norm = 
1.0f / sqrtf(norm); + unit->x = src->x * norm; + unit->y = src->y * norm; + unit->z = src->z * norm; + unit->w = src->w * norm; } else { - dmax = 0; + unit->x = unit->y = unit->z = unit->w = 0.0f; + } +} - if (mtx[1][1] > mtx[dmax][dmax]) { - dmax = 1; - } +/* Paired-single normalize. The frsqrte estimate is refined once using c_half and c_three; ps_sel then keeps that scale when (squared norm - EPSILON) is non-negative and otherwise substitutes tmp5, which is EPSILON - EPSILON. */ void PSQUATNormalize(register CQuaternionPtr src, register QuaternionPtr unit) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4 = EPSILON; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 c_half = 0.5f; + register f32 c_three = 3.0f; + + ASM ( + psq_l tmp0, Quaternion.x(src), 0, qr0; + ps_mul tmp2, tmp0, tmp0; + psq_l tmp1, Quaternion.z(src), 0, qr0; + ps_sub tmp5, tmp4, tmp4; + ps_madd tmp2, tmp1, tmp1, tmp2; + ps_sum0 tmp2, tmp2, tmp2, tmp2; + frsqrte tmp3, tmp2; + ps_sub tmp4, tmp2, tmp4; + fmul tmp6, tmp3, tmp3; + fmul tmp3, tmp3, c_half; + fnmsub tmp6, tmp6, tmp2, c_three; + fmul tmp3, tmp6, tmp3; + ps_sel tmp3, tmp4, tmp3, tmp5; + ps_muls0 tmp7, tmp0, tmp3; + ps_muls0 tmp8, tmp1, tmp3; + psq_st tmp7, Quaternion.x(unit), 0, qr0; + psq_st tmp8, Quaternion.z(unit), 0, qr0; + ) +} + +/* inv = conjugate(src) / squared norm. A squared norm of exactly 0.0f is replaced by 1.0f so the division is always defined. */ void C_QUATInverse(CQuaternionPtr src, QuaternionPtr inv) { + f32 norm; + f32 mult; + + OS_DEBUG_ASSERT(src, "QUATInverse(): NULL QuaternionPtr 'src' "); + OS_DEBUG_ASSERT(inv, "QUATInverse(): NULL QuaternionPtr 'inv' "); + + norm = + src->x * src->x + src->y * src->y + src->z * src->z + src->w * src->w; + if (norm == 0.0f) { + norm = 1.0f; + } + mult = 1.0f / norm; + + inv->x = -src->x * mult; + inv->y = -src->y * mult; + inv->z = -src->z * mult; + inv->w = src->w * mult; +} + +/* Paired-single inverse: tmp10 starts at 1.0f and is only replaced by a refined fres reciprocal when the squared norm does not compare equal to zero. */ void PSQUATInverse(register CQuaternionPtr src, register QuaternionPtr inv) { + register f32 tmp0; + register f32 tmp1; + register f32 tmp2; + register f32 tmp3; + register f32 tmp4; + register f32 tmp5; + register f32 tmp6; + register f32 tmp7; + register f32 tmp8; + register f32 tmp9; + register f32 tmp10 = 1.0f; - if (mtx[2][2] > mtx[dmax][dmax]) { 
- dmax = 2; + ASM ( + psq_l tmp0, Quaternion.x(src), 0, qr0; + ps_mul tmp5, tmp0, tmp0; + ps_sub tmp9, tmp10, tmp10; + psq_l tmp1, Quaternion.z(src), 0, qr0; + ps_madd tmp5, tmp1, tmp1, tmp5; + ps_add tmp8, tmp10, tmp10; + ps_sum0 tmp5, tmp5, tmp5, tmp5; + fcmpu cr0, tmp5, tmp9; + beq branch; + fres tmp10, tmp5; + ps_neg tmp6, tmp5; + ps_nmsub tmp5, tmp5, tmp10, tmp8; + ps_mul tmp10, tmp10, tmp5; + b branch; + branch: + ps_neg tmp7, tmp10; + ps_muls1 tmp2, tmp10, tmp1; + ps_muls0 tmp3, tmp0, tmp7; + psq_st tmp2, Quaternion.w(inv), 1, qr0; + ps_muls0 tmp4, tmp1, tmp7; + psq_st tmp3, Quaternion.x(inv), 0, qr0; + psq_st tmp4, Quaternion.z(inv), 1, qr0; + ) +} + +/* r = inverse(q) * p; note that the inverse is passed as the left operand of C_QUATMultiply. */ void C_QUATDivide(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r) { + Quaternion tmp; + + OS_DEBUG_ASSERT(p, "QUATDivide(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATDivide(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATDivide(): NULL QuaternionPtr 'r' "); + + C_QUATInverse(q, &tmp); + C_QUATMultiply(&tmp, p, r); +} + +/* Same composition as C_QUATDivide, built from the PS routines and without the debug asserts. */ void PSQUATDivide(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r) { + Quaternion tmp; + PSQUATInverse(q, &tmp); + PSQUATMultiply(&tmp, p, r); +} + +/* Quaternion exponential of q, which the debug assert requires to have w == 0: r = (v * sin(mag) / mag, cos(mag)), where the sin(mag) / mag factor is left at 1.0f when mag is not above EPSILON. */ void C_QUATExp(CQuaternionPtr q, QuaternionPtr r) { + f32 mag; + f32 sinmag; + + OS_DEBUG_ASSERT(q, "QUATExp(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATExp(): NULL QuaternionPtr 'r' "); + OS_DEBUG_ASSERT(q->w == 0.0f, "QUATExp(): 'q' is not a pure quaternion. 
"); + + mag = sqrtf(q->x * q->x + q->y * q->y + q->z * q->z); + sinmag = 1.0f; + if (mag > EPSILON) { + sinmag = sinf(mag) / mag; + } + + // \frac{\vec{v}}{\|\vec{v}\|} \sin\|\vec{v}\| + // \implies \vec{v} \frac{\sin\|\vec{v}\|}{\|\vec{v}\|} + // Instead of normalizing and then scaling, we only need one scalar + r->x = sinmag * q->x; + r->y = sinmag * q->y; + r->z = sinmag * q->z; + r->w = cosf(mag); +} + +/* Quaternion natural log: scales (x, y, z) by atan2f(mag, w) / mag, where mag is the vector-part length (the factor stays 0 when mag is 0), and sets r->w to 0. */ void C_QUATLogN(CQuaternionPtr q, QuaternionPtr r) { + f32 mag; + f32 tmp; + + OS_DEBUG_ASSERT(q, "QUATLogN(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATLogN(): NULL QuaternionPtr 'r' "); + + mag = q->x * q->x + q->y * q->y + q->z * q->z; + + // Release doesn't quite optimize this out +#ifndef NDEBUG + // We also need this local variable scoped in its own block for debug + { + // Broken, should be w^2, not z^2 + f32 magCheck = mag + q->z * q->z; + if (magCheck < 1.0f - EPSILON || magCheck > 1.0f + EPSILON) { + // #if 0 doesn't have same codegen + (void)0; } + } +#endif - dnext = next[dmax]; - dlast = next[dnext]; + mag = sqrtf(mag); + tmp = atan2f(mag, q->w); + if (mag > 0.0f) { + mag = tmp / mag; + } - root = sqrtf(mtx[dmax][dmax] - (mtx[dnext][dnext] + mtx[dlast][dlast]) + - 1.0f); - temp[dmax] = 0.5f * root; + r->x = mag * q->x; + r->y = mag * q->y; + r->z = mag * q->z; + r->w = 0.0f; +} - if (0.0f != root) { - root = 0.5f / root; +/* Copies q to r, negating every component when dot(q, qto) is negative. */ void C_QUATMakeClosest(CQuaternionPtr q, CQuaternionPtr qto, QuaternionPtr r) { + f32 dot; + + OS_DEBUG_ASSERT(q, "QUATMakeClosest(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(qto, "QUATMakeClosest(): NULL QuaternionPtr 'qto' "); + OS_DEBUG_ASSERT(r, "QUATMakeClosest(): NULL QuaternionPtr 'r' "); + + dot = q->x * qto->x + q->y * qto->y + q->z * qto->z + q->w * qto->w; + if (dot < 0.0f) { + r->x = -q->x; + r->y = -q->y; + r->z = -q->z; + r->w = -q->w; + } else { + *r = *q; + } +} + +/* Builds r from a normalized copy of axis and the half angle rad * 0.5: r = (sin(half) * unit, cos(half)). */ void C_QUATRotAxisRad(QuaternionPtr r, CVecPtr axis, f32 rad) { + Vec unit; + f32 halfA; + f32 sinA; + f32 cosA; + + OS_DEBUG_ASSERT(r, 
"QUATRotAxisRad(): NULL QuaternionPtr 'r' "); + OS_DEBUG_ASSERT(axis, "QUATRotAxisRad(): NULL VecPtr 'axis' "); + + VECNormalize(axis, &unit); + halfA = rad * 0.5f; + sinA = sinf(halfA); + cosA = cosf(halfA); + + r->x = sinA * unit.x; + r->y = sinA * unit.y; + r->z = sinA * unit.z; + r->w = cosA; +} + +/* Extracts a quaternion from the 3x3 part of m. A positive trace uses the direct formula; otherwise the index of the largest diagonal entry (tmp3) is the pivot and the table tmp0 = {1, 2, 0} supplies the following two axes (tmp4, tmp5). */ void C_QUATMtx(QuaternionPtr r, CMtxPtr m) { + s32 tmp0[3] = {1, 2, 0}; + f32 tmp1; + f32 tmp2; + s32 tmp3; + s32 tmp4; + s32 tmp5; + f32 tmp6[3]; + + OS_DEBUG_ASSERT(r, "QUATMtx(): NULL QuaternionPtr 'r' "); + OS_DEBUG_ASSERT(m, "QUATMtx(): NULL MtxPtr 'm' "); + + tmp1 = m[0][0] + m[1][1] + m[2][2]; + if (tmp1 > 0.0f) { + tmp2 = sqrtf(tmp1 + 1.0f); + r->w = tmp2 * 0.5f; + tmp2 = 0.5f / tmp2; + r->x = (m[2][1] - m[1][2]) * tmp2; + r->y = (m[0][2] - m[2][0]) * tmp2; + r->z = (m[1][0] - m[0][1]) * tmp2; + } else { + tmp3 = 0; + if (m[1][1] > m[0][0]) { + tmp3 = 1; + } + if (m[2][2] > m[tmp3][tmp3]) { + tmp3 = 2; } + tmp4 = tmp0[tmp3]; + tmp5 = tmp0[tmp4]; + tmp2 = sqrtf(m[tmp3][tmp3] - (m[tmp4][tmp4] + m[tmp5][tmp5]) + 1.0f); + tmp6[tmp3] = tmp2 * 0.5f; - quat->w = root * (mtx[dlast][dnext] - mtx[dnext][dlast]); - temp[dnext] = root * (mtx[dmax][dnext] + mtx[dnext][dmax]); - temp[dlast] = root * (mtx[dmax][dlast] + mtx[dlast][dmax]); + if (tmp2 != 0.0f) { + tmp2 = 0.5f / tmp2; + } - quat->x = temp[0]; - quat->y = temp[1]; - quat->z = temp[2]; + r->w = (m[tmp5][tmp4] - m[tmp4][tmp5]) * tmp2; + tmp6[tmp4] = (m[tmp3][tmp4] + m[tmp4][tmp3]) * tmp2; + tmp6[tmp5] = (m[tmp3][tmp5] + m[tmp5][tmp3]) * tmp2; + r->x = tmp6[0]; + r->y = tmp6[1]; + r->z = tmp6[2]; } } -void C_QUATSlerp(const Quaternion* a, const Quaternion* b, Quaternion* out, - f32 t) { - f32 dot; - f32 coeffa, coeffb; - f32 theta, sintheta; +/* Component-wise linear interpolation: r = p + (q - p) * t. The result is not normalized here. */ void C_QUATLerp(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r, f32 t) { + OS_DEBUG_ASSERT(p, "QUATLerp(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATLerp(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATLerp(): NULL QuaternionPtr 'r' "); - dot = a->x * 
b->x + a->y * b->y + a->z * b->z + a->w * b->w; - coeffb = 1.0f; + r->x = p->x + (q->x - p->x) * t; + r->y = p->y + (q->y - p->y) * t; + r->z = p->z + (q->z - p->z) * t; + r->w = p->w + (q->w - p->w) * t; +} - if (dot < 0) { +/* Spherical interpolation from p to q by t. The weight on q is negated when dot(p, q) is negative, and plain linear weights (1 - t, t) are used when the absolute dot exceeds 1 - EPSILON. */ void C_QUATSlerp(CQuaternionPtr p, CQuaternionPtr q, QuaternionPtr r, f32 t) { + f32 dot; + f32 pt; + f32 qt; + f32 tmp0; + f32 tmp1; + + OS_DEBUG_ASSERT(p, "QUATSlerp(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(q, "QUATSlerp(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATSlerp(): NULL QuaternionPtr 'r' "); + + dot = p->x * q->x + p->y * q->y + p->z * q->z + p->w * q->w; + qt = 1.0f; + if (dot < 0.0f) { dot = -dot; - coeffb = -coeffb; + qt = -qt; } - if (dot <= 1.0f - MY_EPSILON) { - theta = acosf(dot); - sintheta = sinf(theta); - - coeffa = sinf((1.0f - t) * theta) / sintheta; - coeffb *= sinf(t * theta) / sintheta; + if (dot <= 1.0f - EPSILON) { + tmp0 = acosf(dot); + tmp1 = sinf(tmp0); + pt = sinf((1.0f - t) * tmp0) / tmp1; + qt *= sinf(t * tmp0) / tmp1; } else { - coeffa = 1.0f - t; - coeffb *= t; + pt = 1.0f - t; + qt *= t; } - out->x = coeffa * a->x + coeffb * b->x; - out->y = coeffa * a->y + coeffb * b->y; - out->z = coeffa * a->z + coeffb * b->z; - out->w = coeffa * a->w + coeffb * b->w; + r->x = pt * p->x + qt * q->x; + r->y = pt * p->y + qt * q->y; + r->z = pt * p->z + qt * q->z; + r->w = pt * p->w + qt * q->w; +} + +/* Slerps p to q and a to b by t, then slerps between those two intermediate results by 2t(1 - t) and writes the outcome to r. */ void C_QUATSquad(CQuaternionPtr p, CQuaternionPtr a, CQuaternionPtr b, + CQuaternionPtr q, QuaternionPtr r, f32 t) { + f32 tmp0; + Quaternion tmp1; + Quaternion tmp2; + + OS_DEBUG_ASSERT(p, "QUATSquad(): NULL QuaternionPtr 'p' "); + OS_DEBUG_ASSERT(a, "QUATSquad(): NULL QuaternionPtr 'a' "); + OS_DEBUG_ASSERT(b, "QUATSquad(): NULL QuaternionPtr 'b' "); + OS_DEBUG_ASSERT(q, "QUATSquad(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(r, "QUATSquad(): NULL QuaternionPtr 'r' "); + + tmp0 = t * 2.0f * (1.0f - t); + C_QUATSlerp(p, q, &tmp1, t); + C_QUATSlerp(a, b, &tmp2, t); + C_QUATSlerp(&tmp1, &tmp2, r, tmp0); +} 
+ +/* Computes a = q * Exp(-0.25 * (LogN(inverse(q) * qnext) + LogN(inverse(q) * qprev))), using the C_QUAT* helpers for each step. */ void C_QUATCompA(CQuaternionPtr qprev, CQuaternionPtr q, CQuaternionPtr qnext, + QuaternionPtr a) { + Quaternion tmp0; + Quaternion tmp1; + Quaternion tmp2; + Quaternion tmp3; + Quaternion tmp4; + Quaternion tmp5; + + OS_DEBUG_ASSERT(qprev, "QUATCompA(): NULL QuaternionPtr 'qprev' "); + OS_DEBUG_ASSERT(q, "QUATCompA(): NULL QuaternionPtr 'q' "); + OS_DEBUG_ASSERT(qnext, "QUATCompA(): NULL QuaternionPtr 'qnext' "); + OS_DEBUG_ASSERT(a, "QUATCompA(): NULL QuaternionPtr 'a' "); + + C_QUATDivide(qprev, q, &tmp0); + C_QUATLogN(&tmp0, &tmp2); + C_QUATDivide(qnext, q, &tmp1); + C_QUATLogN(&tmp1, &tmp3); + C_QUATAdd(&tmp3, &tmp2, &tmp4); + C_QUATScale(&tmp4, &tmp4, -0.25f); + C_QUATExp(&tmp4, &tmp5); + C_QUATMultiply(q, &tmp5, a); } diff --git a/src/revolution/MTX/vec.c b/src/revolution/MTX/vec.c index 4b6c35022..a5f4a8a7e 100644 --- a/src/revolution/MTX/vec.c +++ b/src/revolution/MTX/vec.c @@ -1,240 +1,372 @@ #include +#include +#include -asm void PSVECAdd(register const Vec* a, register const Vec* b, - register Vec* sum) { - // clang-format off - nofralloc +/* Component-wise vector sum: ab = a + b. */ void C_VECAdd(CVecPtr a, CVecPtr b, VecPtr ab) { + OS_DEBUG_ASSERT(a, "VECAdd(): NULL VecPtr 'a' "); + OS_DEBUG_ASSERT(b, "VECAdd(): NULL VecPtr 'b' "); + OS_DEBUG_ASSERT(ab, "VECAdd(): NULL VecPtr 'ab' "); - // Sum X,Y components - psq_l f2, Vec.x(a), 0, 0 - psq_l f4, Vec.x(b), 0, 0 - ps_add f6, f2, f4 + ab->x = a->x + b->x; + ab->y = a->y + b->y; + ab->z = a->z + b->z; +} - // Store result - psq_st f6, Vec.x(sum), 0, 0 +/* Paired-single vector sum: adds the pair loaded at Vec.x, then the value loaded at Vec.z. */ asm void PSVECAdd(register CVecPtr vec1, register CVecPtr vec2, + register VecPtr dst) { + psq_l f2, Vec.x(vec1), 0, qr0; + psq_l f4, Vec.x(vec2), 0, qr0; + ps_add f6, f2, f4; + psq_st f6, Vec.x(dst), 0, qr0; + psq_l f3, Vec.z(vec1), 1, qr0; + psq_l f5, Vec.z(vec2), 1, qr0; + ps_add f7, f3, f5; + psq_st f7, Vec.z(dst), 1, qr0; +} - // Sum Z component - psq_l f3, Vec.z(a), 1, 0 - psq_l f5, Vec.z(b), 1, 0 - ps_add f7, f3, f5 +void C_VECSubtract(CVecPtr a, CVecPtr b, VecPtr a_b) { + 
OS_DEBUG_ASSERT(a, "VECSubtract(): NULL VecPtr 'a' "); + OS_DEBUG_ASSERT(b, "VECSubtract(): NULL VecPtr 'b' "); + OS_DEBUG_ASSERT(a_b, "VECSubtract(): NULL VecPtr 'a_b' "); - // Store result - psq_st f7, Vec.z(sum), 1, 0 + a_b->x = a->x - b->x; + a_b->y = a->y - b->y; + a_b->z = a->z - b->z; +} - blr - // clang-format on +/* Paired-single vector difference: same load/store layout as PSVECAdd, using ps_sub. */ asm void PSVECSubtract(register CVecPtr vec1, register CVecPtr vec2, + register VecPtr dst) { + psq_l f2, Vec.x(vec1), 0, qr0; + psq_l f4, Vec.x(vec2), 0, qr0; + ps_sub f6, f2, f4; + psq_st f6, Vec.x(dst), 0, qr0; + psq_l f3, Vec.z(vec1), 1, qr0; + psq_l f5, Vec.z(vec2), 1, qr0; + ps_sub f7, f3, f5; + psq_st f7, Vec.z(dst), 1, qr0; } -void PSVECScale(register const Vec* in, register Vec* out, register f32 scale) { - register f32 xy, z; - register f32 sxy, sz; +/* Component-wise scaling: dst = src * scale. */ void C_VECScale(CVecPtr src, VecPtr dst, f32 scale) { + OS_DEBUG_ASSERT(src, "VECScale(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(dst, "VECScale(): NULL VecPtr 'dst' "); - ASM ( - // Load components - psq_l xy, Vec.x(in), 0, 0 - psq_l z, Vec.z(in), 1, 0 + dst->x = src->x * scale; + dst->y = src->y * scale; + dst->z = src->z * scale; +} - // Scale components - ps_muls0 sxy, xy, scale - ps_muls0 sz, z, scale +/* Paired-single scaling: ps_muls0 multiplies the loaded xy pair and the z value by mult. */ void PSVECScale(register CVecPtr src, register VecPtr dst, register f32 mult) { + register f32 vxy; + register f32 vz; + register f32 rxy; + register f32 rz; - // Store result - psq_st sxy, Vec.x(out), 0, 0 - psq_st sz, Vec.z(out), 1, 0 + ASM ( + psq_l vxy, Vec.x(src), 0, qr0; + psq_l vz, Vec.z(src), 1, qr0; + ps_muls0 rxy, vxy, mult; + psq_st rxy, Vec.x(dst), 0, qr0; + ps_muls0 rz, vz, mult; + psq_st rz, Vec.z(dst), 1, qr0; ) } -void PSVECNormalize(register const Vec* in, register Vec* out) { - register f32 c_half, c_three; - register f32 xy, z; - register f32 z2; - register f32 dot; - register f32 work0, work1, work2; +/* unit = src / length(src). A zero-length src is only checked by OS_DEBUG_ASSERT; no other guard precedes the 1.0f / sqrtf(mag) division. */ void C_VECNormalize(CVecPtr src, VecPtr unit) { + f32 mag; - c_half = 0.5f; - c_three = 3.0f; + OS_DEBUG_ASSERT(src, "VECNormalize(): NULL VecPtr 'src' "); + 
OS_DEBUG_ASSERT(unit, "VECNormalize(): NULL VecPtr 'unit' "); + + mag = src->x * src->x + src->y * src->y + src->z * src->z; + OS_DEBUG_ASSERT(mag != 0.0f, "VECNormalize(): zero magnitude vector "); + + mag = 1.0f / sqrtf(mag); + unit->x = src->x * mag; + unit->y = src->y * mag; + unit->z = src->z * mag; +} + +/* Paired-single normalize: refines the frsqrte estimate once with c_half and c_three, then scales src by it into unit. This routine contains no zero-magnitude check. */ void PSVECNormalize(register CVecPtr src, register VecPtr unit) { + register f32 c_half = 0.5f; + register f32 c_three = 3.0f; + register f32 v1_xy; + register f32 v1_z; + register f32 xx_zz; + register f32 xx_yy; + register f32 sqsum; + register f32 rsqrt; + register f32 nwork0; + register f32 nwork1; ASM ( - // Load vector components - psq_l xy, Vec.x(in), 0, 0 - psq_l z, Vec.z(in), 1, 0 - - // Compute dot product with self - ps_mul work0, xy, xy // X^2, Y^2 - ps_madd z2, z, z, work0 // Z^2+X^2, junk - ps_sum0 dot, z2, z, work0 // Z^2+X^2+Y^2, junk - - // Reciprocal square root - frsqrte work0, dot - - // Refine estimate using Newton-Raphson method - // y = 1 / sqrt(x) - fmuls work1, work0, work0 // rsqrt^2 - fmuls work2, work0, c_half // rsqrt * 0.5 - fnmsubs work1, work1, dot, c_three // (3 - x * rsqrt^2) - fmuls work0, work1, work2 // (3 - x * rsqrt^2) * (rsqrt * 0.5) - - // Scale components to normalize - ps_muls0 xy, xy, work0 - ps_muls0 z, z, work0 - - // Store result - psq_st xy, Vec.x(out), 0, 0 - psq_st z, Vec.z(out), 1, 0 + psq_l v1_xy, Vec.x(src), 0, qr0; + ps_mul xx_yy, v1_xy, v1_xy; + psq_l v1_z, Vec.z(src), 1, qr0; + ps_madd xx_zz, v1_z, v1_z, xx_yy; + ps_sum0 sqsum, xx_zz, v1_z, xx_yy; + frsqrte rsqrt, sqsum; + fmuls nwork0, rsqrt, rsqrt; + fmuls nwork1, rsqrt, c_half; + fnmsubs nwork0, nwork0, sqsum, c_three; + fmuls rsqrt, nwork0, nwork1; + ps_muls0 v1_xy, v1_xy, rsqrt; + psq_st v1_xy, Vec.x(unit), 0, qr0; + ps_muls0 v1_z, v1_z, rsqrt; + psq_st v1_z, Vec.z(unit), 1, qr0; ) } -f32 PSVECMag(register const Vec* v) { - register f32 xy, xy2; - register f32 z, z2; - register f32 dot; - register f64 rsqrt; - register f32 work0, work1; - register 
f32 c_three, c_half, c_zero; +/* Returns x^2 + y^2 + z^2 of v. NOTE(review): the assert text names VECMag() rather than VECSquareMag(); presumably this mirrors the original binary -- confirm before changing it. */ f32 C_VECSquareMag(CVecPtr v) { + f32 sqmag; // The DWARF build we have is outdated, this name is added - c_half = 0.5f; - ASM ( - // Load vector components - psq_l xy, Vec.x(v), 0, 0 - lfs z, Vec.z(v) + OS_DEBUG_ASSERT(v, "VECMag(): NULL VecPtr 'v' "); + + sqmag = v->x * v->x + v->y * v->y + v->z * v->z; + return sqmag; +} - // Compute dot product with self - ps_mul xy2, xy, xy // X^2, Y^2 - ps_madd z2, z, z, xy2 // Z^2+X^2, junk - ps_sum0 dot, z2, xy2, xy2 // Z^2+X^2+Y^2, junk +/* Paired-single squared magnitude: squares the xy pair, multiply-adds z * z, and combines lanes with ps_sum0 into the returned sqmag. */ f32 PSVECSquareMag(register CVecPtr vec1) { + register f32 vxy; + register f32 vzz; + register f32 sqmag; - // Get zero - fsubs c_zero, c_half, c_half + ASM ( + psq_l vxy, Vec.x(vec1), 0, qr0; + ps_mul vxy, vxy, vxy; + lfs vzz, Vec.z(vec1); + ps_madd sqmag, vzz, vzz, vxy; + ps_sum0 sqmag, sqmag, vxy, vxy; ) - // Avoid problematic square root where dot is zero - if (dot == c_zero) { - return dot; - } + return sqmag; +} + +/* Returns sqrtf of C_VECSquareMag(v). */ f32 C_VECMag(CVecPtr v) { + return sqrtf(C_VECSquareMag(v)); +} - // Estimate reciprocal square root - rsqrt = __frsqrte(dot); +/* Paired-single magnitude: returns the squared sum unchanged when it compares equal to zero (branch to ret); otherwise multiplies it by a once-refined frsqrte estimate to obtain the length. */ f32 PSVECMag(register CVecPtr v) { + register f32 vxy; + register f32 vzz; + register f32 sqmag; + register f32 mag; // DWARF has this, but somehow this can't exist, unused? 
+ register f32 rmag; + register f32 nwork0; + register f32 nwork1; + register f32 c_three; + register f32 c_half = 0.5f; + register f32 c_zero; // DWARF doesn't have this, added + ASM ( + psq_l vxy, Vec.x(v), 0, qr0; + ps_mul vxy, vxy, vxy; + lfs vzz, Vec.z(v); + fsubs c_zero, c_half, c_half; + ps_madd sqmag, vzz, vzz, vxy; + ps_sum0 sqmag, sqmag, vxy, vxy; + fcmpu cr0, sqmag, c_zero; + beq ret; + frsqrte rmag, sqmag; + ) c_three = 3.0f; + ASM ( - // Refine estimate using Newton-Raphson method - // y = 1 / sqrt(x) - fmuls work0, rsqrt, rsqrt // rsqrt^2 - fmuls work1, rsqrt, c_half // rsqrt * 0.5 - fnmsubs work0, work0, dot, c_three // (3 - x * rsqrt^2) - fmuls work1, work0, work1 // (3 - x * rsqrt^2) * (rsqrt * 0.5) - - // Convert rsqrt -> sqrt - // x * rsqrt(x) == sqrt(x) - fmuls dot, dot, work1 + fmuls nwork0, rmag, rmag; + fmuls nwork1, rmag, c_half; + fnmsubs nwork0, nwork0, sqmag, c_three; + fmuls rmag, nwork0, nwork1; + fmuls sqmag, sqmag, rmag; + ret: ) + return sqmag; +} + +/* Returns a.x * b.x + a.y * b.y + a.z * b.z. */ f32 C_VECDotProduct(CVecPtr a, CVecPtr b) { + f32 dot; + + OS_DEBUG_ASSERT(a, "VECDotProduct(): NULL VecPtr 'a' "); + OS_DEBUG_ASSERT(b, "VECDotProduct(): NULL VecPtr 'b' "); + + dot = a->x * b->x + a->y * b->y + a->z * b->z; return dot; } -asm f32 PSVECDotProduct(register const Vec* a, register const Vec* b) { - // clang-format off - nofralloc +/* Paired-single dot product: multiplies the pairs loaded at Vec.y, multiply-adds the pairs loaded at Vec.x, and sums into f1 with ps_sum0. */ asm f32 PSVECDotProduct(register CVecPtr vec1, register CVecPtr vec2) { + psq_l f2, Vec.y(vec1), 0, qr0; + psq_l f3, Vec.y(vec2), 0, qr0; + ps_mul f2, f2, f3; + psq_l f5, Vec.x(vec1), 0, qr0; + psq_l f4, Vec.x(vec2), 0, qr0; + ps_madd f3, f5, f4, f2; + ps_sum0 f1, f3, f2, f2; +} - // Compute Y,Z products - psq_l f2, Vec.y(a), 0, 0 - psq_l f3, Vec.y(b), 0, 0 - ps_mul f2, f2, f3 // ABY, ABZ +/* Cross product axb = a x b. The result is computed into the local v before any write to axb, so every read of a and b happens first. */ void C_VECCrossProduct(CVecPtr a, CVecPtr b, VecPtr axb) { + Vec v; - // Compute X product - psq_l f5, Vec.x(a), 0, 0 - psq_l f4, Vec.x(b), 0, 0 - ps_madd f3, f5, f4, f2 // ABX+ABY, junk + OS_DEBUG_ASSERT(a, "VECCrossProduct(): NULL VecPtr 'a' "); + 
OS_DEBUG_ASSERT(b, "VECCrossProduct(): NULL VecPtr 'b' "); + OS_DEBUG_ASSERT(axb, "VECCrossProduct(): NULL VecPtr 'axb' "); - // Compute dot product - ps_sum0 f1, f3, f2, f2 // ABX+ABY+ABZ, junk + v.x = a->y * b->z - a->z * b->y; + v.y = a->z * b->x - a->x * b->z; + v.z = a->x * b->y - a->y * b->x; - blr - // clang-format on + axb->x = v.x; + axb->y = v.y; + axb->z = v.z; } -asm void PSVECCrossProduct(register const Vec* a, register const Vec* b, - register Vec* prod) { - // clang-format off - nofralloc +asm void PSVECCrossProduct(register CVecPtr vec1, register CVecPtr vec2, + register VecPtr dst) { + psq_l f1, Vec.x(vec2), 0, qr0; + lfs f2, Vec.z(vec1); + psq_l f0, Vec.x(vec1), 0, qr0; + ps_merge10 f6, f1, f1; + lfs f3, Vec.z(vec2); + ps_mul f4, f1, f2; + ps_muls0 f7, f1, f0; + ps_msub f5, f0, f3, f4; + ps_msub f8, f0, f6, f7; + ps_merge11 f9, f5, f5; + ps_merge01 f10, f5, f8; + psq_st f9, Vec.x(dst), 1, qr0; + ps_neg f10, f10; + psq_st f10, Vec.y(dst), 0, qr0; +} - // Load vector components - psq_l f1, Vec.x(b), 0, 0 // BX, BY - lfs f2, Vec.z(a) // AZ, AZ - psq_l f0, Vec.x(a), 0, 0 // AX, AY - ps_merge10 f6, f1, f1 // BY, BX - lfs f3, Vec.z(b) // BZ, BZ +void C_VECHalfAngle(CVecPtr a, CVecPtr b, VecPtr half) { + Vec aTmp; + Vec bTmp; + Vec hTmp; + + OS_DEBUG_ASSERT(a, "VECHalfAngle(): NULL VecPtr 'a' "); + OS_DEBUG_ASSERT(b, "VECHalfAngle(): NULL VecPtr 'b' "); + OS_DEBUG_ASSERT(half, "VECHalfAngle(): NULL VecPtr 'half' "); + + aTmp.x = -a->x; + aTmp.y = -a->y; + aTmp.z = -a->z; + bTmp.x = -b->x; + bTmp.y = -b->y; + bTmp.z = -b->z; + + VECNormalize(&aTmp, &aTmp); + VECNormalize(&bTmp, &bTmp); + VECAdd(&aTmp, &bTmp, &hTmp); + if (VECDotProduct(&hTmp, &hTmp) > 0.0f) { + VECNormalize(&hTmp, half); + } else { + *half = hTmp; + } +} - // Compute cross product components - ps_mul f4, f1, f2 // BX*AZ, BY*AZ - ps_muls0 f7, f1, f0 // BX*AX, BY*AX - ps_msub f5, f0, f3, f4 // AX*BZ-BX*AZ, AY*BZ-BY*AZ - ps_msub f8, f0, f6, f7 // AX*BY-BX*AX, AY*BX-BY*AX +void 
C_VECReflect(CVecPtr src, CVecPtr normal, VecPtr dst) { + f32 cosA; + Vec uI; + Vec uN; - // Manipulate storage - ps_merge11 f9, f5, f5 // AY*BZ-BY*AZ, AY*BZ-BY*AZ - ps_merge01 f10, f5, f8 // AX*BZ-BX*AZ, AY*BX-BY*AX + OS_DEBUG_ASSERT(src, "VECReflect(): NULL VecPtr 'src' "); + OS_DEBUG_ASSERT(normal, "VECReflect(): NULL VecPtr 'normal' "); + OS_DEBUG_ASSERT(dst, "VECReflect(): NULL VecPtr 'dst' "); - // Store cross product X - // cx = AY*BZ-BY*AZ - psq_st f9, Vec.x(prod), 1, 0 + uI.x = -src->x; + uI.y = -src->y; + uI.z = -src->z; - // Store cross product Y/Z - // Negate to fix formula - // cy = -(AX*BZ-BX*AZ) -> BX*AZ-AX*BZ - // cz = -(AY*BX-BY*AX) -> BY*AX-AY*BX - ps_neg f10, f10 - psq_st f10, Vec.y(prod), 0, 0 + VECNormalize(&uI, &uI); + VECNormalize(normal, &uN); + cosA = VECDotProduct(&uI, &uN); - blr - // clang-format on + dst->x = (2.0f * uN.x * cosA) - uI.x; + dst->y = (2.0f * uN.y * cosA) - uI.y; + dst->z = (2.0f * uN.z * cosA) - uI.z; + VECNormalize(dst, dst); } -void C_VECHalfAngle(register const Vec* a, register const Vec* b, - register Vec* half) { - Vec na, nb, ns; +f32 C_VECSquareDistance(CVecPtr a, CVecPtr b) { + Vec v; - na.x = -a->x; - na.y = -a->y; - na.z = -a->z; + v.x = a->x - b->x; + v.y = a->y - b->y; + v.z = a->z - b->z; + return v.x * v.x + v.y * v.y + v.z * v.z; +} + +f32 PSVECSquareDistance(register CVecPtr a, register CVecPtr b) { + register f32 v0yz; + register f32 v1yz; + register f32 v0xy; + register f32 v1xy; + register f32 dyz; + register f32 dxy; + register f32 sqdist; - nb.x = -b->x; - nb.y = -b->y; - nb.z = -b->z; + ASM ( + psq_l v0yz, Vec.y(a), 0, qr0; + psq_l v1yz, Vec.y(b), 0, qr0; + ps_sub dyz, v0yz, v1yz; + psq_l v0xy, Vec.x(a), 0, qr0; + psq_l v1xy, Vec.x(b), 0, qr0; + ps_mul dyz, dyz, dyz; + ps_sub dxy, v0xy, v1xy; + ps_madd sqdist, dxy, dxy, dyz; + ps_sum0 sqdist, sqdist, dyz, dyz; + ) - PSVECNormalize(&na, &na); - PSVECNormalize(&nb, &nb); - PSVECAdd(&na, &nb, &ns); + return sqdist; +} - if (PSVECDotProduct(&ns, &ns) > 
0.0f) { - PSVECNormalize(&ns, half); - } else { - *half = ns; - } +f32 C_VECDistance(CVecPtr a, CVecPtr b) { + return sqrtf(C_VECSquareDistance(a, b)); } -f32 PSVECSquareDistance(register const Vec* a, register const Vec* b) { - register f32 ayz, byz; - register f32 axy, bxy; - register f32 dxy, dyz; - register f32 dist; +f32 PSVECDistance(register CVecPtr a, register CVecPtr b) { + register f32 v0yz; + register f32 v1yz; + register f32 v0xy; + register f32 v1xy; + register f32 dyz; + register f32 dxy; + register f32 sqdist; + register f32 rdist; + register f32 dist; // DWARF has this, but somehow this can't exist, unused? + register f32 nwork0; + register f32 nwork1; + register f32 c_half; + register f32 c_three; + register f32 c_zero; // DWARF doesn't have this, added + + ASM ( + psq_l v0yz, Vec.y(a), 0, qr0; + psq_l v1yz, Vec.y(b), 0, qr0; + ps_sub dyz, v0yz, v1yz; + psq_l v0xy, Vec.x(a), 0, qr0; + psq_l v1xy, Vec.x(b), 0, qr0; + ps_mul dyz, dyz, dyz; + ps_sub dxy, v0xy, v1xy; + ) + c_half = 0.5f; + + ASM ( + ps_madd sqdist, dxy, dxy, dyz; + fsubs c_zero, c_half, c_half; + ps_sum0 sqdist, sqdist, dyz, dyz; + fcmpu cr0, c_zero, sqdist; + beq end; + ) + c_three = 3.0f; ASM ( - // Load vector components - psq_l axy, Vec.x(a), 0, 0 - psq_l ayz, Vec.y(a), 0, 0 - psq_l bxy, Vec.x(b), 0, 0 - psq_l byz, Vec.y(b), 0, 0 - - // Compute differences - ps_sub dxy, axy, bxy - ps_sub dyz, ayz, byz - - // Compute distance - ps_mul dyz, dyz, dyz - ps_madd dist, dxy, dxy, dyz - ps_sum0 dist, dist, dyz, dyz + frsqrte rdist, sqdist; + fmuls nwork0, rdist, rdist; + fmuls nwork1, rdist, c_half; + fnmsubs nwork0, nwork0, sqdist, c_three; + fmuls rdist, nwork0, nwork1; + fmuls sqdist, sqdist, rdist; + end: ) - return dist; + return sqdist; }