704 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
705 return montgomery_mul_big(other);
707#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
709 auto [t0, c] = mul_wide(
data[0], other.data[0]);
710 uint64_t k = t0 * T::r_inv;
711 uint64_t
a = mac_discard_lo(t0, k, modulus.data[0]);
713 uint64_t t1 = mac_mini(
a,
data[0], other.
data[1],
a);
714 mac(t1, k, modulus.data[1], c, t0, c);
715 uint64_t t2 = mac_mini(
a,
data[0], other.
data[2],
a);
716 mac(t2, k, modulus.data[2], c, t1, c);
717 uint64_t t3 = mac_mini(
a,
data[0], other.
data[3],
a);
718 mac(t3, k, modulus.data[3], c, t2, c);
721 mac_mini(t0,
data[1], other.data[0], t0,
a);
723 c = mac_discard_lo(t0, k, modulus.data[0]);
724 mac(t1,
data[1], other.data[1],
a, t1,
a);
725 mac(t1, k, modulus.data[1], c, t0, c);
726 mac(t2,
data[1], other.data[2],
a, t2,
a);
727 mac(t2, k, modulus.data[2], c, t1, c);
728 mac(t3,
data[1], other.data[3],
a, t3,
a);
729 mac(t3, k, modulus.data[3], c, t2, c);
732 mac_mini(t0,
data[2], other.data[0], t0,
a);
734 c = mac_discard_lo(t0, k, modulus.data[0]);
735 mac(t1,
data[2], other.data[1],
a, t1,
a);
736 mac(t1, k, modulus.data[1], c, t0, c);
737 mac(t2,
data[2], other.data[2],
a, t2,
a);
738 mac(t2, k, modulus.data[2], c, t1, c);
739 mac(t3,
data[2], other.data[3],
a, t3,
a);
740 mac(t3, k, modulus.data[3], c, t2, c);
743 mac_mini(t0,
data[3], other.data[0], t0,
a);
745 c = mac_discard_lo(t0, k, modulus.data[0]);
746 mac(t1,
data[3], other.data[1],
a, t1,
a);
747 mac(t1, k, modulus.data[1], c, t0, c);
748 mac(t2,
data[3], other.data[2],
a, t2,
a);
749 mac(t2, k, modulus.data[2], c, t1, c);
750 mac(t3,
data[3], other.data[3],
a, t3,
a);
751 mac(t3, k, modulus.data[3], c, t2, c);
753 return { t0, t1, t2, t3 };
757 auto left = wasm_convert(
data);
758 auto right = wasm_convert(other.data);
759 constexpr uint64_t mask = 0x1fffffff;
770 uint64_t temp_10 = 0;
771 uint64_t temp_11 = 0;
772 uint64_t temp_12 = 0;
773 uint64_t temp_13 = 0;
774 uint64_t temp_14 = 0;
775 uint64_t temp_15 = 0;
776 uint64_t temp_16 = 0;
779 wasm_madd(left[0], right, temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8);
780 wasm_madd(left[1], right, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
781 wasm_madd(left[2], right, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
782 wasm_madd(left[3], right, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
783 wasm_madd(left[4], right, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
784 wasm_madd(left[5], right, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
785 wasm_madd(left[6], right, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
786 wasm_madd(left[7], right, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
787 wasm_madd(left[8], right, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
790 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
791 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
792 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
793 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
794 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
795 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
796 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
797 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
803 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
835 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
836 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
837 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
838 (temp_15 >> 18) | (temp_16 << 11) };
849 if constexpr (modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
850 return montgomery_mul_big(*
this);
852#if defined(__SIZEOF_INT128__) && !defined(__wasm__)
853 uint64_t carry_hi = 0;
855 auto [t0, carry_lo] = mul_wide(
data[0],
data[0]);
856 uint64_t t1 = square_accumulate(0,
data[1],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
857 uint64_t t2 = square_accumulate(0,
data[2],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
858 uint64_t t3 = square_accumulate(0,
data[3],
data[0], carry_lo, carry_hi, carry_lo, carry_hi);
860 uint64_t round_carry = carry_lo;
861 uint64_t k = t0 * T::r_inv;
862 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
863 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
864 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
865 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
866 t3 = carry_lo + round_carry;
868 t1 = mac_mini(t1,
data[1],
data[1], carry_lo);
870 t2 = square_accumulate(t2,
data[2],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
871 t3 = square_accumulate(t3,
data[3],
data[1], carry_lo, carry_hi, carry_lo, carry_hi);
872 round_carry = carry_lo;
874 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
875 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
876 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
877 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
878 t3 = carry_lo + round_carry;
880 t2 = mac_mini(t2,
data[2],
data[2], carry_lo);
882 t3 = square_accumulate(t3,
data[3],
data[2], carry_lo, carry_hi, carry_lo, carry_hi);
883 round_carry = carry_lo;
885 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
886 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
887 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
888 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
889 t3 = carry_lo + round_carry;
891 t3 = mac_mini(t3,
data[3],
data[3], carry_lo);
893 round_carry = carry_lo;
894 carry_lo = mac_discard_lo(t0, k, modulus.data[0]);
895 mac(t1, k, modulus.data[1], carry_lo, t0, carry_lo);
896 mac(t2, k, modulus.data[2], carry_lo, t1, carry_lo);
897 mac(t3, k, modulus.data[3], carry_lo, t2, carry_lo);
898 t3 = carry_lo + round_carry;
899 return { t0, t1, t2, t3 };
902 auto left = wasm_convert(
data);
903 constexpr uint64_t mask = 0x1fffffff;
914 uint64_t temp_10 = 0;
915 uint64_t temp_11 = 0;
916 uint64_t temp_12 = 0;
917 uint64_t temp_13 = 0;
918 uint64_t temp_14 = 0;
919 uint64_t temp_15 = 0;
920 uint64_t temp_16 = 0;
923 temp_0 += left[0] * left[0];
925 acc += left[0] * left[1];
926 temp_1 += (acc << 1);
928 acc += left[0] * left[2];
929 temp_2 += left[1] * left[1];
930 temp_2 += (acc << 1);
932 acc += left[0] * left[3];
933 acc += left[1] * left[2];
934 temp_3 += (acc << 1);
936 acc += left[0] * left[4];
937 acc += left[1] * left[3];
938 temp_4 += left[2] * left[2];
939 temp_4 += (acc << 1);
941 acc += left[0] * left[5];
942 acc += left[1] * left[4];
943 acc += left[2] * left[3];
944 temp_5 += (acc << 1);
946 acc += left[0] * left[6];
947 acc += left[1] * left[5];
948 acc += left[2] * left[4];
949 temp_6 += left[3] * left[3];
950 temp_6 += (acc << 1);
952 acc += left[0] * left[7];
953 acc += left[1] * left[6];
954 acc += left[2] * left[5];
955 acc += left[3] * left[4];
956 temp_7 += (acc << 1);
958 acc += left[0] * left[8];
959 acc += left[1] * left[7];
960 acc += left[2] * left[6];
961 acc += left[3] * left[5];
962 temp_8 += left[4] * left[4];
963 temp_8 += (acc << 1);
965 acc += left[1] * left[8];
966 acc += left[2] * left[7];
967 acc += left[3] * left[6];
968 acc += left[4] * left[5];
969 temp_9 += (acc << 1);
971 acc += left[2] * left[8];
972 acc += left[3] * left[7];
973 acc += left[4] * left[6];
974 temp_10 += left[5] * left[5];
975 temp_10 += (acc << 1);
977 acc += left[3] * left[8];
978 acc += left[4] * left[7];
979 acc += left[5] * left[6];
980 temp_11 += (acc << 1);
982 acc += left[4] * left[8];
983 acc += left[5] * left[7];
984 temp_12 += left[6] * left[6];
985 temp_12 += (acc << 1);
987 acc += left[5] * left[8];
988 acc += left[6] * left[7];
989 temp_13 += (acc << 1);
991 acc += left[6] * left[8];
992 temp_14 += left[7] * left[7];
993 temp_14 += (acc << 1);
995 acc += left[7] * left[8];
996 temp_15 += (acc << 1);
997 temp_16 += left[8] * left[8];
1001 wasm_reduce_yuval(temp_0, temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9);
1002 wasm_reduce_yuval(temp_1, temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10);
1003 wasm_reduce_yuval(temp_2, temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11);
1004 wasm_reduce_yuval(temp_3, temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12);
1005 wasm_reduce_yuval(temp_4, temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13);
1006 wasm_reduce_yuval(temp_5, temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14);
1007 wasm_reduce_yuval(temp_6, temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15);
1008 wasm_reduce_yuval(temp_7, temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1026 wasm_reduce(temp_8, temp_9, temp_10, temp_11, temp_12, temp_13, temp_14, temp_15, temp_16);
1044 return { (temp_9 << 0) | (temp_10 << 29) | (temp_11 << 58),
1045 (temp_11 >> 6) | (temp_12 << 23) | (temp_13 << 52),
1046 (temp_13 >> 12) | (temp_14 << 17) | (temp_15 << 46),
1047 (temp_15 >> 18) | (temp_16 << 11) };