In my free time, I worked on a utility library that, among other things, supports 128-bit signed / unsigned integers. This library uses cpu-dispatching to use simd commands in some cases, but it requires a portable backup, so it will work everywhere. Most recently, I implemented a portable reserve for 128-bit switching. It works correctly and works fast enough, but it is not as fast as I would like, especially on 32-bit architectures.
Here is a split version with all the relevant types and functionality (for completeness of the 64-bit version):
typedef uint32_t UInt32; typedef int32_t Int32; typedef uint64_t UInt64; typedef int64_t Int64;
The corresponding 32-bit compilation is quite long, so if not requested, I omit it.
The main bottleneck obviously branches out when the shift argument is unknown at compile time. What can be done to eliminate branching, or whatever the use of portable cheating to speed it up?
Update 1
A copy assignment operator is added that is not available in the above example. For those who are interested, unit tests are given. I use Catch for simplicity.
// Left shift lookup table, from 0-127. const UInt128 gLeftShiftLut128[] = { UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100), UInt128(0xFFDDBB9977553310, 0xEECCAA8866442200), UInt128(0xFFBB7732EEAA6621, 0xDD995510CC884400), UInt128(0xFF76EE65DD54CC43, 0xBB32AA2199108800), UInt128(0xFEEDDCCBBAA99887, 0x7665544332211000), UInt128(0xFDDBB9977553310E, 0xECCAA88664422000), UInt128(0xFBB7732EEAA6621D, 0xD995510CC8844000), UInt128(0xF76EE65DD54CC43B, 0xB32AA21991088000), UInt128(0xEEDDCCBBAA998877, 0x6655443322110000), UInt128(0xDDBB9977553310EE, 0xCCAA886644220000), UInt128(0xBB7732EEAA6621DD, 0x995510CC88440000), UInt128(0x76EE65DD54CC43BB, 0x32AA219910880000), UInt128(0xEDDCCBBAA9988776, 0x6554433221100000), UInt128(0xDBB9977553310EEC, 0xCAA8866442200000), UInt128(0xB7732EEAA6621DD9, 0x95510CC884400000), UInt128(0x6EE65DD54CC43BB3, 0x2AA2199108800000), UInt128(0xDDCCBBAA99887766, 0x5544332211000000), UInt128(0xBB9977553310EECC, 0xAA88664422000000), UInt128(0x7732EEAA6621DD99, 0x5510CC8844000000), UInt128(0xEE65DD54CC43BB32, 0xAA21991088000000), UInt128(0xDCCBBAA998877665, 0x5443322110000000), UInt128(0xB9977553310EECCA, 0xA886644220000000), UInt128(0x732EEAA6621DD995, 0x510CC88440000000), UInt128(0xE65DD54CC43BB32A, 0xA219910880000000), UInt128(0xCCBBAA9988776655, 0x4433221100000000), UInt128(0x9977553310EECCAA, 0x8866442200000000), UInt128(0x32EEAA6621DD9955, 0x10CC884400000000), UInt128(0x65DD54CC43BB32AA, 0x2199108800000000), UInt128(0xCBBAA99887766554, 0x4332211000000000), UInt128(0x977553310EECCAA8, 0x8664422000000000), UInt128(0x2EEAA6621DD99551, 0xCC8844000000000), UInt128(0x5DD54CC43BB32AA2, 0x1991088000000000), UInt128(0xBBAA998877665544, 0x3322110000000000), UInt128(0x77553310EECCAA88, 0x6644220000000000), UInt128(0xEEAA6621DD995510, 0xCC88440000000000), UInt128(0xDD54CC43BB32AA21, 0x9910880000000000), UInt128(0xBAA9988776655443, 0x3221100000000000), UInt128(0x7553310EECCAA886, 0x6442200000000000), UInt128(0xEAA6621DD995510C, 0xC884400000000000), UInt128(0xD54CC43BB32AA219, 0x9108800000000000), UInt128(0xAA99887766554433, 0x2211000000000000), UInt128(0x553310EECCAA8866, 0x4422000000000000), UInt128(0xAA6621DD995510CC, 0x8844000000000000), UInt128(0x54CC43BB32AA2199, 0x1088000000000000), UInt128(0xA998877665544332, 0x2110000000000000), UInt128(0x53310EECCAA88664, 0x4220000000000000), UInt128(0xA6621DD995510CC8, 0x8440000000000000), UInt128(0x4CC43BB32AA21991, 0x880000000000000), UInt128(0x9988776655443322, 0x1100000000000000), UInt128(0x3310EECCAA886644, 0x2200000000000000), UInt128(0x6621DD995510CC88, 0x4400000000000000), UInt128(0xCC43BB32AA219910, 0x8800000000000000), UInt128(0x9887766554433221, 0x1000000000000000), UInt128(0x310EECCAA8866442, 0x2000000000000000), UInt128(0x621DD995510CC884, 0x4000000000000000), UInt128(0xC43BB32AA2199108, 0x8000000000000000), UInt128(0x8877665544332211, 0x0), UInt128(0x10EECCAA88664422, 0x0), UInt128(0x21DD995510CC8844, 0x0), UInt128(0x43BB32AA21991088, 0x0), UInt128(0x8776655443322110, 0x0), UInt128(0xEECCAA886644220 , 0x0), UInt128(0x1DD995510CC88440, 0x0), UInt128(0x3BB32AA219910880, 0x0), UInt128(0x7766554433221100, 0x0), UInt128(0xEECCAA8866442200, 0x0), UInt128(0xDD995510CC884400, 0x0), UInt128(0xBB32AA2199108800, 0x0), UInt128(0x7665544332211000, 0x0), UInt128(0xECCAA88664422000, 0x0), UInt128(0xD995510CC8844000, 0x0), UInt128(0xB32AA21991088000, 0x0), UInt128(0x6655443322110000, 0x0), UInt128(0xCCAA886644220000, 0x0), UInt128(0x995510CC88440000, 0x0), UInt128(0x32AA219910880000, 0x0), UInt128(0x6554433221100000, 0x0), UInt128(0xCAA8866442200000, 0x0), UInt128(0x95510CC884400000, 0x0), UInt128(0x2AA2199108800000, 0x0), UInt128(0x5544332211000000, 0x0), UInt128(0xAA88664422000000, 0x0), UInt128(0x5510CC8844000000, 0x0), UInt128(0xAA21991088000000, 0x0), UInt128(0x5443322110000000, 0x0), UInt128(0xA886644220000000, 0x0), UInt128(0x510CC88440000000, 0x0), UInt128(0xA219910880000000, 0x0), UInt128(0x4433221100000000, 0x0), UInt128(0x8866442200000000, 0x0), UInt128(0x10CC884400000000, 0x0), UInt128(0x2199108800000000, 0x0), UInt128(0x4332211000000000, 0x0), UInt128(0x8664422000000000, 0x0), UInt128(0xCC8844000000000 , 0x0), UInt128(0x1991088000000000, 0x0), UInt128(0x3322110000000000, 0x0), UInt128(0x6644220000000000, 0x0), UInt128(0xCC88440000000000, 0x0), UInt128(0x9910880000000000, 0x0), UInt128(0x3221100000000000, 0x0), UInt128(0x6442200000000000, 0x0), UInt128(0xC884400000000000, 0x0), UInt128(0x9108800000000000, 0x0), UInt128(0x2211000000000000, 0x0), UInt128(0x4422000000000000, 0x0), UInt128(0x8844000000000000, 0x0), UInt128(0x1088000000000000, 0x0), UInt128(0x2110000000000000, 0x0), UInt128(0x4220000000000000, 0x0), UInt128(0x8440000000000000, 0x0), UInt128(0x880000000000000 , 0x0), UInt128(0x1100000000000000, 0x0), UInt128(0x2200000000000000, 0x0), UInt128(0x4400000000000000, 0x0), UInt128(0x8800000000000000, 0x0), UInt128(0x1000000000000000, 0x0), UInt128(0x2000000000000000, 0x0), UInt128(0x4000000000000000, 0x0), UInt128(0x8000000000000000, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0), UInt128(0x0, 0x0) }; // Right shift lookup table, from 0-127. const UInt128 gRightShiftLut128[] = { UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100), UInt128(0x7FF76EE65DD54CC4, 0x3BB32AA219910880), UInt128(0x3FFBB7732EEAA662, 0x1DD995510CC88440), UInt128(0x1FFDDBB997755331, 0xEECCAA886644220), UInt128(0xFFEEDDCCBBAA998, 0x8776655443322110), UInt128(0x7FF76EE65DD54CC, 0x43BB32AA21991088), UInt128(0x3FFBB7732EEAA66, 0x21DD995510CC8844), UInt128(0x1FFDDBB99775533, 0x10EECCAA88664422), UInt128(0xFFEEDDCCBBAA99, 0x8877665544332211), UInt128(0x7FF76EE65DD54C, 0xC43BB32AA2199108), UInt128(0x3FFBB7732EEAA6, 0x621DD995510CC884), UInt128(0x1FFDDBB9977553, 0x310EECCAA8866442), UInt128(0xFFEEDDCCBBAA9, 0x9887766554433221), UInt128(0x7FF76EE65DD54, 0xCC43BB32AA219910), UInt128(0x3FFBB7732EEAA, 0x6621DD995510CC88), UInt128(0x1FFDDBB997755, 0x3310EECCAA886644), UInt128(0xFFEEDDCCBBAA, 0x9988776655443322), UInt128(0x7FF76EE65DD5, 0x4CC43BB32AA21991), UInt128(0x3FFBB7732EEA, 0xA6621DD995510CC8), UInt128(0x1FFDDBB99775, 0x53310EECCAA88664), UInt128(0xFFEEDDCCBBA, 0xA998877665544332), UInt128(0x7FF76EE65DD, 0x54CC43BB32AA2199), UInt128(0x3FFBB7732EE, 0xAA6621DD995510CC), UInt128(0x1FFDDBB9977, 0x553310EECCAA8866), UInt128(0xFFEEDDCCBB, 0xAA99887766554433), UInt128(0x7FF76EE65D, 0xD54CC43BB32AA219), UInt128(0x3FFBB7732E, 0xEAA6621DD995510C), UInt128(0x1FFDDBB997, 0x7553310EECCAA886), UInt128(0xFFEEDDCCB, 0xBAA9988776655443), UInt128(0x7FF76EE65, 0xDD54CC43BB32AA21), UInt128(0x3FFBB7732, 0xEEAA6621DD995510), UInt128(0x1FFDDBB99, 0x77553310EECCAA88), UInt128(0xFFEEDDCC, 0xBBAA998877665544), UInt128(0x7FF76EE6, 0x5DD54CC43BB32AA2), UInt128(0x3FFBB773, 0x2EEAA6621DD99551), UInt128(0x1FFDDBB9, 0x977553310EECCAA8), UInt128(0xFFEEDDC, 0xCBBAA99887766554), UInt128(0x7FF76EE, 0x65DD54CC43BB32AA), UInt128(0x3FFBB77, 0x32EEAA6621DD9955), UInt128(0x1FFDDBB, 0x9977553310EECCAA), UInt128(0xFFEEDD, 0xCCBBAA9988776655), UInt128(0x7FF76E, 0xE65DD54CC43BB32A), UInt128(0x3FFBB7, 0x732EEAA6621DD995), UInt128(0x1FFDDB, 0xB9977553310EECCA), UInt128(0xFFEED, 0xDCCBBAA998877665), UInt128(0x7FF76, 0xEE65DD54CC43BB32), UInt128(0x3FFBB, 0x7732EEAA6621DD99), UInt128(0x1FFDD, 0xBB9977553310EECC), UInt128(0xFFEE, 0xDDCCBBAA99887766), UInt128(0x7FF7, 0x6EE65DD54CC43BB3), UInt128(0x3FFB, 0xB7732EEAA6621DD9), UInt128(0x1FFD, 0xDBB9977553310EEC), UInt128(0xFFE, 0xEDDCCBBAA9988776), UInt128(0x7FF, 0x76EE65DD54CC43BB), UInt128(0x3FF, 0xBB7732EEAA6621DD), UInt128(0x1FF, 0xDDBB9977553310EE), UInt128(0xFF, 0xEEDDCCBBAA998877), UInt128(0x7F, 0xF76EE65DD54CC43B), UInt128(0x3F, 0xFBB7732EEAA6621D), UInt128(0x1F, 0xFDDBB9977553310E), UInt128(0xF, 0xFEEDDCCBBAA99887), UInt128(0x7, 0xFF76EE65DD54CC43), UInt128(0x3, 0xFFBB7732EEAA6621), UInt128(0x1, 0xFFDDBB9977553310), UInt128(0x0, 0xFFEEDDCCBBAA9988), UInt128(0x0, 0x7FF76EE65DD54CC4), UInt128(0x0, 0x3FFBB7732EEAA662), UInt128(0x0, 0x1FFDDBB997755331), UInt128(0x0, 0xFFEEDDCCBBAA998), UInt128(0x0, 0x7FF76EE65DD54CC), UInt128(0x0, 0x3FFBB7732EEAA66), UInt128(0x0, 0x1FFDDBB99775533), UInt128(0x0, 0xFFEEDDCCBBAA99), UInt128(0x0, 0x7FF76EE65DD54C), UInt128(0x0, 0x3FFBB7732EEAA6), UInt128(0x0, 0x1FFDDBB9977553), UInt128(0x0, 0xFFEEDDCCBBAA9), UInt128(0x0, 0x7FF76EE65DD54), UInt128(0x0, 0x3FFBB7732EEAA), UInt128(0x0, 0x1FFDDBB997755), UInt128(0x0, 0xFFEEDDCCBBAA), UInt128(0x0, 0x7FF76EE65DD5), UInt128(0x0, 0x3FFBB7732EEA), UInt128(0x0, 0x1FFDDBB99775), UInt128(0x0, 0xFFEEDDCCBBA), UInt128(0x0, 0x7FF76EE65DD), UInt128(0x0, 0x3FFBB7732EE), UInt128(0x0, 0x1FFDDBB9977), UInt128(0x0, 0xFFEEDDCCBB), UInt128(0x0, 0x7FF76EE65D), UInt128(0x0, 0x3FFBB7732E), UInt128(0x0, 0x1FFDDBB997), UInt128(0x0, 0xFFEEDDCCB), UInt128(0x0, 0x7FF76EE65), UInt128(0x0, 0x3FFBB7732), UInt128(0x0, 0x1FFDDBB99), UInt128(0x0, 0xFFEEDDCC), UInt128(0x0, 0x7FF76EE6), UInt128(0x0, 0x3FFBB773), UInt128(0x0, 0x1FFDDBB9), UInt128(0x0, 0xFFEEDDC), UInt128(0x0, 0x7FF76EE), UInt128(0x0, 0x3FFBB77), UInt128(0x0, 0x1FFDDBB), UInt128(0x0, 0xFFEEDD), UInt128(0x0, 0x7FF76E), UInt128(0x0, 0x3FFBB7), UInt128(0x0, 0x1FFDDB), UInt128(0x0, 0xFFEED), UInt128(0x0, 0x7FF76), UInt128(0x0, 0x3FFBB), UInt128(0x0, 0x1FFDD), UInt128(0x0, 0xFFEE), UInt128(0x0, 0x7FF7), UInt128(0x0, 0x3FFB), UInt128(0x0, 0x1FFD), UInt128(0x0, 0xFFE), UInt128(0x0, 0x7FF), UInt128(0x0, 0x3FF), UInt128(0x0, 0x1FF), UInt128(0x0, 0xFF), UInt128(0x0, 0x7F), UInt128(0x0, 0x3F), UInt128(0x0, 0x1F), UInt128(0x0, 0xF), UInt128(0x0, 0x7), UInt128(0x0, 0x3), UInt128(0x0, 0x1) }; TEST_CASE("UInt128 left shift produces correct results.") { auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100); for (auto i = 1; i <= 127; i++) { auto sample = base; sample <<= i; INFO("i = " << i); REQUIRE(sample == gLeftShiftLut128[i]); } } TEST_CASE("UInt128 right shift produces correct results.") { auto base = UInt128(0xFFEEDDCCBBAA9988, 0x7766554433221100); for (auto i = 0; i <= 127; i++) { auto sample = base; sample >>= i; INFO("i = " << i); REQUIRE(sample == gRightShiftLut128[i]); } }