Add fast function to zero memory

Eugen Wissner 2017-08-02 05:31:08 +02:00
parent 1a4d1238a1
commit ed92e3993e
2 changed files with 271 additions and 10 deletions


@@ -79,3 +79,192 @@ pure nothrow @system @nogc
ret;
}
}
pragma(inline, true)
package (tanya.memory) void zero(void[] memory)
pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
}
version (Windows) asm pure nothrow @nogc
{
/*
* RCX - pointer to the slice: length at [ RCX ], data pointer at [ RCX + 8 ].
*/
mov R8, [ RCX ];
mov R9, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
/*
* RSI - pointer.
* RDI - length.
*/
mov R8, RDI;
mov R9, RSI;
}
asm pure nothrow @nogc
{
// Check for zero length.
test R8, R8;
jz end;
// Zero out XMM0; it is the 16-byte source for the aligned stores.
pxor XMM0, XMM0;
// Round the pointer down to the previous 16-byte boundary.
and R9, -0x10;
}
// Compute the number of misaligned bytes.
version (Windows) asm pure nothrow @nogc
{
mov RAX, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
mov RAX, RSI;
}
asm pure nothrow @nogc
{
sub RAX, R9;
test RAX, RAX;
jz aligned;
// Get the number of bytes to be written until we are aligned.
mov RDX, 0x10;
sub RDX, RAX;
}
version (Windows) asm pure nothrow @nogc
{
mov R9, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
mov R9, RSI;
}
asm pure nothrow @nogc
{
// Zero RAX so AL and RAX can be used for byte and qword stores.
xor RAX, RAX;
naligned:
mov [ R9 ], AL; // Write a byte.
// Advance the pointer and decrease both the total byte count
// and the remaining misalignment count.
inc R9;
dec RDX;
dec R8;
// Loop until the pointer reaches the 16-byte boundary.
test RDX, RDX;
jnz naligned;
aligned:
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
// Fewer than 32 bytes left: write 16 bytes at a time.
cmp R8, 32;
jl aligned_16;
// Fewer than 64 bytes left: write 32 bytes at a time.
cmp R8, 64;
jl aligned_32;
aligned_64:
movdqa [ R9 ], XMM0;
movdqa [ R9 + 16 ], XMM0;
movdqa [ R9 + 32 ], XMM0;
movdqa [ R9 + 48 ], XMM0;
add R9, 64;
sub R8, 64;
cmp R8, 64;
jge aligned_64;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
// Fewer than 32 bytes left: write 16 bytes at a time.
cmp R8, 32;
jl aligned_16;
aligned_32:
movdqa [ R9 ], XMM0;
movdqa [ R9 + 16 ], XMM0;
add R9, 32;
sub R8, 32;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
aligned_16:
movdqa [ R9 ], XMM0;
add R9, 16;
sub R8, 16;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
aligned_8:
mov [ R9 ], RAX;
add R9, 8;
sub R8, 8;
// Check if we're done writing.
test R8, R8;
jz end;
aligned_1:
mov [ R9 ], AL;
inc R9;
dec R8;
test R8, R8;
jnz aligned_1;
end:
ret;
}
}
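
The single-byte prologue above exists because movdqa requires 16-byte alignment: the pointer is rounded down to the previous 16-byte boundary, the difference gives the misalignment, and 0x10 minus that difference is the number of bytes written one at a time before the wide stores start. A plain-D sketch of the same arithmetic (illustration only; the helper name is not part of this commit):

// Mirrors the prologue computation above:
// and R9, -0x10; sub RAX, R9; mov RDX, 0x10; sub RDX, RAX.
size_t bytesUntilAligned(const void* ptr) pure nothrow @nogc
{
    const addr = cast(size_t) ptr;
    const aligned = addr & ~cast(size_t) 0xf; // round down to 16 bytes
    const misalignment = addr - aligned;
    // Already aligned: the movdqa stores can start immediately.
    return misalignment == 0 ? 0 : 0x10 - misalignment;
}

For example, a pointer whose address ends in 0x9 gives 0x10 - 0x9 = 7 single-byte writes before the first movdqa.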


@@ -1,4 +1,4 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
@@ -17,6 +17,8 @@ version (D_InlineAsm_X86_64)
static import tanya.memory.arch.x86_64;
}
private enum alignmentMask = size_t.sizeof - 1;
/**
* Copies $(D_PARAM source) into $(D_PARAM target).
*
@@ -24,7 +26,7 @@ version (D_InlineAsm_X86_64)
* of $(D_PARAM target) points to an element of $(D_PARAM source).
*
* $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
* elements.
*
* Params:
* source = Memory to copy from.
@@ -48,7 +50,6 @@ body
auto source1 = cast(const(ubyte)*) source;
auto target1 = cast(ubyte*) target;
auto count = source.length;
enum alignmentMask = size_t.sizeof - 1;
// Check if the pointers are aligned or at least can be aligned
// properly.
@@ -79,19 +80,17 @@ body
while (count--)
{
*target1++ = *source1++;
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[9] source = [1, 2, 3, 4, 5, 6, 7, 8, 9];
ubyte[9] target;
source.copy(target);
assert(source == target);
}
private pure nothrow @safe @nogc unittest
@@ -113,3 +112,76 @@ private pure nothrow @safe @nogc unittest
assert(source == target);
}
}
/**
* Fills $(D_PARAM memory) with zero-valued bytes.
*
* Params:
* memory = Memory block.
*/
void zero(void[] memory) pure nothrow @trusted @nogc
{
version (D_InlineAsm_X86_64)
{
tanya.memory.arch.x86_64.zero(memory);
}
else // Naive implementation.
{
auto n = memory.length;
ubyte* vp = cast(ubyte*) memory.ptr;
// Write single bytes until the pointer is word-aligned.
while (((cast(size_t) vp) & alignmentMask) != 0)
{
*vp++ = 0;
--n;
}
// Write size_t.sizeof bytes at a time.
auto sp = cast(size_t*) vp;
while (n / size_t.sizeof > 0)
{
*sp++ = 0;
n -= size_t.sizeof;
}
// Write the remaining bytes.
vp = cast(ubyte*) sp;
while (n--)
{
*vp = 0;
++vp;
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[9] memory = [1, 2, 3, 4, 5, 6, 7, 8, 9];
memory.zero();
foreach (ubyte v; memory)
{
assert(v == 0);
}
}
// Stress test. Checks that `zero` can handle unaligned pointers and different
// lengths.
pure nothrow @safe @nogc private unittest
{
ubyte[192] memory;
foreach (j; 0 .. 192)
{
foreach (ubyte i, ref ubyte v; memory[j .. $])
{
v = i;
}
zero(memory[j .. $]);
foreach (ubyte v; memory[j .. $])
{
assert(v == 0);
}
}
}
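
As a usage sketch (not part of this commit), zero is the kind of helper a caller might use to scrub sensitive data before a buffer goes out of scope. The module path in the import and the handleSecret function below are illustrative assumptions:

import tanya.memory.op : zero; // assumed module path for the new function

void handleSecret() pure nothrow @safe @nogc
{
    ubyte[32] key = 0xff; // stand-in for sensitive material
    // ... use the key ...
    zero(key[]); // wipe the bytes before the frame is reused
    assert(key[0] == 0 && key[$ - 1] == 0);
}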