Add memory.op.copyBackward

Added a function that can copy memory chunks that may overlap.
Eugen Wissner 2017-08-09 07:01:57 +02:00
parent e6b28468ca
commit 7c2abadb90
2 changed files with 316 additions and 182 deletions
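
The sketch below is illustrative only and is not part of this commit: it shows why a plain forward byte copy cannot be used when the target range starts inside the source range, and why copying from the last byte to the first (which is what the new copyBackward does) stays correct in that case.

pure nothrow @safe @nogc unittest
{
    ubyte[6] forward = [ 1, 2, 3, 4, 5, 6 ];
    ubyte[6] backward = [ 1, 2, 3, 4, 5, 6 ];
    ubyte[6] corrupted = [ 1, 2, 1, 2, 1, 2 ];
    ubyte[6] shifted = [ 1, 2, 1, 2, 3, 4 ];

    // A forward byte copy of elements 0 .. 4 onto the overlapping
    // range 2 .. 6 overwrites source bytes before they have been read.
    foreach (i; 0 .. 4)
    {
        forward[2 + i] = forward[i];
    }
    assert(forward == corrupted);

    // Copying from the end towards the beginning preserves the source
    // contents; this is the behaviour the new copyBackward provides.
    foreach_reverse (i; 0 .. 4)
    {
        backward[2 + i] = backward[i];
    }
    assert(backward == shifted);
}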

tanya/memory/arch/x86_64.d

@@ -82,206 +82,261 @@ pure nothrow @system @nogc
    }
}

package (tanya.memory) template fill(ubyte Byte)
{
    private enum const(char[]) MovArrayPointer(string Destination)()
    {
        string asmCode = "asm pure nothrow @nogc { mov ";
        version (Windows)
        {
            asmCode ~= Destination ~ ", [ RCX + 8 ];";
        }
        else
        {
            asmCode ~= Destination ~ ", RSI;";
        }
        return asmCode ~ "}";
    }

    pragma(inline, true)
    void fill(void[] memory)
    {
        asm pure nothrow @nogc
        {
            naked;
        }
        version (Windows) asm pure nothrow @nogc
        {
            /*
             * RCX - array.
             */
            mov R8, [ RCX ];
        }
        else asm pure nothrow @nogc
        {
            /*
             * RSI - pointer.
             * RDI - length.
             */
            mov R8, RDI;
        }
        mixin(MovArrayPointer!"R9");

        asm pure nothrow @nogc
        {
            // Check for zero length.
            test R8, R8;
            jz end;
        }
        // Set 128- and 64-bit registers to values we want to fill with.
        static if (Byte == 0)
        {
            asm pure nothrow @nogc
            {
                xor RAX, RAX;
                pxor XMM0, XMM0;
            }
        }
        else
        {
            enum ulong FilledBytes = FilledBytes!Byte;
            asm pure nothrow @nogc
            {
                mov RAX, FilledBytes;
                movq XMM0, RAX;
                movlhps XMM0, XMM0;
            }
        }
        asm pure nothrow @nogc
        {
            // Check if the pointer is aligned to a 16-byte boundary.
            and R9, -0x10;
        }
        // Compute the number of misaligned bytes.
        mixin(MovArrayPointer!"R10");
        asm pure nothrow @nogc
        {
            sub R10, R9;

            test R10, R10;
            jz aligned;

            // Get the number of bytes to be written until we are aligned.
            mov RDX, 0x10;
            sub RDX, R10;
        }
        mixin(MovArrayPointer!"R9");
        asm pure nothrow @nogc
        {
        naligned:
            mov [ R9 ], AL; // Write a byte.

            // Advance the pointer. Decrease the total number of bytes
            // and the misaligned ones.
            inc R9;
            dec RDX;
            dec R8;

            // Checks if we are aligned.
            test RDX, RDX;
            jnz naligned;

        aligned:
            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

            // Write 16 bytes at a time.
            cmp R8, 32;
            jl aligned_16;

            // Write 32 bytes at a time.
            cmp R8, 64;
            jl aligned_32;

        aligned_64:
            movdqa [ R9 ], XMM0;
            movdqa [ R9 + 16 ], XMM0;
            movdqa [ R9 + 32 ], XMM0;
            movdqa [ R9 + 48 ], XMM0;

            add R9, 64;
            sub R8, 64;
            cmp R8, 64;
            jge aligned_64;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

            // Write 16 bytes at a time.
            cmp R8, 32;
            jl aligned_16;

        aligned_32:
            movdqa [ R9 ], XMM0;
            movdqa [ R9 + 16 ], XMM0;

            add R9, 32;
            sub R8, 32;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

        aligned_16:
            movdqa [ R9 ], XMM0;

            add R9, 16;
            sub R8, 16;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

        aligned_8:
            mov [ R9 ], RAX;

            add R9, 8;
            sub R8, 8;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

        aligned_1:
            mov [ R9 ], AL;

            inc R9;
            dec R8;

            test R8, R8;
            jnz aligned_1;

        end:
            ret;
        }
    }
}

pragma(inline, true)
package (tanya.memory) void copyBackward(const void[] source, void[] target)
pure nothrow @system @nogc
{
    asm pure nothrow @nogc
    {
        naked;

        // Save the registers that should be restored.
        mov R8, RSI;
        mov R9, RDI;
    }
    // Prepare the registers for movsb.
    version (Windows) asm pure nothrow @nogc
    {
        // RDX - source.
        // RCX - target.

        mov RAX, [ RCX + 8 ];
        mov R10, [ RDX + 8 ];
        mov RCX, [ RDX ];

        lea RDI, [ RAX + RCX - 1 ];
        lea RSI, [ R10 + RCX - 1 ];
    }
    else asm pure nothrow @nogc
    {
        // RDX - source length.
        // RCX - source data.
        // RDI - target length.
        // RSI - target data.

        lea RDI, [ RSI + RDX - 1 ];
        lea RSI, [ RCX + RDX - 1 ];
        mov RCX, RDX;
    }
    asm pure nothrow @nogc
    {
        std; // Set the direction flag.

        rep;
        movsb;

        cld; // Clear the direction flag.

        // Restore registers.
        mov RDI, R9;
        mov RSI, R8;

        ret;
    }
}
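
For readers not fluent in x86-64 assembly, the std/rep/movsb/cld sequence above amounts to a descending byte loop. The helper below is a rough, hypothetical D equivalent written only for explanation; it is not code from this commit, and the library's actual non-assembly fallback appears in tanya.memory.op below.

// Hypothetical illustration of the movsb loop above: RSI and RDI start at
// the last byte of each block and are decremented after every copied byte
// while RCX counts down to zero.
private void copyBackwardBytes(const(ubyte)* source, ubyte* target, size_t count)
pure nothrow @system @nogc
{
    while (count--)
    {
        target[count] = source[count];
    }
}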

tanya/memory/op.d

@@ -17,21 +17,23 @@ version (D_InlineAsm_X86_64)
     static import tanya.memory.arch.x86_64;
 }
 
-private enum alignmentMask = size_t.sizeof - 1;
+private enum alignMask = size_t.sizeof - 1;
 
 /**
  * Copies $(D_PARAM source) into $(D_PARAM target).
  *
- * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that an element
- * of $(D_PARAM target) points to an element of $(D_PARAM source).
+ * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
+ * $(D_PARAM source) points ahead of $(D_PARAM target).
  *
- * $(D_PARAM target) shall have enough space $(D_INLINECODE source.length)
+ * $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
  * elements.
  *
  * Params:
  *  source = Memory to copy from.
  *  target = Destination memory.
  *
+ * See_Also: $(D_PSYMBOL copyBackward).
+ *
  * Precondition: $(D_INLINECODE source.length <= target.length).
  */
 void copy(const void[] source, void[] target) pure nothrow @trusted @nogc
@@ -53,8 +55,8 @@ body
 
     // Check if the pointers are aligned or at least can be aligned
    // properly.
-    ushort naligned = (cast(size_t) source.ptr) & alignmentMask;
-    if (naligned == ((cast(size_t) target.ptr) & alignmentMask))
+    ushort naligned = (cast(size_t) source.ptr) & alignMask;
+    if (naligned == ((cast(size_t) target.ptr) & alignMask))
     {
         // Align the pointers if possible.
         if (naligned != 0)
@@ -135,7 +137,7 @@ package template FilledBytes(ubyte Byte, ubyte I = 0)
  *  Byte = The value to fill $(D_PARAM memory) with.
  *  memory = Memory block.
  */
-void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
+void fill(ubyte Byte = 0)(void[] memory) @trusted
 {
     version (D_InlineAsm_X86_64)
     {
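
A minimal usage sketch for the signature above, not part of the commit and assuming the module is imported as tanya.memory.op. Since fill is a function template, pure, nothrow and @nogc can presumably still be inferred after the explicit attributes were dropped, so a call like this remains usable from such contexts.

pure nothrow @safe @nogc unittest
{
    ubyte[8] buffer = 0xff;

    fill!0(buffer[]);    // The default, Byte = 0: zero the block.
    assert(buffer == ubyte[8].init);

    fill!0xa5(buffer[]); // Fill with an arbitrary byte value.
    assert(buffer[0] == 0xa5 && buffer[$ - 1] == 0xa5);
}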
@@ -147,7 +149,7 @@ void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
         ubyte* vp = cast(ubyte*) memory.ptr;
 
         // Align.
-        while (((cast(size_t) vp) & alignmentMask) != 0)
+        while (((cast(size_t) vp) & alignMask) != 0)
         {
             *vp++ = Byte;
             --n;
@@ -206,3 +208,80 @@ pure nothrow @safe @nogc private unittest
        }
    }
}
/**
* Copies starting from the end of $(D_PARAM source) into the end of
* $(D_PARAM target).
*
* $(D_PSYMBOL copyBackward) copies the elements in reverse order, but the
* order of elements in the $(D_PARAM target) is exactly the same as in the
* $(D_PARAM source).
*
* $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
* $(D_PARAM target) points ahead of $(D_PARAM source).
*
* $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
* elements.
*
* Params:
* source = Memory to copy from.
* target = Destination memory.
*
* See_Also: $(D_PSYMBOL copy).
*
* Precondition: $(D_INLINECODE source.length <= target.length).
*/
void copyBackward(const void[] source, void[] target) pure nothrow @trusted @nogc
in
{
assert(source.length <= target.length);
}
body
{
version (D_InlineAsm_X86_64)
{
tanya.memory.arch.x86_64.copyBackward(source, target);
}
else // Naive implementation.
{
auto count = source.length;
// Try to align the pointers if possible.
if (((cast(size_t) source.ptr) & alignMask) == ((cast(size_t) target.ptr) & alignMask))
{
while (((cast(size_t) (source.ptr + count)) & alignMask) != 0)
{
if (!count--)
{
return;
}
(cast(ubyte[]) target)[count]
= (cast(const(ubyte)[]) source)[count];
}
}
// Write as long as we're aligned.
for (; count >= size_t.sizeof; count -= size_t.sizeof)
{
*(cast(size_t*) (target.ptr + count - size_t.sizeof))
= *(cast(const(size_t)*) (source.ptr + count - size_t.sizeof));
}
// Write the remaining bytes.
while (count--)
{
(cast(ubyte[]) target)[count]
= (cast(const(ubyte)[]) source)[count];
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[6] mem = [ 'a', 'a', 'b', 'b', 'c', 'c' ];
ubyte[6] expected = [ 'a', 'a', 'a', 'a', 'b', 'b' ];
copyBackward(mem[0 .. 4], mem[2 .. $]);
assert(expected == mem);
}
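
A companion sketch to the unit test above, not part of the commit and assuming copy tolerates an overlap in which the target begins before the source, the mirror case of the copyBackward example as documented: the plain copy is then the counterpart for shifting an overlapping range towards the beginning of a buffer.

pure nothrow @safe @nogc unittest
{
    ubyte[6] mem = [ 'a', 'a', 'b', 'b', 'c', 'c' ];
    ubyte[6] expected = [ 'b', 'b', 'c', 'c', 'c', 'c' ];

    // The target starts before the overlapping source, so the forward
    // copy is the safe direction for this in-place shift.
    copy(mem[2 .. $], mem[0 .. 4]);
    assert(expected == mem);
}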