diff --git a/source/tanya/memory/arch/x86_64.d b/source/tanya/memory/arch/x86_64.d
index a9caec5..26d948e 100644
--- a/source/tanya/memory/arch/x86_64.d
+++ b/source/tanya/memory/arch/x86_64.d
@@ -82,206 +82,261 @@ pure nothrow @system @nogc
     }
 }
 
-private enum const(char[]) MovArrayPointer(string Destination)()
+package (tanya.memory) template fill(ubyte Byte)
 {
-    string asmCode = "asm pure nothrow @nogc { mov ";
-    version (Windows)
+    private enum const(char[]) MovArrayPointer(string Destination)()
     {
-        asmCode ~= Destination ~ ", [ RCX + 8 ];";
+        string asmCode = "asm pure nothrow @nogc { mov ";
+        version (Windows)
+        {
+            asmCode ~= Destination ~ ", [ RCX + 8 ];";
+        }
+        else
+        {
+            asmCode ~= Destination ~ ", RSI;";
+        }
+        return asmCode ~ "}";
     }
-    else
+
+    pragma(inline, true)
+    void fill(void[] memory)
     {
-        asmCode ~= Destination ~ ", RSI;";
+        asm pure nothrow @nogc
+        {
+            naked;
+        }
+        version (Windows) asm pure nothrow @nogc
+        {
+            /*
+             * RCX - array.
+             */
+            mov R8, [ RCX ];
+        }
+        else asm pure nothrow @nogc
+        {
+            /*
+             * RSI - pointer.
+             * RDI - length.
+             */
+            mov R8, RDI;
+        }
+        mixin(MovArrayPointer!"R9");
+
+        asm pure nothrow @nogc
+        {
+            // Check for zero length.
+            test R8, R8;
+            jz end;
+        }
+        // Set 128- and 64-bit registers to the values we want to fill with.
+        static if (Byte == 0)
+        {
+            asm pure nothrow @nogc
+            {
+                xor RAX, RAX;
+                pxor XMM0, XMM0;
+            }
+        }
+        else
+        {
+            enum ulong FilledBytes = FilledBytes!Byte;
+            asm pure nothrow @nogc
+            {
+                mov RAX, FilledBytes;
+                movq XMM0, RAX;
+                movlhps XMM0, XMM0;
+            }
+        }
+        asm pure nothrow @nogc
+        {
+            // Check if the pointer is aligned to a 16-byte boundary.
+            and R9, -0x10;
+        }
+        // Compute the number of misaligned bytes.
+        mixin(MovArrayPointer!"R10");
+        asm pure nothrow @nogc
+        {
+            sub R10, R9;
+
+            test R10, R10;
+            jz aligned;
+
+            // Get the number of bytes to be written until we are aligned.
+            mov RDX, 0x10;
+            sub RDX, R10;
+        }
+        mixin(MovArrayPointer!"R9");
+        asm pure nothrow @nogc
+        {
+        naligned:
+            mov [ R9 ], AL; // Write a byte.
+
+            // Advance the pointer. Decrease the total number of bytes
+            // and the misaligned ones.
+            inc R9;
+            dec RDX;
+            dec R8;
+
+            // Check if we are aligned.
+            test RDX, RDX;
+            jnz naligned;
+
+        aligned:
+            // Check if we're done writing bytes.
+            test R8, R8;
+            jz end;
+
+            // Write 1 byte at a time.
+            cmp R8, 8;
+            jl aligned_1;
+
+            // Write 8 bytes at a time.
+            cmp R8, 16;
+            jl aligned_8;
+
+            // Write 16 bytes at a time.
+            cmp R8, 32;
+            jl aligned_16;
+
+            // Write 32 bytes at a time.
+            cmp R8, 64;
+            jl aligned_32;
+
+        aligned_64:
+            movdqa [ R9 ], XMM0;
+            movdqa [ R9 + 16 ], XMM0;
+            movdqa [ R9 + 32 ], XMM0;
+            movdqa [ R9 + 48 ], XMM0;
+
+            add R9, 64;
+            sub R8, 64;
+
+            cmp R8, 64;
+            jge aligned_64;
+
+            // Check if we're done writing bytes.
+            test R8, R8;
+            jz end;
+
+            // Write 1 byte at a time.
+            cmp R8, 8;
+            jl aligned_1;
+
+            // Write 8 bytes at a time.
+            cmp R8, 16;
+            jl aligned_8;
+
+            // Write 16 bytes at a time.
+            cmp R8, 32;
+            jl aligned_16;
+
+        aligned_32:
+            movdqa [ R9 ], XMM0;
+            movdqa [ R9 + 16 ], XMM0;
+
+            add R9, 32;
+            sub R8, 32;
+
+            // Check if we're done writing bytes.
+            test R8, R8;
+            jz end;
+
+            // Write 1 byte at a time.
+            cmp R8, 8;
+            jl aligned_1;
+
+            // Write 8 bytes at a time.
+            cmp R8, 16;
+            jl aligned_8;
+
+        aligned_16:
+            movdqa [ R9 ], XMM0;
+
+            add R9, 16;
+            sub R8, 16;
+
+            // Check if we're done writing bytes.
+            test R8, R8;
+            jz end;
+
+            // Write 1 byte at a time.
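+            // (At most 15 bytes can remain at this point, so only the
+            // 8-byte and single-byte tails still need to be checked.)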
+            cmp R8, 8;
+            jl aligned_1;
+
+        aligned_8:
+            mov [ R9 ], RAX;
+
+            add R9, 8;
+            sub R8, 8;
+
+            // Check if we're done writing bytes.
+            test R8, R8;
+            jz end;
+
+        aligned_1:
+            mov [ R9 ], AL;
+
+            inc R9;
+            dec R8;
+
+            test R8, R8;
+            jnz aligned_1;
+
+        end:
+            ret;
+        }
     }
-    return asmCode ~ "}";
 }
 
 pragma(inline, true)
-package (tanya.memory) void fill(ubyte Byte)(void[] memory)
-pure nothrow @system @nogc
+package (tanya.memory) void copyBackward(const void[] source, void[] target)
+pure nothrow @system @nogc
 {
     asm pure nothrow @nogc
     {
         naked;
+
+        // Save the registers that should be restored.
+        mov R8, RSI;
+        mov R9, RDI;
     }
+    // Prepare the registers for movsb.
     version (Windows) asm pure nothrow @nogc
     {
-        /*
-         * RCX - array.
-         */
-        mov R8, [ RCX ];
+        // RDX - source.
+        // RCX - target.
+
+        mov RAX, [ RCX + 8 ];
+        mov R10, [ RDX + 8 ];
+        mov RCX, [ RDX ];
+
+        lea RDI, [ RAX + RCX - 1 ];
+        lea RSI, [ R10 + RCX - 1 ];
     }
     else asm pure nothrow @nogc
     {
-        /*
-         * RSI - pointer.
-         * RDI - length.
-         */
-        mov R8, RDI;
-    }
-    mixin(MovArrayPointer!"R9");
+        // RDX - source length.
+        // RCX - source data.
+        // RDI - target length.
+        // RSI - target data.
 
-    asm pure nothrow @nogc
-    {
-        // Check for zero length.
-        test R8, R8;
-        jz end;
-    }
-    // Set 128- and 64-bit registers to values we want to fill with.
-    static if (Byte == 0)
-    {
-        asm pure nothrow @nogc
-        {
-            xor RAX, RAX;
-            pxor XMM0, XMM0;
-        }
-    }
-    else
-    {
-        enum ulong FilledBytes = FilledBytes!Byte;
-        asm pure nothrow @nogc
-        {
-            mov RAX, FilledBytes;
-            movq XMM0, RAX;
-            movlhps XMM0, XMM0;
-        }
+        lea RDI, [ RSI + RDX - 1 ];
+        lea RSI, [ RCX + RDX - 1 ];
+        mov RCX, RDX;
     }
     asm pure nothrow @nogc
     {
-        // Check if the pointer is aligned to a 16-byte boundary.
-        and R9, -0x10;
-    }
-    // Compute the number of misaligned bytes.
-    mixin(MovArrayPointer!"R10");
-    asm pure nothrow @nogc
-    {
-        sub R10, R9;
+        std; // Set the direction flag.
 
-        test R10, R10;
-        jz aligned;
+        rep;
+        movsb;
 
-        // Get the number of bytes to be written until we are aligned.
-        mov RDX, 0x10;
-        sub RDX, R10;
-    }
-    mixin(MovArrayPointer!"R9");
-    asm pure nothrow @nogc
-    {
-    naligned:
-        mov [ R9 ], AL; // Write a byte.
+        cld; // Clear the direction flag.
 
-        // Advance the pointer. Decrease the total number of bytes
-        // and the misaligned ones.
-        inc R9;
-        dec RDX;
-        dec R8;
+        // Restore registers.
+        mov RDI, R9;
+        mov RSI, R8;
 
-        // Checks if we are aligned.
-        test RDX, RDX;
-        jnz naligned;
-
-    aligned:
-        // Checks if we're done writing bytes.
-        test R8, R8;
-        jz end;
-
-        // Write 1 byte at a time.
-        cmp R8, 8;
-        jl aligned_1;
-
-        // Write 8 bytes at a time.
-        cmp R8, 16;
-        jl aligned_8;
-
-        // Write 16 bytes at a time.
-        cmp R8, 32;
-        jl aligned_16;
-
-        // Write 32 bytes at a time.
-        cmp R8, 64;
-        jl aligned_32;
-
-    aligned_64:
-        movdqa [ R9 ], XMM0;
-        movdqa [ R9 + 16 ], XMM0;
-        movdqa [ R9 + 32 ], XMM0;
-        movdqa [ R9 + 48 ], XMM0;
-
-        add R9, 64;
-        sub R8, 64;
-
-        cmp R8, 64;
-        jge aligned_64;
-
-        // Checks if we're done writing bytes.
-        test R8, R8;
-        jz end;
-
-        // Write 1 byte at a time.
-        cmp R8, 8;
-        jl aligned_1;
-
-        // Write 8 bytes at a time.
-        cmp R8, 16;
-        jl aligned_8;
-
-        // Write 16 bytes at a time.
-        cmp R8, 32;
-        jl aligned_16;
-
-    aligned_32:
-        movdqa [ R9 ], XMM0;
-        movdqa [ R9 + 16 ], XMM0;
-
-        add R9, 32;
-        sub R8, 32;
-
-        // Checks if we're done writing bytes.
-        test R8, R8;
-        jz end;
-
-        // Write 1 byte at a time.
-        cmp R8, 8;
-        jl aligned_1;
-
-        // Write 8 bytes at a time.
-        cmp R8, 16;
-        jl aligned_8;
-
-    aligned_16:
-        movdqa [ R9 ], XMM0;
-
-        add R9, 16;
-        sub R8, 16;
-
-        // Checks if we're done writing bytes.
-        test R8, R8;
-        jz end;
-
-        // Write 1 byte at a time.
-        cmp R8, 8;
-        jl aligned_1;
-
-    aligned_8:
-        mov [ R9 ], RAX;
-
-        add R9, 8;
-        sub R8, 8;
-
-        // Checks if we're done writing bytes.
-        test R8, R8;
-        jz end;
-
-    aligned_1:
-        mov [ R9 ], AL;
-
-        inc R9;
-        dec R8;
-
-        test R8, R8;
-        jnz aligned_1;
-
-    end:
         ret;
     }
 }
diff --git a/source/tanya/memory/op.d b/source/tanya/memory/op.d
index e0a3dea..9b15586 100644
--- a/source/tanya/memory/op.d
+++ b/source/tanya/memory/op.d
@@ -17,21 +17,23 @@ version (D_InlineAsm_X86_64)
     static import tanya.memory.arch.x86_64;
 }
 
-private enum alignmentMask = size_t.sizeof - 1;
+private enum alignMask = size_t.sizeof - 1;
 
 /**
  * Copies $(D_PARAM source) into $(D_PARAM target).
  *
- * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that an element
- * of $(D_PARAM target) points to an element of $(D_PARAM source).
+ * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
+ * $(D_PARAM target) points ahead of $(D_PARAM source).
  *
- * $(D_PARAM target) shall have enough space $(D_INLINECODE source.length)
+ * $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
  * elements.
  *
  * Params:
  *  source = Memory to copy from.
  *  target = Destination memory.
  *
+ * See_Also: $(D_PSYMBOL copyBackward).
+ *
  * Precondition: $(D_INLINECODE source.length <= target.length).
  */
 void copy(const void[] source, void[] target) pure nothrow @trusted @nogc
@@ -53,8 +55,8 @@ body
 
         // Check if the pointers are aligned or at least can be aligned
        // properly.
-        ushort naligned = (cast(size_t) source.ptr) & alignmentMask;
-        if (naligned == ((cast(size_t) target.ptr) & alignmentMask))
+        ushort naligned = (cast(size_t) source.ptr) & alignMask;
+        if (naligned == ((cast(size_t) target.ptr) & alignMask))
         {
             // Align the pointers if possible.
             if (naligned != 0)
@@ -135,7 +137,7 @@ package template FilledBytes(ubyte Byte, ubyte I = 0)
  *  Byte   = The value to fill $(D_PARAM memory) with.
  *  memory = Memory block.
  */
-void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
+void fill(ubyte Byte = 0)(void[] memory) @trusted
 {
     version (D_InlineAsm_X86_64)
     {
@@ -147,7 +149,7 @@ void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
         ubyte* vp = cast(ubyte*) memory.ptr;
 
         // Align.
-        while (((cast(size_t) vp) & alignmentMask) != 0)
+        while (((cast(size_t) vp) & alignMask) != 0)
         {
             *vp++ = Byte;
             --n;
@@ -206,3 +208,80 @@ pure nothrow @safe @nogc private unittest
         }
     }
 }
+
+/**
+ * Copies starting from the end of $(D_PARAM source) into the end of
+ * $(D_PARAM target).
+ *
+ * $(D_PSYMBOL copyBackward) copies the elements in reverse order, but the
+ * order of elements in $(D_PARAM target) is exactly the same as in
+ * $(D_PARAM source).
+ *
+ * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
+ * $(D_PARAM source) points ahead of $(D_PARAM target).
+ *
+ * $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
+ * elements.
+ *
+ * Params:
+ *  source = Memory to copy from.
+ *  target = Destination memory.
+ *
+ * See_Also: $(D_PSYMBOL copy).
+ *
+ * Precondition: $(D_INLINECODE source.length <= target.length).
+ */
+void copyBackward(const void[] source, void[] target) pure nothrow @trusted @nogc
+in
+{
+    assert(source.length <= target.length);
+}
+body
+{
+    version (D_InlineAsm_X86_64)
+    {
+        tanya.memory.arch.x86_64.copyBackward(source, target);
+    }
+    else // Naive implementation.
+    {
+        auto count = source.length;
+
+        // Try to align the pointers if possible.
+        if (((cast(size_t) source.ptr) & alignMask)
+            == ((cast(size_t) target.ptr) & alignMask))
+        {
+            while (((cast(size_t) (source.ptr + count)) & alignMask) != 0)
+            {
+                if (!count--)
+                {
+                    return;
+                }
+                (cast(ubyte[]) target)[count]
+                    = (cast(const(ubyte)[]) source)[count];
+            }
+        }
+
+        // Write as long as we're aligned.
+        for (; count >= size_t.sizeof; count -= size_t.sizeof)
+        {
+            *(cast(size_t*) (target.ptr + count - size_t.sizeof))
+                = *(cast(const(size_t)*) (source.ptr + count - size_t.sizeof));
+        }
+
+        // Write the remaining bytes.
+        while (count--)
+        {
+            (cast(ubyte[]) target)[count]
+                = (cast(const(ubyte)[]) source)[count];
+        }
+    }
+}
+
+///
+pure nothrow @safe @nogc unittest
+{
+    ubyte[6] mem = [ 'a', 'a', 'b', 'b', 'c', 'c' ];
+    ubyte[6] expected = [ 'a', 'a', 'a', 'a', 'b', 'b' ];
+
+    copyBackward(mem[0 .. 4], mem[2 .. $]);
+    assert(expected == mem);
+}
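
Usage sketch (not part of the patch): the fill entry point in op.d keeps its public signature, so existing callers are unaffected. The test below is a hypothetical illustration; the buffer name and its odd length of 13 (chosen so the misaligned head and sub-word tail paths are exercised) are the editor's, and it assumes FilledBytes!Byte expands to the byte repeated across a machine word, so 0xAA becomes 0xAAAA_AAAA_AAAA_AAAA in RAX before being broadcast into XMM0.

pure nothrow @safe @nogc unittest
{
    import tanya.memory.op : fill;

    ubyte[13] buffer;

    // Non-zero fill takes the mov RAX, FilledBytes / movq XMM0, RAX path.
    fill!0xAA(buffer[]);
    foreach (b; buffer)
    {
        assert(b == 0xAA);
    }

    // The default Byte = 0 takes the xor RAX, RAX / pxor XMM0, XMM0 path.
    fill(buffer[]);
    foreach (b; buffer)
    {
        assert(b == 0);
    }
}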