Add memory.op.copyBackward

Added a function that can copy memory chunks that may overlap.
Eugen Wissner 2017-08-09 07:01:57 +02:00
parent e6b28468ca
commit 7c2abadb90
2 changed files with 316 additions and 182 deletions
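
The sketch below is illustrative only and is not part of this commit: it shows why a plain forward byte copy cannot be used when the target range starts inside the source range, and why copying from the last byte to the first (which is what the new copyBackward does) stays correct in that case.

pure nothrow @safe @nogc unittest
{
    ubyte[6] forward = [ 1, 2, 3, 4, 5, 6 ];
    ubyte[6] backward = [ 1, 2, 3, 4, 5, 6 ];
    ubyte[6] corrupted = [ 1, 2, 1, 2, 1, 2 ];
    ubyte[6] shifted = [ 1, 2, 1, 2, 3, 4 ];

    // A forward byte copy of elements 0 .. 4 onto the overlapping
    // range 2 .. 6 overwrites source bytes before they have been read.
    foreach (i; 0 .. 4)
    {
        forward[2 + i] = forward[i];
    }
    assert(forward == corrupted);

    // Copying from the end towards the beginning preserves the source
    // contents; this is the behaviour the new copyBackward provides.
    foreach_reverse (i; 0 .. 4)
    {
        backward[2 + i] = backward[i];
    }
    assert(backward == shifted);
}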

tanya/memory/arch/x86_64.d

@@ -82,206 +82,261 @@ pure nothrow @system @nogc
    }
}

package (tanya.memory) template fill(ubyte Byte)
{
    private enum const(char[]) MovArrayPointer(string Destination)()
    {
        string asmCode = "asm pure nothrow @nogc { mov ";
        version (Windows)
        {
            asmCode ~= Destination ~ ", [ RCX + 8 ];";
        }
        else
        {
            asmCode ~= Destination ~ ", RSI;";
        }
        return asmCode ~ "}";
    }

    pragma(inline, true)
    void fill(void[] memory)
    {
        asm pure nothrow @nogc
        {
            naked;
        }
        version (Windows) asm pure nothrow @nogc
        {
            /*
             * RCX - array.
             */
            mov R8, [ RCX ];
        }
        else asm pure nothrow @nogc
        {
            /*
             * RSI - pointer.
             * RDI - length.
             */
            mov R8, RDI;
        }
        mixin(MovArrayPointer!"R9");

        asm pure nothrow @nogc
        {
            // Check for zero length.
            test R8, R8;
            jz end;
        }
        // Set 128- and 64-bit registers to values we want to fill with.
        static if (Byte == 0)
        {
            asm pure nothrow @nogc
            {
                xor RAX, RAX;
                pxor XMM0, XMM0;
            }
        }
        else
        {
            enum ulong FilledBytes = FilledBytes!Byte;
            asm pure nothrow @nogc
            {
                mov RAX, FilledBytes;
                movq XMM0, RAX;
                movlhps XMM0, XMM0;
            }
        }
        asm pure nothrow @nogc
        {
            // Check if the pointer is aligned to a 16-byte boundary.
            and R9, -0x10;
        }
        // Compute the number of misaligned bytes.
        mixin(MovArrayPointer!"R10");
        asm pure nothrow @nogc
        {
            sub R10, R9;

            test R10, R10;
            jz aligned;

            // Get the number of bytes to be written until we are aligned.
            mov RDX, 0x10;
            sub RDX, R10;
        }
        mixin(MovArrayPointer!"R9");
        asm pure nothrow @nogc
        {
        naligned:
            mov [ R9 ], AL; // Write a byte.

            // Advance the pointer. Decrease the total number of bytes
            // and the misaligned ones.
            inc R9;
            dec RDX;
            dec R8;

            // Checks if we are aligned.
            test RDX, RDX;
            jnz naligned;

        aligned:
            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

            // Write 16 bytes at a time.
            cmp R8, 32;
            jl aligned_16;

            // Write 32 bytes at a time.
            cmp R8, 64;
            jl aligned_32;

        aligned_64:
            movdqa [ R9 ], XMM0;
            movdqa [ R9 + 16 ], XMM0;
            movdqa [ R9 + 32 ], XMM0;
            movdqa [ R9 + 48 ], XMM0;

            add R9, 64;
            sub R8, 64;
            cmp R8, 64;
            jge aligned_64;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

            // Write 16 bytes at a time.
            cmp R8, 32;
            jl aligned_16;

        aligned_32:
            movdqa [ R9 ], XMM0;
            movdqa [ R9 + 16 ], XMM0;

            add R9, 32;
            sub R8, 32;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

            // Write 8 bytes at a time.
            cmp R8, 16;
            jl aligned_8;

        aligned_16:
            movdqa [ R9 ], XMM0;

            add R9, 16;
            sub R8, 16;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

            // Write 1 byte at a time.
            cmp R8, 8;
            jl aligned_1;

        aligned_8:
            mov [ R9 ], RAX;

            add R9, 8;
            sub R8, 8;

            // Checks if we're done writing bytes.
            test R8, R8;
            jz end;

        aligned_1:
            mov [ R9 ], AL;

            inc R9;
            dec R8;

            test R8, R8;
            jnz aligned_1;

        end:
            ret;
        }
    }
}

pragma(inline, true)
package (tanya.memory) void copyBackward(const void[] source, void[] target)
pure nothrow @system @nogc
{
    asm pure nothrow @nogc
    {
        naked;

        // Save the registers that should be restored.
        mov R8, RSI;
        mov R9, RDI;
    }
    // Prepare the registers for movsb.
    version (Windows) asm pure nothrow @nogc
    {
        // RDX - source.
        // RCX - target.

        mov RAX, [ RCX + 8 ];
        mov R10, [ RDX + 8 ];
        mov RCX, [ RDX ];

        lea RDI, [ RAX + RCX - 1 ];
        lea RSI, [ R10 + RCX - 1 ];
    }
    else asm pure nothrow @nogc
    {
        // RDX - source length.
        // RCX - source data.
        // RDI - target length.
        // RSI - target data.

        lea RDI, [ RSI + RDX - 1 ];
        lea RSI, [ RCX + RDX - 1 ];
        mov RCX, RDX;
    }
    asm pure nothrow @nogc
    {
        std; // Set the direction flag.

        rep;
        movsb;

        cld; // Clear the direction flag.

        // Restore registers.
        mov RDI, R9;
        mov RSI, R8;

        ret;
    }
}
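
For readers not fluent in x86-64 assembly, the std/rep/movsb/cld sequence above amounts to a descending byte loop. The helper below is a rough, hypothetical D equivalent written only for explanation; it is not code from this commit, and the library's actual non-assembly fallback appears in tanya.memory.op below.

// Hypothetical illustration of the movsb loop above: RSI and RDI start at
// the last byte of each block and are decremented after every copied byte
// while RCX counts down to zero.
private void copyBackwardBytes(const(ubyte)* source, ubyte* target, size_t count)
pure nothrow @system @nogc
{
    while (count--)
    {
        target[count] = source[count];
    }
}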

tanya/memory/op.d

@@ -17,21 +17,23 @@ version (D_InlineAsm_X86_64)
     static import tanya.memory.arch.x86_64;
 }
 
-private enum alignmentMask = size_t.sizeof - 1;
+private enum alignMask = size_t.sizeof - 1;
 
 /**
  * Copies $(D_PARAM source) into $(D_PARAM target).
  *
- * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that an element
- * of $(D_PARAM target) points to an element of $(D_PARAM source).
+ * $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
+ * $(D_PARAM source) points ahead of $(D_PARAM target).
  *
- * $(D_PARAM target) shall have enough space $(D_INLINECODE source.length)
+ * $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
  * elements.
  *
  * Params:
  *  source = Memory to copy from.
  *  target = Destination memory.
  *
+ * See_Also: $(D_PSYMBOL copyBackward).
+ *
  * Precondition: $(D_INLINECODE source.length <= target.length).
  */
 void copy(const void[] source, void[] target) pure nothrow @trusted @nogc
@@ -53,8 +55,8 @@ body
 
     // Check if the pointers are aligned or at least can be aligned
    // properly.
-    ushort naligned = (cast(size_t) source.ptr) & alignmentMask;
-    if (naligned == ((cast(size_t) target.ptr) & alignmentMask))
+    ushort naligned = (cast(size_t) source.ptr) & alignMask;
+    if (naligned == ((cast(size_t) target.ptr) & alignMask))
     {
         // Align the pointers if possible.
         if (naligned != 0)
@@ -135,7 +137,7 @@ package template FilledBytes(ubyte Byte, ubyte I = 0)
  *  Byte = The value to fill $(D_PARAM memory) with.
  *  memory = Memory block.
  */
-void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
+void fill(ubyte Byte = 0)(void[] memory) @trusted
 {
     version (D_InlineAsm_X86_64)
     {
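
A minimal usage sketch for the signature above, not part of the commit and assuming the module is imported as tanya.memory.op. Since fill is a function template, pure, nothrow and @nogc can presumably still be inferred after the explicit attributes were dropped, so a call like this remains usable from such contexts.

pure nothrow @safe @nogc unittest
{
    ubyte[8] buffer = 0xff;

    fill!0(buffer[]);    // The default, Byte = 0: zero the block.
    assert(buffer == ubyte[8].init);

    fill!0xa5(buffer[]); // Fill with an arbitrary byte value.
    assert(buffer[0] == 0xa5 && buffer[$ - 1] == 0xa5);
}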
@@ -147,7 +149,7 @@ void fill(ubyte Byte = 0)(void[] memory) pure nothrow @trusted @nogc
         ubyte* vp = cast(ubyte*) memory.ptr;
 
         // Align.
-        while (((cast(size_t) vp) & alignmentMask) != 0)
+        while (((cast(size_t) vp) & alignMask) != 0)
         {
             *vp++ = Byte;
             --n;
@@ -206,3 +208,80 @@ pure nothrow @safe @nogc private unittest
        }
    }
}
/**
* Copies starting from the end of $(D_PARAM source) into the end of
* $(D_PARAM target).
*
* $(D_PSYMBOL copyBackward) copies the elements in reverse order, but the
* order of elements in the $(D_PARAM target) is exactly the same as in the
* $(D_PARAM source).
*
* $(D_PARAM source) and $(D_PARAM target) shall not overlap so that
* $(D_PARAM target) points ahead of $(D_PARAM source).
*
* $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
* elements.
*
* Params:
* source = Memory to copy from.
* target = Destination memory.
*
* See_Also: $(D_PSYMBOL copy).
*
* Precondition: $(D_INLINECODE source.length <= target.length).
*/
void copyBackward(const void[] source, void[] target) pure nothrow @trusted @nogc
in
{
assert(source.length <= target.length);
}
body
{
version (D_InlineAsm_X86_64)
{
tanya.memory.arch.x86_64.copyBackward(source, target);
}
else // Naive implementation.
{
auto count = source.length;
// Try to align the pointers if possible.
if (((cast(size_t) source.ptr) & alignMask) == ((cast(size_t) target.ptr) & alignMask))
{
while (((cast(size_t) (source.ptr + count)) & alignMask) != 0)
{
if (!count--)
{
return;
}
(cast(ubyte[]) target)[count]
= (cast(const(ubyte)[]) source)[count];
}
}
// Write as long as we're aligned.
for (; count >= size_t.sizeof; count -= size_t.sizeof)
{
*(cast(size_t*) (target.ptr + count - size_t.sizeof))
= *(cast(const(size_t)*) (source.ptr + count - size_t.sizeof));
}
// Write the remaining bytes.
while (count--)
{
(cast(ubyte[]) target)[count]
= (cast(const(ubyte)[]) source)[count];
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[6] mem = [ 'a', 'a', 'b', 'b', 'c', 'c' ];
ubyte[6] expected = [ 'a', 'a', 'a', 'a', 'b', 'b' ];
copyBackward(mem[0 .. 4], mem[2 .. $]);
assert(expected == mem);
}
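
A companion sketch to the unit test above, not part of the commit and assuming copy tolerates an overlap in which the target begins before the source, the mirror case of the copyBackward example as documented: the plain copy is then the counterpart for shifting an overlapping range towards the beginning of a buffer.

pure nothrow @safe @nogc unittest
{
    ubyte[6] mem = [ 'a', 'a', 'b', 'b', 'c', 'c' ];
    ubyte[6] expected = [ 'b', 'b', 'c', 'c', 'c', 'c' ];

    // The target starts before the overlapping source, so the forward
    // copy is the safe direction for this in-place shift.
    copy(mem[2 .. $], mem[0 .. 4]);
    assert(expected == mem);
}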