Add fast function to zero memory

Eugen Wissner 2017-08-02 05:31:08 +02:00
parent 1a4d1238a1
commit ed92e3993e
2 changed files with 271 additions and 10 deletions


@@ -79,3 +79,192 @@ pure nothrow @system @nogc
ret;
}
}
pragma(inline, true)
package (tanya.memory) void zero(void[] memory)
pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
}
version (Windows) asm pure nothrow @nogc
{
/*
* RCX - pointer to the slice: length at [ RCX ], data pointer at [ RCX + 8 ].
*/
mov R8, [ RCX ];
mov R9, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
/*
* RSI - pointer.
* RDI - length.
*/
mov R8, RDI;
mov R9, RSI;
}
asm pure nothrow @nogc
{
// Check for zero length.
test R8, R8;
jz end;
// Zero out XMM0; it is the 16-byte source for the aligned stores.
pxor XMM0, XMM0;
// Round the pointer down to the previous 16-byte boundary.
and R9, -0x10;
}
// Compute the number of misaligned bytes.
version (Windows) asm pure nothrow @nogc
{
mov RAX, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
mov RAX, RSI;
}
asm pure nothrow @nogc
{
sub RAX, R9;
test RAX, RAX;
jz aligned;
// Get the number of bytes to be written until we are aligned.
mov RDX, 0x10;
sub RDX, RAX;
}
version (Windows) asm pure nothrow @nogc
{
mov R9, [ RCX + 8 ];
}
else asm pure nothrow @nogc
{
mov R9, RSI;
}
asm pure nothrow @nogc
{
// Zero RAX so AL and RAX can be used for byte and qword stores.
xor RAX, RAX;
naligned:
mov [ R9 ], AL; // Write a byte.
// Advance the pointer and decrease both the total byte count
// and the remaining misalignment count.
inc R9;
dec RDX;
dec R8;
// Loop until the pointer reaches the 16-byte boundary.
test RDX, RDX;
jnz naligned;
aligned:
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
// Fewer than 32 bytes left: write 16 bytes at a time.
cmp R8, 32;
jl aligned_16;
// Fewer than 64 bytes left: write 32 bytes at a time.
cmp R8, 64;
jl aligned_32;
aligned_64:
movdqa [ R9 ], XMM0;
movdqa [ R9 + 16 ], XMM0;
movdqa [ R9 + 32 ], XMM0;
movdqa [ R9 + 48 ], XMM0;
add R9, 64;
sub R8, 64;
cmp R8, 64;
jge aligned_64;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
// Fewer than 32 bytes left: write 16 bytes at a time.
cmp R8, 32;
jl aligned_16;
aligned_32:
movdqa [ R9 ], XMM0;
movdqa [ R9 + 16 ], XMM0;
add R9, 32;
sub R8, 32;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
// Fewer than 16 bytes left: write 8 bytes at a time.
cmp R8, 16;
jl aligned_8;
aligned_16:
movdqa [ R9 ], XMM0;
add R9, 16;
sub R8, 16;
// Check if we're done writing.
test R8, R8;
jz end;
// Fewer than 8 bytes left: write single bytes.
cmp R8, 8;
jl aligned_1;
aligned_8:
mov [ R9 ], RAX;
add R9, 8;
sub R8, 8;
// Check if we're done writing.
test R8, R8;
jz end;
aligned_1:
mov [ R9 ], AL;
inc R9;
dec R8;
test R8, R8;
jnz aligned_1;
end:
ret;
}
}
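
The single-byte prologue above exists because movdqa requires 16-byte alignment: the pointer is rounded down to the previous 16-byte boundary, the difference gives the misalignment, and 0x10 minus that difference is the number of bytes written one at a time before the wide stores start. A plain-D sketch of the same arithmetic (illustration only; the helper name is not part of this commit):

// Mirrors the prologue computation above:
// and R9, -0x10; sub RAX, R9; mov RDX, 0x10; sub RDX, RAX.
size_t bytesUntilAligned(const void* ptr) pure nothrow @nogc
{
    const addr = cast(size_t) ptr;
    const aligned = addr & ~cast(size_t) 0xf; // round down to 16 bytes
    const misalignment = addr - aligned;
    // Already aligned: the movdqa stores can start immediately.
    return misalignment == 0 ? 0 : 0x10 - misalignment;
}

For example, a pointer whose address ends in 0x9 gives 0x10 - 0x9 = 7 single-byte writes before the first movdqa.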


@@ -1,4 +1,4 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
@@ -17,6 +17,8 @@ version (D_InlineAsm_X86_64)
static import tanya.memory.arch.x86_64;
}
private enum alignmentMask = size_t.sizeof - 1;
/**
* Copies $(D_PARAM source) into $(D_PARAM target).
*
@@ -24,7 +26,7 @@ version (D_InlineAsm_X86_64)
* of $(D_PARAM target) points to an element of $(D_PARAM source).
*
* $(D_PARAM target) shall have enough space for $(D_INLINECODE source.length)
* elements.
*
* Params:
* source = Memory to copy from.
@@ -48,7 +50,6 @@ body
auto source1 = cast(const(ubyte)*) source;
auto target1 = cast(ubyte*) target;
auto count = source.length;
enum alignmentMask = size_t.sizeof - 1;
// Check if the pointers are aligned or at least can be aligned
// properly.
@@ -79,19 +80,17 @@ body
while (count--)
{
*target1++ = *source1++;
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[9] source = [1, 2, 3, 4, 5, 6, 7, 8, 9];
ubyte[9] target;
source.copy(target);
assert(source == target);
}
private pure nothrow @safe @nogc unittest
@@ -113,3 +112,76 @@ private pure nothrow @safe @nogc unittest
assert(source == target);
}
}
/**
* Fills $(D_PARAM memory) with zero-valued bytes.
*
* Params:
* memory = Memory block.
*/
void zero(void[] memory) pure nothrow @trusted @nogc
{
version (D_InlineAsm_X86_64)
{
tanya.memory.arch.x86_64.zero(memory);
}
else // Naive implementation.
{
auto n = memory.length;
ubyte* vp = cast(ubyte*) memory.ptr;
// Write single bytes until the pointer is word-aligned.
while (((cast(size_t) vp) & alignmentMask) != 0)
{
*vp++ = 0;
--n;
}
// Write size_t.sizeof bytes at a time.
auto sp = cast(size_t*) vp;
while (n / size_t.sizeof > 0)
{
*sp++ = 0;
n -= size_t.sizeof;
}
// Write the remaining bytes.
vp = cast(ubyte*) sp;
while (n--)
{
*vp = 0;
++vp;
}
}
}
///
pure nothrow @safe @nogc unittest
{
ubyte[9] memory = [1, 2, 3, 4, 5, 6, 7, 8, 9];
memory.zero();
foreach (ubyte v; memory)
{
assert(v == 0);
}
}
// Stress test. Checks that `zero` can handle unaligned pointers and different
// lengths.
pure nothrow @safe @nogc private unittest
{
ubyte[192] memory;
foreach (j; 0 .. 192)
{
foreach (ubyte i, ref ubyte v; memory[j .. $])
{
v = i;
}
zero(memory[j .. $]);
foreach (ubyte v; memory[j .. $])
{
assert(v == 0);
}
}
}
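
As a usage sketch (not part of this commit), zero is the kind of helper a caller might use to scrub sensitive data before a buffer goes out of scope. The module path in the import and the handleSecret function below are illustrative assumptions:

import tanya.memory.op : zero; // assumed module path for the new function

void handleSecret() pure nothrow @safe @nogc
{
    ubyte[32] key = 0xff; // stand-in for sensitive material
    // ... use the key ...
    zero(key[]); // wipe the bytes before the frame is reused
    assert(key[0] == 0 && key[$ - 1] == 0);
}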