Replace inline assembly with GAS.
This commit is contained in:
Eugen Wissner 2017-09-26 08:26:12 +02:00
parent 20e7df386b
commit faebf3e4d5
7 changed files with 322 additions and 366 deletions

View File

@ -5,6 +5,9 @@ rule archive
command = ar rcs $out $in
build abs.o: gas x64/linux/math/abs.S
build cmp.o: gas x64/linux/memory/cmp.S
build fill.o: gas x64/linux/memory/fill.S
build copy.o: gas x64/linux/memory/copy.S
build syscall.o: gas x64/linux/syscall.S
build tanya.a: archive syscall.o abs.o
build tanya.a: archive syscall.o copy.o fill.o cmp.o abs.o

View File

@ -0,0 +1,67 @@
.text
/*
 * int cmpMemory(const void[] r1, const void[] r2)
 *
 * Compares two memory blocks like memcmp: the lengths are compared
 * first (the shorter block is "less"), then the contents are compared
 * as unsigned bytes.
 *
 * In (System V AMD64; a D slice is passed as a length/pointer pair):
 *   rdi - r1 length
 *   rsi - r1 data
 *   rdx - r2 length
 *   rcx - r2 data
 *
 * Out:
 *   rax - -1 if r1 < r2, 0 if the blocks are equal, 1 if r1 > r2
 *
 * Clobbers: rcx, rdx, rsi, rdi, flags.
 */
.globl _D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi
.type _D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi, @function

_D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi:
	// Compare the lengths. size_t is unsigned, so use jb/ja, not jl/jg
	cmp %rdx, %rdi
	jb .Lless
	ja .Lgreater
	mov %rcx, %rdi // rdi = r2 data, rsi = r1 data, rdx = common length

	// Fewer than 8 bytes: compare them one by one
	cmp $0x08, %rdx
	jb .Laligned_1
	// Compare single bytes until rdi is 8-byte aligned
	test $0x07, %edi
	jz .Laligned_8
.Lnaligned:
	cmpsb // Bytes are unsigned: jb/ja, not jl/jg
	jb .Lless
	ja .Lgreater
	dec %rdx
	test $0x07, %edi
	jnz .Lnaligned
.Laligned_8:
	mov %rdx, %rcx
	shr $0x03, %rcx // rcx = number of whole quadwords
	repe cmpsq
	jne .Lmismatch_8 // A quadword differed: recompare it bytewise
	and $0x07, %edx // Bytes remaining after the quadword loop
	jz .Lequal
.Laligned_1: // Compare the remaining bytes
	mov %rdx, %rcx
	jrcxz .Lequal // rep with rcx == 0 would leave stale flags behind
	repe cmpsb
	jb .Lless
	ja .Lgreater
.Lequal:
	xor %eax, %eax // Return 0
	ret

	// repe cmpsq stopped on a quadword that differs. Comparing the
	// quadwords numerically would give the wrong order on little-endian,
	// so step back and find the first differing byte instead.
.Lmismatch_8:
	sub $0x08, %rsi
	sub $0x08, %rdi
	mov $0x08, %rcx
	repe cmpsb // The quadwords differ, so cmpsb always stops here
	ja .Lgreater
.Lless:
	mov $-0x01, %rax
	ret
.Lgreater:
	mov $0x01, %rax
	ret

.size _D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi, .-_D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi

View File

@ -0,0 +1,67 @@
.text
/*
 * void copyMemory(const void[] source, void[] target)
 *
 * Copies source into target front to back (memcpy-style; the forward
 * copy presumably requires non-overlapping blocks — overlapping copies
 * go through moveMemory).
 *
 * In (System V AMD64; a D slice is passed as a length/pointer pair):
 *   rdi - source length
 *   rsi - source data
 *   rdx - target length (unused, clobbered)
 *   rcx - target data
 *
 * Clobbers: rcx, rdx, rsi, rdi, flags.
 */
.globl _D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv
.type _D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv, @function

_D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv:
	mov %rdi, %rdx // rdx = number of bytes to copy (source length)
	mov %rcx, %rdi // rdi = target data, rsi = source data
	// Fewer than 8 bytes: copy them one by one
	cmp $0x08, %rdx
	jb .Laligned_1
	// Copy single bytes until rdi is 8-byte aligned
	test $0x07, %edi
	jz .Laligned_8
.Lnaligned:
	movsb
	dec %rdx
	test $0x07, %edi
	jnz .Lnaligned
.Laligned_8:
	mov %rdx, %rcx
	shr $0x03, %rcx // rcx = number of whole quadwords
	rep movsq
	and $0x07, %edx // Bytes remaining after the quadword loop
	jz .Lend
.Laligned_1:
	// Write the remaining bytes
	mov %rdx, %rcx
	rep movsb
.Lend:
	ret

.size _D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv, .-_D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv
/*
 * void moveMemory(const void[] source, void[] target)
 *
 * Copies source into target back to front, so the copy stays correct
 * when the blocks overlap and the target starts inside the source
 * (the D fallback for this operation is memmove).
 *
 * In (System V AMD64; a D slice is passed as a length/pointer pair):
 *   rdi - source length
 *   rsi - source data
 *   rdx - target length (unused, clobbered)
 *   rcx - target data
 *
 * Clobbers: rcx, rdx, rsi, rdi, flags.
 */
.globl _D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv
.type _D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv, @function

_D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv:
	mov %rdi, %rdx // rdx = number of bytes to copy
	// Point rsi/rdi at the LAST byte of source and target
	lea -1(%rdx, %rsi), %rsi
	lea -1(%rdx, %rcx), %rdi
	mov %rdx, %rcx
	std // Set the direction flag: movsb walks downwards
	rep movsb
	cld // Clear the direction flag: the ABI requires DF = 0 on return
	ret

.size _D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv, .-_D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv

View File

@ -0,0 +1,155 @@
.text
/*
 * void fillMemory(void[] memory, ulong value)
 *
 * Fills a memory block with a repeating byte pattern. "value" carries
 * the fill byte replicated into all 8 bytes (filledBytes in
 * tanya.memory.op), so %dl is the fill byte itself.
 *
 * In (System V AMD64; a D slice is passed as a length/pointer pair):
 *   rdi - length
 *   rsi - pointer
 *   rdx - value filled with a byte
 *
 * Clobbers: rax, rcx, r8, r9, xmm0, flags.
 */
.globl _D5tanya6memory2op10fillMemoryFNaNbNiAvmZv
.type _D5tanya6memory2op10fillMemoryFNaNbNiAvmZv, @function

_D5tanya6memory2op10fillMemoryFNaNbNiAvmZv:
	// Check for zero length
	test %rdi, %rdi
	jz .Lend
	mov %rdi, %rax // rax = bytes still to write
	// Replicate the 8-byte pattern into both halves of xmm0 (16 bytes)
	movq %rdx, %xmm0
	movlhps %xmm0, %xmm0
	mov %rsi, %r8 // r8 = write cursor
	// r9 = misalignment of the pointer within its 16-byte block
	mov %rsi, %r9
	and $0x0f, %r9
	jz .Laligned
	// rcx = bytes until the next 16-byte boundary, clamped to the total
	// length: a fill shorter than the padding must not overrun the block
	mov $0x10, %rcx
	sub %r9, %rcx
	cmp %rax, %rcx
	cmova %rax, %rcx // rcx = min(rcx, rax)
	sub %rcx, %rax // Account for the bytes written below
.Lnaligned:
	mov %dl, (%r8) // Write a byte
	inc %r8
	dec %rcx
	jnz .Lnaligned
.Laligned:
	// r8 is now 16-byte aligned (or rax == 0). Lengths are unsigned,
	// so all the size comparisons below use jb/jae, not jl/jge.
	test %rax, %rax
	jz .Lend
	cmp $8, %rax
	jb .Laligned_1 // Write 1 byte at a time
	cmp $16, %rax
	jb .Laligned_8 // Write 8 bytes at a time
	cmp $32, %rax
	jb .Laligned_16 // Write 16 bytes at a time
	cmp $64, %rax
	jb .Laligned_32 // Write 32 bytes at a time
.Laligned_64: // Write 64 bytes at a time
	movdqa %xmm0, (%r8)
	movdqa %xmm0, 16(%r8)
	movdqa %xmm0, 32(%r8)
	movdqa %xmm0, 48(%r8)
	add $64, %r8
	sub $64, %rax
	cmp $64, %rax
	jae .Laligned_64
	// Dispatch on the remaining byte count
	test %rax, %rax
	jz .Lend
	cmp $8, %rax
	jb .Laligned_1
	cmp $16, %rax
	jb .Laligned_8
	cmp $32, %rax
	jb .Laligned_16
.Laligned_32: // Write 32 bytes at a time
	movdqa %xmm0, (%r8)
	movdqa %xmm0, 16(%r8)
	add $32, %r8
	sub $32, %rax
	test %rax, %rax
	jz .Lend
	cmp $8, %rax
	jb .Laligned_1
	cmp $16, %rax
	jb .Laligned_8
.Laligned_16: // Write 16 bytes at a time
	movdqa %xmm0, (%r8)
	add $16, %r8
	sub $16, %rax
	test %rax, %rax
	jz .Lend
	cmp $8, %rax
	jb .Laligned_1
.Laligned_8: // Write 8 bytes at a time
	mov %rdx, (%r8)
	add $8, %r8
	sub $8, %rax
	test %rax, %rax
	jz .Lend
.Laligned_1: // Write the remaining bytes one by one
	mov %dl, (%r8)
	inc %r8
	dec %rax
	jnz .Laligned_1
.Lend:
	ret

.size _D5tanya6memory2op10fillMemoryFNaNbNiAvmZv, .-_D5tanya6memory2op10fillMemoryFNaNbNiAvmZv

View File

@ -18,9 +18,10 @@
{
"name": "native",
"targetType": "library",
"platforms": ["linux-x86_64"],
"platforms": ["linux-x86_64-gdc"],
"preBuildCommands": ["ninja -C arch"],
"lflags": ["arch/tanya.a"]
"lflags": ["arch/tanya.a"],
"versions": ["TanyaNative"]
}
]
}

View File

@ -1,346 +0,0 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
/*
* Implementations of functions found in $(D_PSYMBOL tanya.memory.op) for x64.
*
* Copyright: Eugene Wissner 2017.
* License: $(LINK2 https://www.mozilla.org/en-US/MPL/2.0/,
* Mozilla Public License, v. 2.0).
* Authors: $(LINK2 mailto:info@caraus.de, Eugene Wissner)
* Source: $(LINK2 https://github.com/caraus-ecms/tanya/blob/master/source/tanya/memory/arch/x86_64.d,
* tanya/memory/arch/x86_64.d)
*/
module tanya.memory.arch.x86_64;
import tanya.memory.op;
version (D_InlineAsm_X86_64):
/**
 * Copies $(D_PARAM source) into $(D_PARAM target) front to back,
 * 8 bytes at a time once the destination pointer is 8-byte aligned.
 *
 * The register comments below document the (length, pointer) pairs this
 * naked function receives; RDI and RSI are restored before returning.
 *
 * Params:
 *  source = Memory block to copy from.
 *  target = Destination memory block.
 */
pragma(inline, true)
package (tanya.memory) void copy(const void[] source, void[] target)
pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
// RDI and RSI should be preserved.
mov RAX, RDI;
mov R8, RSI;
// RDX - source length.
// RCX - source data.
// RDI - target length
// RSI - target data.
mov RDI, RSI;
mov RSI, RCX;
// Fewer than 8 bytes? Copy them one by one (JC == JB, unsigned).
cmp RDX, 0x08;
jc aligned_1;
// Copy single bytes until the destination is 8-byte aligned.
test EDI, 0x07;
jz aligned_8;
naligned:
movsb;
dec RDX;
test EDI, 0x07;
jnz naligned;
aligned_8:
// Copy whole quadwords (RDX / 8 of them).
mov RCX, RDX;
shr RCX, 0x03;
rep;
movsq;
// RDX & 7 bytes remain after the quadword loop.
and EDX, 0x07;
jz end;
aligned_1:
// Write the remaining bytes.
mov RCX, RDX;
rep;
movsb;
end: // Restore registers.
mov RSI, R8;
mov RDI, RAX;
ret;
}
}
/**
 * Fills a memory block with a repeating byte pattern using aligned SSE
 * stores (64/32/16 bytes per iteration, falling back to 8- and 1-byte
 * writes for the tail).
 *
 * The second argument holds the fill byte replicated into every byte of
 * the ulong (see $(D_PSYMBOL filledBytes) in tanya.memory.op), so DIL is
 * the fill byte itself.
 *
 * NOTE(review): RCX (bytes needed to reach 16-byte alignment) is never
 * clamped to the total length; if the block is shorter than that padding,
 * RAX appears to underflow in the "naligned" loop and the following loops
 * would write past the block — confirm callers never hit this case.
 */
pragma(inline, true)
package (tanya.memory) void fill(void[], ulong) pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
// Check for zero length.
test RSI, RSI;
jz end;
/*
* RDX - pointer.
* RSI - length.
* RDI - value filled with a byte.
*/
mov RAX, RSI;
mov R8, RDX;
movq XMM0, RDI;
movlhps XMM0, XMM0;
// Check if the pointer is aligned to a 16-byte boundary.
and R8, -0x10;
// Compute the number of misaligned bytes.
mov R9, RDX;
sub R9, R8;
test R9, R9;
jz aligned;
// Get the number of bytes to be written until we are aligned.
mov RCX, 0x10;
sub RCX, R9;
mov R8, RDX;
naligned:
mov [ R8 ], DIL; // Write a byte.
// Advance the pointer. Decrease the total number of bytes
// and the misaligned ones.
inc R8;
dec RCX;
dec RAX;
// Checks if we are aligned.
test RCX, RCX;
jnz naligned;
aligned:
// Checks if we're done writing bytes.
test RAX, RAX;
jz end;
// Write 1 byte at a time.
cmp RAX, 8;
jl aligned_1;
// Write 8 bytes at a time.
cmp RAX, 16;
jl aligned_8;
// Write 16 bytes at a time.
cmp RAX, 32;
jl aligned_16;
// Write 32 bytes at a time.
cmp RAX, 64;
jl aligned_32;
aligned_64:
movdqa [ R8 ], XMM0;
movdqa [ R8 + 16 ], XMM0;
movdqa [ R8 + 32 ], XMM0;
movdqa [ R8 + 48 ], XMM0;
add R8, 64;
sub RAX, 64;
cmp RAX, 64;
jge aligned_64;
// Checks if we're done writing bytes.
test RAX, RAX;
jz end;
// Write 1 byte at a time.
cmp RAX, 8;
jl aligned_1;
// Write 8 bytes at a time.
cmp RAX, 16;
jl aligned_8;
// Write 16 bytes at a time.
cmp RAX, 32;
jl aligned_16;
aligned_32:
movdqa [ R8 ], XMM0;
movdqa [ R8 + 16 ], XMM0;
add R8, 32;
sub RAX, 32;
// Checks if we're done writing bytes.
test RAX, RAX;
jz end;
// Write 1 byte at a time.
cmp RAX, 8;
jl aligned_1;
// Write 8 bytes at a time.
cmp RAX, 16;
jl aligned_8;
aligned_16:
movdqa [ R8 ], XMM0;
add R8, 16;
sub RAX, 16;
// Checks if we're done writing bytes.
test RAX, RAX;
jz end;
// Write 1 byte at a time.
cmp RAX, 8;
jl aligned_1;
aligned_8:
mov [ R8 ], RDI;
add R8, 8;
sub RAX, 8;
// Checks if we're done writing bytes.
test RAX, RAX;
jz end;
aligned_1:
mov [ R8 ], DIL;
inc R8;
dec RAX;
test RAX, RAX;
jnz aligned_1;
end:
ret;
}
}
/**
 * Copies $(D_PARAM source) into $(D_PARAM target) back to front, so the
 * copy stays correct when the blocks overlap and the target starts
 * inside the source.
 *
 * Params:
 *  source = Memory block to copy from.
 *  target = Destination memory block.
 */
pragma(inline, true)
package (tanya.memory) void copyBackward(const void[] source, void[] target)
pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
// Save the registers that must be restored before returning.
mov R8, RSI;
mov R9, RDI;
// RDX - source length.
// RCX - source data.
// RDI - target length
// RSI - target data.
// Point RDI/RSI at the LAST byte of target and source.
lea RDI, [ RSI + RDX - 1 ];
lea RSI, [ RCX + RDX - 1 ];
mov RCX, RDX;
std; // Set the direction flag: movsb walks downwards.
rep;
movsb;
cld; // Clear the direction flag.
// Restore registers.
mov RDI, R9;
mov RSI, R8;
ret;
}
}
/**
 * Compares two memory blocks, the lengths first and then the contents,
 * returning a negative, zero or positive value in RAX (the Phobos
 * fallback for this operation is memcmp).
 *
 * NOTE(review): the conditional jumps below are signed (JL/JG) although
 * lengths and bytes are unsigned, and the quadword comparison (cmpsq)
 * does not match memcmp's byte order on little-endian — confirm the
 * intended ordering semantics against the memcmp fallback.
 */
pragma(inline, true)
package (tanya.memory) int cmp(const void[] r1, const void[] r2)
pure nothrow @system @nogc
{
asm pure nothrow @nogc
{
naked;
// RDI and RSI should be preserved.
mov R9, RDI;
mov R8, RSI;
// RDX - r1 length.
// RCX - r1 data.
// RDI - r2 length
// RSI - r2 data.
mov RSI, RCX;
mov RCX, RDI;
mov RDI, R8;
// Compare the lengths.
cmp RDX, RCX;
jl less;
jg greater;
// Check if we're aligned.
cmp RDX, 0x08;
jc aligned_1;
test EDI, 0x07;
jz aligned_8;
naligned:
cmpsb;
jl less;
jg greater;
dec RDX;
test EDI, 0x07;
jnz naligned;
aligned_8:
mov RCX, RDX;
shr RCX, 0x03;
repe;
cmpsq;
jl less;
jg greater;
and EDX, 0x07;
jz equal;
aligned_1: // Compare the remaining bytes.
// NOTE(review): with RCX == 0 (two empty blocks) REPE CMPSB executes
// nothing and leaves stale flags, so the JL below may return -1 for
// equal empty inputs — verify.
mov RCX, RDX;
repe;
cmpsb;
jl less;
jg greater;
equal:
xor RAX, RAX; // Return 0.
jmp end;
greater:
mov RAX, 1;
jmp end;
less:
mov RAX, -1;
jmp end;
end: // Restore registers.
mov RSI, R8;
mov RDI, R9;
ret;
}
}

View File

@ -14,13 +14,22 @@
*/
module tanya.memory.op;
version (TanyaPhobos)
version (TanyaNative)
{
import core.stdc.string;
extern private void fillMemory(void[], size_t) pure nothrow @system @nogc;
extern private void copyMemory(const void[], void[])
pure nothrow @system @nogc;
extern private void moveMemory(const void[], void[])
pure nothrow @system @nogc;
extern private int cmpMemory(const void[], const void[])
pure nothrow @system @nogc;
}
else
{
static import tanya.memory.arch.x86_64;
import core.stdc.string;
}
private enum alignMask = size_t.sizeof - 1;
@ -49,13 +58,13 @@ in
}
body
{
version (TanyaPhobos)
version (TanyaNative)
{
memcpy(target.ptr, source.ptr, source.length);
copyMemory(source, target);
}
else
{
tanya.memory.arch.x86_64.copy(source, target);
memcpy(target.ptr, source.ptr, source.length);
}
}
@ -112,13 +121,13 @@ private template filledBytes(ubyte Byte, ubyte I = 0)
*/
void fill(ubyte c = 0)(void[] memory) @trusted
{
version (TanyaPhobos)
version (TanyaNative)
{
memset(memory.ptr, c, memory.length);
fillMemory(memory, filledBytes!c);
}
else
{
tanya.memory.arch.x86_64.fill(memory, filledBytes!c);
memset(memory.ptr, c, memory.length);
}
}
@ -187,13 +196,13 @@ in
}
body
{
version (TanyaPhobos)
version (TanyaNative)
{
memmove(target.ptr, source.ptr, source.length);
moveMemory(source, target);
}
else
{
tanya.memory.arch.x86_64.copyBackward(source, target);
memmove(target.ptr, source.ptr, source.length);
}
}
@ -235,7 +244,11 @@ private nothrow @safe @nogc unittest
*/
int cmp(const void[] r1, const void[] r2) pure nothrow @trusted @nogc
{
version (TanyaPhobos)
version (TanyaNative)
{
return cmpMemory(r1, r2);
}
else
{
if (r1.length > r2.length)
{
@ -243,10 +256,6 @@ int cmp(const void[] r1, const void[] r2) pure nothrow @trusted @nogc
}
return r1.length < r2.length ? -1 : memcmp(r1.ptr, r2.ptr, r1.length);
}
else
{
return tanya.memory.arch.x86_64.cmp(r1, r2);
}
}
///