From faebf3e4d5f6fcdfe1ad9f30c75fb478a7259fd1 Mon Sep 17 00:00:00 2001
From: Eugen Wissner
Date: Tue, 26 Sep 2017 08:26:12 +0200
Subject: [PATCH] Fix #304

Replace inline assembly with GAS.
---
 arch/build.ninja                  |   5 +-
 arch/x64/linux/memory/cmp.S       |  81 +++++++
 arch/x64/linux/memory/copy.S      |  67 ++++++
 arch/x64/linux/memory/fill.S      | 155 +++++++++++++
 dub.json                          |   5 +-
 source/tanya/memory/arch/x86_64.d | 346 ------------------------------
 source/tanya/memory/op.d          |  43 ++--
 7 files changed, 336 insertions(+), 366 deletions(-)
 create mode 100644 arch/x64/linux/memory/cmp.S
 create mode 100644 arch/x64/linux/memory/copy.S
 create mode 100644 arch/x64/linux/memory/fill.S
 delete mode 100644 source/tanya/memory/arch/x86_64.d

diff --git a/arch/build.ninja b/arch/build.ninja
index eff2eb9..fce2838 100644
--- a/arch/build.ninja
+++ b/arch/build.ninja
@@ -5,6 +5,9 @@ rule archive
   command = ar rcs $out $in
 
 build abs.o: gas x64/linux/math/abs.S
+build cmp.o: gas x64/linux/memory/cmp.S
+build fill.o: gas x64/linux/memory/fill.S
+build copy.o: gas x64/linux/memory/copy.S
 build syscall.o: gas x64/linux/syscall.S
 
-build tanya.a: archive syscall.o abs.o
+build tanya.a: archive syscall.o copy.o fill.o cmp.o abs.o
diff --git a/arch/x64/linux/memory/cmp.S b/arch/x64/linux/memory/cmp.S
new file mode 100644
index 0000000..169e2eb
--- /dev/null
+++ b/arch/x64/linux/memory/cmp.S
@@ -0,0 +1,81 @@
+	.text
+
+/*
+ * cmpMemory.
+ *
+ * Compares two memory blocks. The lengths are compared first; blocks of
+ * equal length are then compared bytewise as unsigned values, the same way
+ * memcmp() does in the non-native implementation.
+ *
+ * rdi - r1 length
+ * rsi - r1 data.
+ * rdx - r2 length.
+ * rcx - r2 data.
+ *
+ * Returns -1, 0 or 1 in rax.
+ */
+	.globl _D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi
+	.type _D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi, @function
+
+_D5tanya6memory2op9cmpMemoryFNaNbNixAvxAvZi:
+	// Compare the lengths (unsigned)
+	cmp %rdx, %rdi
+	jb less
+	ja greater
+
+	mov %rcx, %rdi // cmps compares (%rsi) with (%rdi)
+
+	// Check if we're aligned
+	cmp $0x08, %rdx
+	jc aligned_1
+	test $0x07, %edi
+	jz aligned_8
+
+	naligned:
+	cmpsb
+	jb less
+	ja greater
+
+	dec %rdx
+	test $0x07, %edi
+	jnz naligned
+
+	aligned_8:
+	mov %rdx, %rcx
+	shr $0x03, %rcx // Sets ZF if there is no full quadword to compare
+
+	repe cmpsq
+	je tail
+
+	// A quadword differs. Quadwords are little-endian, so their order says
+	// nothing about the first differing byte: step back and find it
+	sub $0x08, %rsi
+	sub $0x08, %rdi
+	mov $0x08, %edx
+	jmp aligned_1
+
+	tail:
+	and $0x07, %edx
+	jz equal
+
+	aligned_1: // Compare the remaining bytes
+	mov %rdx, %rcx
+	test %rcx, %rcx // Leave "equal" flags if there is nothing to compare
+
+	repe cmpsb
+	jb less
+	ja greater
+
+	equal:
+	xor %rax, %rax // Return 0
+	jmp end
+
+	greater:
+	mov $0x01, %rax
+	jmp end
+
+	less:
+	mov $-0x01, %rax
+
+	end:
+	ret
diff --git a/arch/x64/linux/memory/copy.S b/arch/x64/linux/memory/copy.S
new file mode 100644
index 0000000..bf74e0f
--- /dev/null
+++ b/arch/x64/linux/memory/copy.S
@@ -0,0 +1,67 @@
+	.text
+
+/*
+ * copyMemory.
+ *
+ * rdi - source length
+ * rsi - source data.
+ * rdx - target length.
+ * rcx - target data.
+ */
+	.globl _D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv
+	.type _D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv, @function
+
+_D5tanya6memory2op10copyMemoryFNaNbNixAvAvZv:
+	mov %rdi, %rdx // rdx - number of bytes to copy (source length)
+	mov %rcx, %rdi // rdi - target, rsi already holds the source
+
+	cmp $0x08, %rdx // Fewer than 8 bytes: copy them bytewise
+	jc aligned_1
+	test $0x07, %edi // Is the target 8-byte aligned?
+	jz aligned_8
+
+	naligned: // Copy single bytes until the target is aligned
+	movsb
+	dec %rdx
+	test $0x07, %edi
+	jnz naligned
+
+	aligned_8: // Copy quadwords
+	mov %rdx, %rcx
+	shr $0x03, %rcx
+	rep movsq
+	and $0x07, %edx // Bytes left after the last quadword
+	jz end
+
+	aligned_1:
+	// Write the remaining bytes
+	mov %rdx, %rcx
+	rep movsb
+
+	end:
+	ret
+
+/*
+ * moveMemory.
+ *
+ * rdi - source length
+ * rsi - source data.
+ * rdx - target length.
+ * rcx - target data.
+ */
+	.globl _D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv
+	.type _D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv, @function
+
+_D5tanya6memory2op10moveMemoryFNaNbNixAvAvZv:
+	mov %rdi, %rdx // rdx - number of bytes to copy (source length)
+
+	lea -1(%rdx, %rsi), %rsi // Last byte of the source
+	lea -1(%rdx, %rcx), %rdi // Last byte of the target
+	mov %rdx, %rcx
+
+	std // Set the direction flag
+
+	rep movsb // Copy from the last byte down to the first one
+
+	cld // Clear the direction flag
+	ret
diff --git a/arch/x64/linux/memory/fill.S b/arch/x64/linux/memory/fill.S
new file mode 100644
index 0000000..d4fc0ac
--- /dev/null
+++ b/arch/x64/linux/memory/fill.S
@@ -0,0 +1,164 @@
+	.text
+
+/*
+ * fillMemory.
+ *
+ * Fills a memory block with a byte pattern. Single bytes are written until
+ * the pointer is 16-byte aligned, the rest is written in chunks of 64, 32,
+ * 16, 8 and 1 bytes.
+ *
+ * rdi - length.
+ * rsi - pointer.
+ * rdx - value filled with a byte.
+ */
+	.globl _D5tanya6memory2op10fillMemoryFNaNbNiAvmZv
+	.type _D5tanya6memory2op10fillMemoryFNaNbNiAvmZv, @function
+
+_D5tanya6memory2op10fillMemoryFNaNbNiAvmZv:
+	// Check for zero length
+	test %rdi, %rdi
+	jz end
+
+	mov %rdi, %rax // rax - number of bytes left to write
+	mov %rsi, %r8
+
+	movq %rdx, %xmm0
+	movlhps %xmm0, %xmm0 // Both halves of xmm0 hold the pattern
+
+	// Check if the pointer is aligned to a 16-byte boundary
+	and $-0x10, %r8
+
+	// Compute the number of misaligned bytes
+	mov %rsi, %r9
+	sub %r8, %r9
+
+	test %r9, %r9
+	jz aligned
+
+	// Get the number of bytes to be written until we are aligned
+	mov $0x10, %rcx
+	sub %r9, %rcx
+
+	// A short block can end before the boundary: never write more than
+	// the requested length
+	cmp %rax, %rcx
+	cmova %rax, %rcx
+
+	mov %rsi, %r8
+
+	naligned:
+	mov %dl, (%r8) // Write a byte
+
+	// Advance the pointer. Decrease the total number of bytes
+	// and the misaligned ones
+	inc %r8
+	dec %rcx
+	dec %rax
+
+	// Checks if we are aligned
+	test %rcx, %rcx
+	jnz naligned
+
+	aligned:
+	// Checks if we're done writing bytes
+	test %rax, %rax
+	jz end
+
+	// Write 1 byte at a time
+	cmp $8, %rax
+	jl aligned_1
+
+	// Write 8 bytes at a time
+	cmp $16, %rax
+	jl aligned_8
+
+	// Write 16 bytes at a time
+	cmp $32, %rax
+	jl aligned_16
+
+	// Write 32 bytes at a time
+	cmp $64, %rax
+	jl aligned_32
+
+	aligned_64:
+	movdqa %xmm0, (%r8)
+	movdqa %xmm0, 16(%r8)
+	movdqa %xmm0, 32(%r8)
+	movdqa %xmm0, 48(%r8)
+
+	add $64, %r8
+	sub $64, %rax
+
+	cmp $64, %rax
+	jge aligned_64
+
+	// Checks if we're done writing bytes
+	test %rax, %rax
+	jz end
+
+	// Write 1 byte at a time
+	cmp $8, %rax
+	jl aligned_1
+
+	// Write 8 bytes at a time
+	cmp $16, %rax
+	jl aligned_8
+
+	// Write 16 bytes at a time
+	cmp $32, %rax
+	jl aligned_16
+
+	aligned_32:
+	movdqa %xmm0, (%r8)
+	movdqa %xmm0, 16(%r8)
+
+	add $32, %r8
+	sub $32, %rax
+
+	// Checks if we're done writing bytes
+	test %rax, %rax
+	jz end
+
+	// Write 1 byte at a time
+	cmp $8, %rax
+	jl aligned_1
+
+	// Write 8 bytes at a time
+	cmp $16, %rax
+	jl aligned_8
+
+	aligned_16:
+	movdqa %xmm0, (%r8)
+
+	add $16, %r8
+	sub $16, %rax
+
+	// Checks if we're done writing bytes
+	test %rax, %rax
+	jz end
+
+	// Write 1 byte at a time
+	cmp $8, %rax
+	jl aligned_1
+
+	aligned_8:
+	mov %rdx, (%r8)
+
+	add $8, %r8
+	sub $8, %rax
+
+	// Checks if we're done writing bytes
+	test %rax, %rax
+	jz end
+
+	aligned_1:
+	mov %dl, (%r8)
+
+	inc %r8
+	dec %rax
+
+	test %rax, %rax
+	jnz aligned_1
+
+	end:
+	ret
diff --git a/dub.json b/dub.json
index e6dfaaf..00188bf 100644
--- a/dub.json
+++ b/dub.json
@@ -18,9 +18,10 @@
 		{
 			"name": "native",
 			"targetType": "library",
-			"platforms": ["linux-x86_64"],
+			"platforms": ["linux-x86_64-gdc"],
 			"preBuildCommands": ["ninja -C arch"],
-			"lflags": ["arch/tanya.a"]
+			"lflags": ["arch/tanya.a"],
+			"versions":
["TanyaNative"] } ] } diff --git a/source/tanya/memory/arch/x86_64.d b/source/tanya/memory/arch/x86_64.d deleted file mode 100644 index 57e1563..0000000 --- a/source/tanya/memory/arch/x86_64.d +++ /dev/null @@ -1,346 +0,0 @@ -/* This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ - -/* - * Implementions of functions found in $(D_PSYMBOL tanya.memory.op) for x64. - * - * Copyright: Eugene Wissner 2017. - * License: $(LINK2 https://www.mozilla.org/en-US/MPL/2.0/, - * Mozilla Public License, v. 2.0). - * Authors: $(LINK2 mailto:info@caraus.de, Eugene Wissner) - * Source: $(LINK2 https://github.com/caraus-ecms/tanya/blob/master/source/tanya/memory/arch/x86_64.d, - * tanya/memory/arch/x86_64.d) - */ -module tanya.memory.arch.x86_64; - -import tanya.memory.op; - -version (D_InlineAsm_X86_64): - -pragma(inline, true) -package (tanya.memory) void copy(const void[] source, void[] target) -pure nothrow @system @nogc -{ - asm pure nothrow @nogc - { - naked; - - // RDI and RSI should be preserved. - mov RAX, RDI; - mov R8, RSI; - - // RDX - source length. - // RCX - source data. - // RDI - target length - // RSI - target data. - - mov RDI, RSI; - mov RSI, RCX; - - cmp RDX, 0x08; - jc aligned_1; - test EDI, 0x07; - jz aligned_8; - - naligned: - movsb; - dec RDX; - test EDI, 0x07; - jnz naligned; - - aligned_8: - mov RCX, RDX; - shr RCX, 0x03; - rep; - movsq; - and EDX, 0x07; - jz end; - - aligned_1: - // Write the remaining bytes. - mov RCX, RDX; - rep; - movsb; - - end: // Restore registers. - mov RSI, R8; - mov RDI, RAX; - - ret; - } -} - -pragma(inline, true) -package (tanya.memory) void fill(void[], ulong) pure nothrow @system @nogc -{ - asm pure nothrow @nogc - { - naked; - - // Check for zero length. - test RSI, RSI; - jz end; - - /* - * RDX - pointer. - * RSI - length. - * RDI - value filled with a byte. 
- */ - mov RAX, RSI; - mov R8, RDX; - - movq XMM0, RDI; - movlhps XMM0, XMM0; - - // Check if the pointer is aligned to a 16-byte boundary. - and R8, -0x10; - - // Compute the number of misaligned bytes. - mov R9, RDX; - sub R9, R8; - - test R9, R9; - jz aligned; - - // Get the number of bytes to be written until we are aligned. - mov RCX, 0x10; - sub RCX, R9; - - mov R8, RDX; - - naligned: - mov [ R8 ], DIL; // Write a byte. - - // Advance the pointer. Decrease the total number of bytes - // and the misaligned ones. - inc R8; - dec RCX; - dec RAX; - - // Checks if we are aligned. - test RCX, RCX; - jnz naligned; - - aligned: - // Checks if we're done writing bytes. - test RAX, RAX; - jz end; - - // Write 1 byte at a time. - cmp RAX, 8; - jl aligned_1; - - // Write 8 bytes at a time. - cmp RAX, 16; - jl aligned_8; - - // Write 16 bytes at a time. - cmp RAX, 32; - jl aligned_16; - - // Write 32 bytes at a time. - cmp RAX, 64; - jl aligned_32; - - aligned_64: - movdqa [ R8 ], XMM0; - movdqa [ R8 + 16 ], XMM0; - movdqa [ R8 + 32 ], XMM0; - movdqa [ R8 + 48 ], XMM0; - - add R8, 64; - sub RAX, 64; - - cmp RAX, 64; - jge aligned_64; - - // Checks if we're done writing bytes. - test RAX, RAX; - jz end; - - // Write 1 byte at a time. - cmp RAX, 8; - jl aligned_1; - - // Write 8 bytes at a time. - cmp RAX, 16; - jl aligned_8; - - // Write 16 bytes at a time. - cmp RAX, 32; - jl aligned_16; - - aligned_32: - movdqa [ R8 ], XMM0; - movdqa [ R8 + 16 ], XMM0; - - add R8, 32; - sub RAX, 32; - - // Checks if we're done writing bytes. - test RAX, RAX; - jz end; - - // Write 1 byte at a time. - cmp RAX, 8; - jl aligned_1; - - // Write 8 bytes at a time. - cmp RAX, 16; - jl aligned_8; - - aligned_16: - movdqa [ R8 ], XMM0; - - add R8, 16; - sub RAX, 16; - - // Checks if we're done writing bytes. - test RAX, RAX; - jz end; - - // Write 1 byte at a time. 
- cmp RAX, 8; - jl aligned_1; - - aligned_8: - mov [ R8 ], RDI; - - add R8, 8; - sub RAX, 8; - - // Checks if we're done writing bytes. - test RAX, RAX; - jz end; - - aligned_1: - mov [ R8 ], DIL; - - inc R8; - dec RAX; - - test RAX, RAX; - jnz aligned_1; - - end: - ret; - } -} - -pragma(inline, true) -package (tanya.memory) void copyBackward(const void[] source, void[] target) -pure nothrow @system @nogc -{ - asm pure nothrow @nogc - { - naked; - - // Save the registers should be restored. - mov R8, RSI; - mov R9, RDI; - - // RDX - source length. - // RCX - source data. - // RDI - target length - // RSI - target data. - - lea RDI, [ RSI + RDX - 1 ]; - lea RSI, [ RCX + RDX - 1 ]; - mov RCX, RDX; - - std; // Set the direction flag. - - rep; - movsb; - - cld; // Clear the direction flag. - - // Restore registers. - mov RDI, R9; - mov RSI, R8; - - ret; - } -} - -pragma(inline, true) -package (tanya.memory) int cmp(const void[] r1, const void[] r2) -pure nothrow @system @nogc -{ - asm pure nothrow @nogc - { - naked; - - // RDI and RSI should be preserved. - mov R9, RDI; - mov R8, RSI; - - // RDX - r1 length. - // RCX - r1 data. - // RDI - r2 length - // RSI - r2 data. - - mov RSI, RCX; - mov RCX, RDI; - mov RDI, R8; - - // Compare the lengths. - cmp RDX, RCX; - jl less; - jg greater; - - // Check if we're aligned. - cmp RDX, 0x08; - jc aligned_1; - test EDI, 0x07; - jz aligned_8; - - naligned: - cmpsb; - jl less; - jg greater; - - dec RDX; - test EDI, 0x07; - jnz naligned; - - aligned_8: - mov RCX, RDX; - shr RCX, 0x03; - - repe; - cmpsq; - jl less; - jg greater; - - and EDX, 0x07; - jz equal; - - aligned_1: // Compare the remaining bytes. - mov RCX, RDX; - - repe; - cmpsb; - jl less; - jg greater; - - equal: - xor RAX, RAX; // Return 0. - jmp end; - - greater: - mov RAX, 1; - jmp end; - - less: - mov RAX, -1; - jmp end; - - end: // Restore registers. 
- mov RSI, R8; - mov RDI, R9; - - ret; - } -} diff --git a/source/tanya/memory/op.d b/source/tanya/memory/op.d index e3c9451..9af7fad 100644 --- a/source/tanya/memory/op.d +++ b/source/tanya/memory/op.d @@ -14,13 +14,22 @@ */ module tanya.memory.op; -version (TanyaPhobos) +version (TanyaNative) { - import core.stdc.string; + extern private void fillMemory(void[], size_t) pure nothrow @system @nogc; + + extern private void copyMemory(const void[], void[]) + pure nothrow @system @nogc; + + extern private void moveMemory(const void[], void[]) + pure nothrow @system @nogc; + + extern private int cmpMemory(const void[], const void[]) + pure nothrow @system @nogc; } else { - static import tanya.memory.arch.x86_64; + import core.stdc.string; } private enum alignMask = size_t.sizeof - 1; @@ -49,13 +58,13 @@ in } body { - version (TanyaPhobos) + version (TanyaNative) { - memcpy(target.ptr, source.ptr, source.length); + copyMemory(source, target); } else { - tanya.memory.arch.x86_64.copy(source, target); + memcpy(target.ptr, source.ptr, source.length); } } @@ -112,13 +121,13 @@ private template filledBytes(ubyte Byte, ubyte I = 0) */ void fill(ubyte c = 0)(void[] memory) @trusted { - version (TanyaPhobos) + version (TanyaNative) { - memset(memory.ptr, c, memory.length); + fillMemory(memory, filledBytes!c); } else { - tanya.memory.arch.x86_64.fill(memory, filledBytes!c); + memset(memory.ptr, c, memory.length); } } @@ -187,13 +196,13 @@ in } body { - version (TanyaPhobos) + version (TanyaNative) { - memmove(target.ptr, source.ptr, source.length); + moveMemory(source, target); } else { - tanya.memory.arch.x86_64.copyBackward(source, target); + memmove(target.ptr, source.ptr, source.length); } } @@ -235,7 +244,11 @@ private nothrow @safe @nogc unittest */ int cmp(const void[] r1, const void[] r2) pure nothrow @trusted @nogc { - version (TanyaPhobos) + version (TanyaNative) + { + return cmpMemory(r1, r2); + } + else { if (r1.length > r2.length) { @@ -243,10 +256,6 @@ int 
cmp(const void[] r1, const void[] r2) pure nothrow @trusted @nogc } return r1.length < r2.length ? -1 : memcmp(r1.ptr, r2.ptr, r1.length); } - else - { - return tanya.memory.arch.x86_64.cmp(r1, r2); - } } ///