#include <stdint.h>

#if defined(__i386__)

// averages at about 80 ticks with an offset of 1
static uint32_t optmemmeasure(char* memory, uint32_t offset) {
  uint32_t ticks;
  asm volatile (
    // get the starting time (lower 32 bits only)
    "rdtsc ;"
    "mov %%eax, %%esi ;"
    // the two timed accesses: the base address and the offset address
    "cmpb $0x23, (%%ebx) ;"
    "cmpb $0x42, (%%ebx, %%ecx) ;"
    // get the stopping time and compute the difference
    "rdtsc ;"
    "sub %%esi, %%eax ;"
    : "=a" (ticks)
    : "b" (memory), "c" (offset)
    : "edx", "esi"
  );
  return ticks;
}

static uint64_t rdtsc(void) {
  uint64_t tsc;
  // "=A" places the result in the edx:eax register pair
  asm volatile (
    "rdtsc ;"
    : "=A" (tsc)
  );
  return tsc;
}

#elif defined(__x86_64__)

// this implementation uses all of the data provided by rdtsc but needs
// more instructions
// averages at about 77.3 ticks with an offset of 1
static uint64_t memmeasure(char* memory, uint64_t offset) {
  uint64_t ticks;
  asm volatile (
    // here be magic dragons and memory access (read: segfaults) ahead
    // TODO: evaluate whether more cmp widths (like w and l) make sense
    // touch the base address so it is paged in before the measurement
    "cmpb $0x23, (%%rbx) ;"
    // get the starting time
    "rdtsc ;"
    "shl $32, %%rdx ;"
    "add %%rax, %%rdx ;"
    "mov %%rdx, %%rdi ;"
    // the timed access at the given offset
    "cmpb $0x42, (%%rbx,%%rcx) ;"
    // get the stopping time
    "rdtsc ;"
    "shl $32, %%rdx ;"
    "add %%rdx, %%rax ;"
    // the difference ends up in rax, which is the return value
    "sub %%rdi, %%rax ;"
    : "=a" (ticks)
    : "b" (memory), "c" (offset)
    : "rdx", "rdi"
  );
  return ticks;
}

// this implementation only uses the lower 32 bits of the values returned
// by rdtsc to save instructions. it is not significantly faster than the
// accurate one, but it has fewer instructions and is therefore less
// likely to be delayed by the scheduler
static uint64_t optmemmeasure(char* memoryFirst, char* memorySecond) {
  uint64_t ticks;
  asm volatile (
    // load the first value outside the timed region
    "mov (%%rbx), %%rsi ;"
    // start the clock (lower 32 bits only)
    "rdtsc ;"
    "mov %%eax, %%edi ;"
    // load the second value -- this is the timed access
    "mov (%%rcx), %%rsi ;"
    // stop the clock and compute the 32-bit difference; a counter
    // wrap-around between the two reads is not handled
    "rdtsc ;"
    "sub %%edi, %%eax ;"
    : "=a" (ticks)
    : "b" (memoryFirst), "c" (memorySecond)
    : "rdx", "rsi", "rdi"
  );
  return ticks;
}

// small implementation to get the rdtsc counter
static uint64_t rdtsc(void) {
  uint64_t tsc;
  asm volatile (
    "rdtsc ;"
    "shl $32, %%rdx ;"
    "add %%rdx, %%rax ;"
    : "=a" (tsc)
    :
    : "rdx"
  );
  return tsc;
}

#else
#error "This code only supports x86 and x86_64"
#endif
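
// Minimal usage sketch (not part of the original measurement code): it
// compares a warm access against one explicitly evicted with clflush, so
// the cold delta should come out visibly larger. The MEMMEASURE_DEMO
// guard, the buffer size, the 512-byte offset, and the use of clflush
// are illustrative assumptions, not taken from the code above.
#if defined(__x86_64__) && defined(MEMMEASURE_DEMO)
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  char* buffer = malloc(4096);
  if (!buffer)
    return 1;

  // keep the measured address in a different cache line than the base
  // address, which memmeasure touches as a warm-up before timing
  uint64_t offset = 512;
  buffer[0] = 0x23;
  buffer[offset] = 0x42;

  // warm measurement: the line was just written, so it should be cached
  uint64_t warm = memmeasure(buffer, offset);

  // evict the measured line, then measure again
  asm volatile ("clflush (%0)" : : "r" (buffer + offset) : "memory");
  uint64_t cold = memmeasure(buffer, offset);

  printf("warm: %llu ticks, cold: %llu ticks\n",
         (unsigned long long) warm, (unsigned long long) cold);
  free(buffer);
  return 0;
}
#endif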