Measure Time Lapses

Task: to measure the cost of memcpy() of 4KB

Hint:

  • Use rdtsc to measure the time lapses
  • Copy 4GB with memcpy, then derive the cost of a 4KB copy from that measurement
  • The 1st memcpy of 4GB will trigger page faults, so measure the 2nd 4GB memcpy in the same application
  • The TSC is nowadays invariant with respect to CPU frequency, and is used as a stable wall-clock source. Use dmesg | grep tsc to get the actual TSC frequency.
  • In addition, a load fence should be put before rdtsc; otherwise, rdtsc might be reordered with previous load instructions and therefore cannot give a precise timestamp.

gettimeofday

  • us level
  • coarse-grained
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with
 * gettimeofday() and print the elapsed wall-clock time in microseconds.
 *
 * buf_size: bytes copied per call; a negative value is rejected.
 * iters:    number of timed copies; the loop body never runs when iters <= 0.
 *
 * On a bad size or an allocation failure a message is written to stderr and
 * the function returns without printing a measurement.
 */
void memcpy_speed(int buf_size, int iters){
        struct timeval start, end;
        char *buff_1, *buff_2;
        long long elapsed_usec;
        /*
         * Call memcpy through a volatile pointer. The destination is freed
         * without ever being read, so an optimizing compiler may otherwise
         * treat the copies as dead stores and drop them from the timed loop.
         */
        void *(*volatile copy)(void *, const void *, size_t) = memcpy;

        if (buf_size < 0) {
                fprintf(stderr, "memcpy_speed: invalid buf_size %d\n", buf_size);
                return;
        }

        buff_1 = malloc((size_t)buf_size);
        buff_2 = malloc((size_t)buf_size);
        if (buff_1 == NULL || buff_2 == NULL) {
                fprintf(stderr, "memcpy_speed: cannot allocate two buffers of %d bytes\n",
                        buf_size);
                free(buff_1);
                free(buff_2);
                return;
        }

        /*
         * Fill the source so the copy does not read indeterminate bytes, and
         * touch the destination so both buffers have been written once
         * before the clock starts.
         */
        memset(buff_1, 0xA5, (size_t)buf_size);
        memset(buff_2, 0, (size_t)buf_size);

        gettimeofday(&start, NULL);
        for(int i = 0; i < iters; ++i){
                copy(buff_2, buff_1, (size_t)buf_size);
        }
        gettimeofday(&end, NULL);

        /* Do the arithmetic in long long and print it with a matching specifier. */
        elapsed_usec = (long long)(end.tv_sec - start.tv_sec) * 1000 * 1000
                     + (long long)(end.tv_usec - start.tv_usec);

        printf ("buf_size: %d B, iters: %d, time: %lld usec.\n",
                buf_size, iters, elapsed_usec);
        free(buff_1);
        free(buff_2);
}

/* Entry point: run memcpy_speed once for a single 4096-byte copy. */
int main(void) {
        enum { COPY_BYTES = 4096, COPY_ITERS = 1 };

        memcpy_speed(COPY_BYTES, COPY_ITERS);
        return 0;
}

clock_gettime

  • ns level
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with
 * clock_gettime(CLOCK_MONOTONIC) and print the elapsed time in nanoseconds.
 *
 * buf_size: bytes copied per call.
 * iters:    number of timed copies; the loop body never runs when iters <= 0.
 *
 * One untimed copy is performed first so the timed loop does not include the
 * first-touch cost of the destination. On allocation failure a message is
 * written to stderr and the function returns without printing a measurement.
 */
void memcpy_speed(unsigned long buf_size, int iters){
    struct timespec start, end;
    char *buff_1, *buff_2;
    long long elapsed_ns;
    /*
     * Call memcpy through a volatile pointer. The destination is freed
     * without ever being read, so an optimizing compiler may otherwise
     * treat the copies as dead stores and drop them from the timed loop.
     */
    void *(*volatile copy)(void *, const void *, size_t) = memcpy;

    buff_1 = malloc(buf_size);
    buff_2 = malloc(buf_size);
    if (buff_1 == NULL || buff_2 == NULL) {
        fprintf(stderr, "memcpy_speed: cannot allocate two buffers of %lu bytes\n",
            buf_size);
        free(buff_1);
        free(buff_2);
        return;
    }

    /* Fill the source so the copies do not read indeterminate bytes. */
    memset(buff_1, 0xA5, buf_size);

    /* Untimed warm-up copy. */
    copy(buff_2, buff_1, buf_size);

    clock_gettime(CLOCK_MONOTONIC, &start);
    for(int i = 0; i < iters; ++i){
        copy(buff_2, buff_1, buf_size);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /*
     * Use long long so the seconds-to-nanoseconds product cannot overflow
     * where long is 32 bits, and print it with a matching specifier.
     */
    elapsed_ns = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL
               + (long long)(end.tv_nsec - start.tv_nsec);

    printf ("buf_size: %lu, iters: %d, time: %lld nsec.\n",
        buf_size, iters, elapsed_ns);

    free(buff_1);
    free(buff_2);
}

/* Entry point: run memcpy_speed once with a 4 GiB buffer size. */
int main(void) {
    /* 4 * 2^30 evaluated in unsigned long (4 GiB where unsigned long is 64 bits wide). */
    const unsigned long total_bytes = 4UL << 30;

    memcpy_speed(total_bytes, 1);
    return 0;
}

rdtsc

  • cycle level
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


#if defined(__i386__)

/*
 * Read the CPU time-stamp counter (32-bit x86 build).
 *
 * An lfence is issued before rdtsc, as the x86-64 variant in this file does,
 * so the counter is not sampled ahead of earlier loads. The two result
 * halves are bound to EAX and EDX explicitly instead of using the "A"
 * constraint, which only denotes the EDX:EAX pair on 32-bit targets.
 *
 * NOTE(review): lfence was introduced with SSE2 - confirm that no pre-SSE2
 * CPUs need to be supported by this build.
 *
 * Returns the 64-bit counter value.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned int lo, hi;
  __asm__ __volatile__ ("lfence; rdtsc" : "=a"(lo), "=d"(hi));
  return ((unsigned long long)hi << 32) | lo;
}

#elif defined(__x86_64__)

/*
 * Read the CPU time-stamp counter (x86-64 build).
 * An lfence is executed immediately before rdtsc.
 * Returns the 64-bit counter value.
 */
static __inline__ unsigned long long rdtsc(void)
{
  unsigned int tsc_low, tsc_high;
  unsigned long long tsc;

  __asm__ __volatile__ ("lfence; rdtsc" : "=a"(tsc_low), "=d"(tsc_high));

  /* Combine the EDX (high) and EAX (low) outputs into one 64-bit value. */
  tsc = tsc_high;
  tsc <<= 32;
  tsc |= tsc_low;
  return tsc;
}

#endif

/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with rdtsc()
 * and print the elapsed time-stamp-counter cycles.
 *
 * buf_size: bytes copied per call.
 * iters:    number of timed copies; the loop body never runs when iters <= 0.
 *
 * One untimed copy is performed first so the timed loop does not include the
 * first-touch cost of the destination. On allocation failure a message is
 * written to stderr and the function returns without printing a measurement.
 */
void memcpy_speed(unsigned long buf_size, int iters){
    unsigned long long start, end;
    char *buff_1, *buff_2;
    /*
     * Call memcpy through a volatile pointer. The destination is freed
     * without ever being read, so an optimizing compiler may otherwise
     * treat the copies as dead stores and drop them from the timed loop.
     */
    void *(*volatile copy)(void *, const void *, size_t) = memcpy;

    buff_1 = malloc(buf_size);
    buff_2 = malloc(buf_size);
    if (buff_1 == NULL || buff_2 == NULL) {
        fprintf(stderr, "memcpy_speed: cannot allocate two buffers of %lu bytes\n",
            buf_size);
        free(buff_1);
        free(buff_2);
        return;
    }

    /* Fill the source so the copies do not read indeterminate bytes. */
    memset(buff_1, 0xA5, buf_size);

    /* Untimed warm-up copy. */
    copy(buff_2, buff_1, buf_size);

    start = rdtsc();
    for(int i = 0; i < iters; ++i){
        copy(buff_2, buff_1, buf_size);
    }
    end = rdtsc();

    printf ("buf_size: %lu, iters: %d, time: %llu cycles.\n",
        buf_size, iters, end - start);

    free(buff_1);
    free(buff_2);
}

/* Entry point: run memcpy_speed once with a 4 GiB buffer size. */
int main(void) {
    /* Every factor is unsigned long, so the product is 4 GiB where that type is 64 bits wide. */
    const unsigned long copy_bytes = 4UL * 1024UL * 1024UL * 1024UL;
    const int copy_iters = 1;

    memcpy_speed(copy_bytes, copy_iters);
    return 0;
}

results matching ""

    No results matching ""