Measure Time Lapses
Task: to measure the cost of memcpy() of 4KB
Hint:
- Use rdtsc to measure the time lapses
- Do a memcpy() of 4 GB, then divide the measured time to get the cost for 4 KB
- The 1st memcpy of 4 GB triggers page faults, so measure the 2nd 4 GB memcpy in the same application
- TSC is nowadays invariant with respect to CPU frequency, and is used as a stable wall-clock source. Use
dmesg | grep tsc
to get the actual TSC frequency.
- In addition, a load fence should be put before rdtsc; otherwise, rdtsc might be reordered with previous load instructions and therefore cannot give a precise timestamp.
gettimeofday
- us level
- coarse-grained
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with
 * gettimeofday() and print the elapsed wall-clock time in microseconds.
 *
 * buf_size: number of bytes copied per memcpy() call; must not be negative.
 * iters:    number of memcpy() calls inside the timed region.
 *
 * On a negative size or an allocation failure a diagnostic is written to
 * stderr and nothing is measured.
 */
void memcpy_speed(int buf_size, int iters){
    if (buf_size < 0) {
        fprintf(stderr, "memcpy_speed: invalid buf_size %d\n", buf_size);
        return;
    }

    size_t len = (size_t)buf_size;

    /* malloc(0) may return NULL, so always request at least one byte to keep
     * "NULL" meaning "allocation failed". */
    char *src = malloc(len ? len : 1);
    char *dst = malloc(len ? len : 1);
    if (src == NULL || dst == NULL) {
        fprintf(stderr, "memcpy_speed: cannot allocate 2 x %d bytes\n", buf_size);
        free(src);
        free(dst);
        return;
    }

    /* Give the source defined contents and touch both buffers before the
     * timed region, so first-touch page faults are not part of the result. */
    memset(src, 0xA5, len);
    memset(dst, 0, len);

    struct timeval start, end;
    gettimeofday(&start, NULL);
    for (int i = 0; i < iters; ++i) {
        memcpy(dst, src, len);
    }
    gettimeofday(&end, NULL);

    /* Do the arithmetic in long long so the type always matches %lld,
     * whatever the widths of time_t and suseconds_t are. */
    long long usec = (long long)(end.tv_sec - start.tv_sec) * 1000000LL
                   + (long long)(end.tv_usec - start.tv_usec);

    printf("buf_size: %d B, iters: %d, time: %lld usec.\n",
           buf_size, iters, usec);

    free(src);
    free(dst);
}
/* Entry point: run memcpy_speed() once for a single 4096-byte copy. */
int main() {
    const int copy_bytes = 4096;
    const int rounds = 1;

    memcpy_speed(copy_bytes, rounds);
    return EXIT_SUCCESS;
}
clock_gettime
- ns level
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with
 * clock_gettime(CLOCK_MONOTONIC) and print the elapsed time in nanoseconds.
 *
 * buf_size: number of bytes copied per memcpy() call.
 * iters:    number of memcpy() calls inside the timed region.
 *
 * One untimed warm-up copy runs first so that first-touch page faults are
 * not part of the measurement. On allocation failure a diagnostic is written
 * to stderr and nothing is measured.
 */
void memcpy_speed(unsigned long buf_size, int iters){
    /* malloc(0) may return NULL, so always request at least one byte to keep
     * "NULL" meaning "allocation failed". */
    char *src = malloc(buf_size ? buf_size : 1);
    char *dst = malloc(buf_size ? buf_size : 1);
    if (src == NULL || dst == NULL) {
        fprintf(stderr, "memcpy_speed: cannot allocate 2 x %lu bytes\n", buf_size);
        free(src);
        free(dst);
        return;
    }

    /* Give the source defined contents before it is read. */
    memset(src, 0xA5, buf_size);

    /* Untimed warm-up copy. */
    memcpy(dst, src, buf_size);

    struct timespec start, end;
    clock_gettime(CLOCK_MONOTONIC, &start);
    for (int i = 0; i < iters; ++i) {
        memcpy(dst, src, buf_size);
    }
    clock_gettime(CLOCK_MONOTONIC, &end);

    /* Use long long: seconds * 1e9 overflows a 32-bit long after about two
     * seconds, and the type then always matches %lld. */
    long long nsec = (long long)(end.tv_sec - start.tv_sec) * 1000000000LL
                   + (long long)(end.tv_nsec - start.tv_nsec);

    printf("buf_size: %lu, iters: %d, time: %lld nsec.\n",
           buf_size, iters, nsec);

    free(src);
    free(dst);
}
/* Entry point: run memcpy_speed() once with a buffer size of
 * 4 * 1024 * 1024 * 1024 bytes, computed in unsigned long. */
int main() {
    const unsigned long total_bytes = 4UL * 1024UL * 1024UL * 1024UL;
    const int rounds = 1;

    memcpy_speed(total_bytes, rounds);
    return EXIT_SUCCESS;
}
rdtsc
- cycle level
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#if defined(__i386__) || defined(__x86_64__)
/*
 * Read the CPU time-stamp counter and return it as a 64-bit value.
 *
 * RDTSC leaves the low 32 bits in EAX and the high 32 bits in EDX on both
 * 32-bit and 64-bit x86, so one implementation serves both targets.
 *
 * - LFENCE is issued first so the counter is not read before earlier loads
 *   have completed. LFENCE is an SSE2 instruction, so the 32-bit build now
 *   needs an SSE2-capable CPU.
 * - The "memory" clobber stops the compiler from moving memory accesses
 *   across the timestamp read.
 */
static __inline__ unsigned long long rdtsc(void)
{
    unsigned int lo, hi;

    __asm__ __volatile__ ("lfence\n\trdtsc"
                          : "=a"(lo), "=d"(hi)
                          :
                          : "memory");

    return ((unsigned long long)hi << 32) | (unsigned long long)lo;
}
#endif
/*
 * Time `iters` back-to-back memcpy() calls of `buf_size` bytes with rdtsc()
 * and print the difference between the two counter readings.
 *
 * buf_size: number of bytes copied per memcpy() call.
 * iters:    number of memcpy() calls inside the timed region.
 *
 * One untimed warm-up copy runs first so that first-touch page faults are
 * not part of the measurement. On allocation failure a diagnostic is written
 * to stderr and nothing is measured.
 */
void memcpy_speed(unsigned long buf_size, int iters){
    /* malloc(0) may return NULL, so always request at least one byte to keep
     * "NULL" meaning "allocation failed". */
    char *src = malloc(buf_size ? buf_size : 1);
    char *dst = malloc(buf_size ? buf_size : 1);
    if (src == NULL || dst == NULL) {
        fprintf(stderr, "memcpy_speed: cannot allocate 2 x %lu bytes\n", buf_size);
        free(src);
        free(dst);
        return;
    }

    /* Give the source defined contents before it is read. */
    memset(src, 0xA5, buf_size);

    /* Untimed warm-up copy. */
    memcpy(dst, src, buf_size);

    unsigned long long start = rdtsc();
    for (int i = 0; i < iters; ++i) {
        memcpy(dst, src, buf_size);
    }
    unsigned long long end = rdtsc();

    printf("buf_size: %lu, iters: %d, time: %llu cycles.\n",
           buf_size, iters, end - start);

    free(src);
    free(dst);
}
/* Entry point: run memcpy_speed() once with a buffer size of 4 << 30 bytes,
 * computed in unsigned long. */
int main() {
    const unsigned long total_bytes = 4UL << 30;
    const int rounds = 1;

    memcpy_speed(total_bytes, rounds);
    return EXIT_SUCCESS;
}