You can time a section of GPU execution using CUDA events, as follows:

   // Two events bracket the region to be timed; the elapsed time between them
   // is queried once the second one has completed.
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

   // Host-to-device input copies are issued BEFORE the start event is
   // recorded, so they are not part of the measured interval.
   cudaMemcpy(d_x, x, N*sizeof(float), cudaMemcpyHostToDevice);
   cudaMemcpy(d_y, y, N*sizeof(float), cudaMemcpyHostToDevice);

   cudaEventRecord(start);        // <----------------------- record start time

   // (N+255)/256 rounds the block count up so that N elements are covered
   // with 256 threads per block even when N is not a multiple of 256.
   saxpy<<<(N+255)/256, 256>>>(N, 2.0f, d_x, d_y);

   cudaEventRecord(stop);         // <----------------------- record stop time

   // Copy the result back to the host. The stop event was recorded before
   // this copy was issued, so the copy is outside the timed interval.
   cudaMemcpy(y, d_y, N*sizeof(float), cudaMemcpyDeviceToHost);

   // Wait until the stop event has actually completed before querying the
   // elapsed time; querying an incomplete event is an error.
   cudaEventSynchronize(stop);

   // Time between the two recorded events, reported in milliseconds.
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);

   // Release the event resources now that the measurement has been read;
   // `milliseconds` is a plain float and stays valid afterwards.
   cudaEventDestroy(start);
   cudaEventDestroy(stop);

