#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "helper_math.h"
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
 
 
// Number of float4 accesses each thread performs in the cache read/write
// kernels; also the multiplier applied to the byte count when their bandwidth
// is computed in main().
#define CacheCount 5
// Number of floats in each chunk of device memory allocated by main().
static const int FloatCountPerChunk = 32 * 1024 * 1024;
// Scales the amount of work per measurement: multiplies the iteration count of
// the cache-size kernel and the block count of the bandwidth kernels.
static const int BenchmarkRepetitionCount = 10;
// Largest working-set size, in bytes, tried when estimating the cache size.
static const int MaxCacheSize = 4 * 1024 * 1024;
// First working-set size and step between working-set sizes, in bytes.
static const int BenchmarkCacheSizeDelta = 64 * 1024;
// A working set whose per-access latency reaches this factor times the latency
// of the smallest working set ends the cache-size search for that chunk.
static const float CacheDRAMDifFactor = 1.1f;
 
// Reports a failed CUDA runtime call and terminates the program.
//
// ErrorCode - status returned by a CUDA runtime call (a cudaError_t value).
//             Nothing happens when it equals cudaSuccess.
// CallName  - short description of the operation that was attempted; printed
//             verbatim as part of the error message.
//
// On failure the function prints the description, the numeric code and the
// runtime's error string, waits for the user via system("pause") and then
// exits the process with EXIT_FAILURE. It does not return in that case.
void CheckError(int ErrorCode, const char* CallName)
{
	if (ErrorCode == cudaSuccess)
		return;

	printf("Error: ");
	// Print through "%s" so a '%' inside CallName is never interpreted as a
	// format directive.
	printf("%s", CallName);
	printf("\n");
	printf("CUDA errorcode %i (%s) \nExiting . . . . \n", ErrorCode,
		cudaGetErrorString(static_cast<cudaError_t>(ErrorCode)));
	system("pause");
	// The original `exit;` only named the function; it has to be called.
	exit(EXIT_FAILURE);
}
 
// Latency probe: every thread performs IterCount dependent-in-order reads from
// In, starting at its threadIdx.x, advancing by blockDim.x after each read and
// wrapping back by CacheSizeFloat once the index reaches it.
//
// In             - device buffer holding at least CacheSizeFloat floats.
// IterCount      - number of reads each thread performs.
// CacheSizeFloat - size of the working set, in floats.
//
// Only threadIdx.x is used for addressing, so every block walks the same
// addresses. The final comparison/store consumes the accumulated sum;
// presumably it exists so the compiler cannot discard the loads.
__global__ void BenchMarkCacheSizeKernel(float* In, int IterCount, int CacheSizeFloat)
{
	float Accumulator = 1.0f;
	int Cursor = threadIdx.x;

	for (int Remaining = IterCount; Remaining > 0; --Remaining)
	{
		Accumulator += In[Cursor];

		Cursor += blockDim.x;
		// Wrap around once the end of the working set is passed.
		Cursor = (Cursor >= CacheSizeFloat) ? (Cursor - CacheSizeFloat) : Cursor;
	}

	if (Accumulator == -1.0f)
	{
		In[0] = -1.0f;
	}
}
 
 
// Read-bandwidth probe: each thread loads one float4 from In at its global
// thread index taken modulo FloatCount.
//
// In         - device buffer viewed as float4 elements.
// FloatCount - modulus applied to the float4 index, i.e. the number of float4
//              elements that may be addressed (despite the name).
//
// The loaded value is added to a constant and the vector length is compared
// with -1 before a conditional store; presumably this only keeps the load
// from being optimised away.
__global__ void BenchMarkDRAMReadKernel(float4* In, int FloatCount)
{
	const int Slot = (blockIdx.x * blockDim.x + threadIdx.x) % FloatCount;

	const float4 Loaded = In[Slot];
	float4 Sum = make_float4(1.0f);
	Sum += Loaded;

	if (length(Sum) == -1.0f)
	{
		In[0] = Sum;
	}
}
 
 
// Write-bandwidth probe: each thread stores one float4 with all components
// set to 1 into In at its global thread index taken modulo FloatCount.
//
// In         - device buffer viewed as float4 elements.
// FloatCount - modulus applied to the float4 index, i.e. the number of float4
//              elements that may be addressed (despite the name).
__global__ void BenchMarkDRAMWriteKernel(float4* In, int FloatCount)
{
	const int Slot = (blockIdx.x * blockDim.x + threadIdx.x) % FloatCount;

	In[Slot] = make_float4(1.0f);
}
 
 
 
// Repeated-read probe: each thread performs CacheCount float4 loads from In,
// the k-th one at (global thread index modulo FloatCount) + k * Zero, and sums
// them.
//
// In         - device buffer viewed as float4 elements.
// Zero       - per-iteration index stride; main() passes 0, so all CacheCount
//              loads of a thread hit the same element.
// FloatCount - modulus applied to the float4 index, i.e. the number of float4
//              elements that may be addressed (despite the name).
//
// The conditional store at the end consumes the sum; presumably it exists so
// the compiler cannot drop the loads.
__global__ void BenchMarkCacheReadKernel(float4* In, int Zero, int FloatCount)
{
	const int Slot = (blockIdx.x * blockDim.x + threadIdx.x) % FloatCount;

	float4 Sum = make_float4(1.0f);

#pragma unroll
	for (int Pass = 0; Pass < CacheCount; ++Pass)
	{
		Sum = Sum + In[Slot + Pass * Zero];
	}

	if (length(Sum) == -1.0f)
	{
		In[0] = Sum;
	}
}
 
 
// Repeated-write probe: each thread performs CacheCount float4 stores of the
// value (1, 1, 1, 1) into In, the k-th one at
// (global thread index modulo FloatCount) + k * Zero.
//
// In         - device buffer viewed as float4 elements.
// Zero       - per-iteration index stride; main() passes 0, so all CacheCount
//              stores of a thread hit the same element.
// FloatCount - modulus applied to the float4 index, i.e. the number of float4
//              elements that may be addressed (despite the name).
__global__ void BenchMarkCacheWriteKernel(float4* In, int Zero, int FloatCount)
{
	const int Slot = (blockIdx.x * blockDim.x + threadIdx.x) % FloatCount;

	const float4 Ones = make_float4(1.0f);

#pragma unroll
	for (int Pass = 0; Pass < CacheCount; ++Pass)
	{
		In[Slot + Pass * Zero] = Ones;
	}
}
 
// Closes a timed region that was opened with cudaEventRecord(start) followed
// by a kernel launch: records `stop`, waits for it, reports any launch error
// and returns the time between the two events in milliseconds.
static float StopAndMeasure(cudaEvent_t start, cudaEvent_t stop)
{
	float milliseconds = 0;

	CheckError(cudaEventRecord(stop), "Recording events");
	CheckError(cudaEventSynchronize(stop), "Synchronizing with GPU");
	// Kernel launches return no status themselves; a bad launch is only
	// visible through the runtime's last-error slot.
	CheckError(cudaGetLastError(), "Launching kernel");
	CheckError(cudaEventElapsedTime(&milliseconds, start, stop), "Calculating ellapsed time");

	return milliseconds;
}

// Entry point of the benchmark.
//
// 1. Prints the disclaimer and basic information about CUDA device 0.
// 2. Allocates device memory in chunks of FloatCountPerChunk floats until the
//    runtime reports that no more memory is available (or PointerCount chunks
//    have been obtained) and zeroes every chunk.
// 3. For every chunk, estimates the cache size by timing
//    BenchMarkCacheSizeKernel on growing working sets until the per-access
//    latency exceeds CacheDRAMDifFactor times that of the smallest set.
// 4. For every chunk, measures read/write bandwidth with the DRAM kernels and
//    then with the cache kernels.
// 5. Releases all chunks and events.
//
// Returns 0 on success and EXIT_FAILURE when no usable device is found.
int main()
{
	printf("Nai's Cache Size Benchmark \n");
	printf("DISCLAIMER:\n"
		"This Benchmark tries to roughly estimate the L2 cache size in  \n"
		"CUDA by benchmarking memory latencies for differently sized\n"
		"working sets and different chunks of global memory.\n"
		"Use it without anything in the DRAM of your GPU or else\n"
		"the swapping behaviour of the GPU may corrupt the measurement.\n"
		"If the benchmark produces strange outputs nevertheless,\n"
		"there is a high proability that this benchmark is not working\n"
		"as intended. Your GPU is probably just fine. So please stop \n"
		"making annoying whine posts in any forums, if this benchmark \n"
		"produces a suspicous output.\n"
		);

	system("pause");

	int nDevices = 0;

	CheckError(cudaGetDeviceCount(&nDevices), "Getting devices");
	if (nDevices == 0)
	{
		printf("Error: No CUDA devices found \n");
		system("pause");
		return EXIT_FAILURE;
	}

	cudaDeviceProp prop;
	CheckError(cudaGetDeviceProperties(&prop, 0), "Getting device properties");

	if (prop.major < 3)
	{
		printf("Error: Compute Capability 2.x and 1.x are not supported anymore\n");
		system("pause");
		return EXIT_FAILURE;
	}

	printf("Device name: %s\n", prop.name);
	// totalGlobalMem is a size_t; convert explicitly so the argument matches
	// the format specifier.
	printf("Device memory size: %llu MiByte\n", (unsigned long long)(prop.totalGlobalMem / 1024 / 1024));

	static const int PointerCount = 5000;

	int ChunkSize = FloatCountPerChunk*sizeof(float);
	int ChunkSizeMB = (ChunkSize / 1024) / 1024;
	float* Pointers[PointerCount];
	int UsedPointers = 0;

	// Grab chunks until the device is full. The loop is bounded by the size of
	// Pointers, and any allocation failure ends it.
	while (UsedPointers < PointerCount)
	{
		cudaError_t Error = cudaMalloc(&Pointers[UsedPointers], ChunkSize);

		if (Error != cudaSuccess)
		{
			// Running out of memory is the expected way to leave the loop;
			// anything else is a real error worth reporting.
			if (Error != cudaErrorMemoryAllocation)
				CheckError(Error, "Allocating device memory");
			break;
		}

		CheckError(cudaMemset(Pointers[UsedPointers], 0, ChunkSize), "Clearing device memory");
		UsedPointers++;
	}

	// Reset the last-error slot so the expected allocation failure is not
	// mistaken for a kernel launch error later on.
	(void)cudaGetLastError();

	printf("Chunk Size: %i MiByte  \n", ChunkSizeMB);
	printf("Allocated %i Chunks \n", UsedPointers);
	printf("Allocated %i MiByte \n", ChunkSizeMB*UsedPointers);

	cudaEvent_t start, stop;
	CheckError(cudaEventCreate(&start), "Creating events");
	CheckError(cudaEventCreate(&stop), "Creating events");

	printf("Benchmarking L2 cache size\n");

	for (int i = 0; i < UsedPointers; i++)
	{
		// Latency measured for the smallest working set of this chunk.
		float BestLatency = 99999999.f;

		for (int j = BenchmarkCacheSizeDelta; j <= MaxCacheSize; j += BenchmarkCacheSizeDelta)
		{
			int CurrentCacheFloatSize = j / sizeof(float);
			int IterCount = BenchmarkRepetitionCount * (CurrentCacheFloatSize / 32);

			CheckError(cudaEventRecord(start), "Recording events");
			BenchMarkCacheSizeKernel <<<1, 32 >>>(Pointers[i], IterCount, CurrentCacheFloatSize);
			float milliseconds = StopAndMeasure(start, stop);

			float Latency = (milliseconds) / ((float)(IterCount));

			if (j == BenchmarkCacheSizeDelta)
			{
				BestLatency = Latency;
			}
			else if (Latency >= CacheDRAMDifFactor * BestLatency)
			{
				// The previous working set was the last one that still ran at
				// the reference latency.
				printf("L2 cache size of chunk no. %i (%i MiByte to %i MiByte): %i kiByte \n", i, ChunkSizeMB*i, ChunkSizeMB*(i + 1), (j - BenchmarkCacheSizeDelta) / 1024);
				break;
			}

			if (j == MaxCacheSize)
			{
				printf("Error estimating L2 cache size of chunk no. %i (%i MiByte to %i MiByte) probably because of swapping!\n", i, ChunkSizeMB*i, ChunkSizeMB*(i + 1));
				// BestLatency belongs to the smallest working set, Latency to
				// the largest one just measured.
				printf("Latency for the smallest working set: %f ms \nLatency for the largest working set: %f ms \n", BestLatency, Latency);
			}
		}
	}

	int Float4CountPerChunk = FloatCountPerChunk / 4;
	int BlockSize = 128;
	int BlockCount = BenchmarkRepetitionCount * Float4CountPerChunk / BlockSize;

	printf("Benchmarking DRAM \n");

	for (int i = 0; i < UsedPointers; i++)
	{
		float milliseconds = 0;

		CheckError(cudaEventRecord(start), "Recording events");
		BenchMarkDRAMReadKernel << <BlockCount, BlockSize >> >((float4*)Pointers[i], Float4CountPerChunk);
		milliseconds = StopAndMeasure(start, stop);
		// bytes / ms / 10^6 == 10^9 bytes per second
		float BandwidthRead = ((float)(BenchmarkRepetitionCount)* (float)(ChunkSize)) / milliseconds / 1000.f / 1000.f;

		CheckError(cudaEventRecord(start), "Recording events");
		BenchMarkDRAMWriteKernel << <BlockCount, BlockSize >> >((float4*)Pointers[i], Float4CountPerChunk);
		milliseconds = StopAndMeasure(start, stop);
		float BandwidthWrite = ((float)(BenchmarkRepetitionCount)* (float)(ChunkSize)) / milliseconds / 1000.f / 1000.f;

		printf("%i MiByte to %i MiByte: %5.2f GByte/s Read, %5.2f GByte/s Write \n", ChunkSizeMB*i, ChunkSizeMB*(i + 1), BandwidthRead, BandwidthWrite);
	}

	printf("Benchmarking L2 cache \n");
	for (int i = 0; i < UsedPointers; i++)
	{
		float milliseconds = 0;

		CheckError(cudaEventRecord(start), "Recording events");
		BenchMarkCacheReadKernel << <BlockCount, BlockSize >> >((float4*)Pointers[i], 0, Float4CountPerChunk);
		milliseconds = StopAndMeasure(start, stop);
		// Every thread touches its element CacheCount times, hence the factor.
		float BandwidthRead = (((float)CacheCount* (float)BenchmarkRepetitionCount * (float)ChunkSize)) / milliseconds / 1000.f / 1000.f;

		CheckError(cudaEventRecord(start), "Recording events");
		BenchMarkCacheWriteKernel<< <BlockCount, BlockSize >> >((float4*)Pointers[i], 0, Float4CountPerChunk);
		milliseconds = StopAndMeasure(start, stop);
		float BandwidthWrite = (((float)CacheCount* (float)BenchmarkRepetitionCount * (float)ChunkSize)) / milliseconds / 1000.f / 1000.f;

		printf("%i MiByte to %i MiByte: %5.2f GByte/s Read, %5.2f GByte/s Write \n", ChunkSizeMB*i, ChunkSizeMB*(i + 1), BandwidthRead, BandwidthWrite);
	}

	// Give the device memory and the events back before waiting for the user.
	CheckError(cudaEventDestroy(start), "Destroying events");
	CheckError(cudaEventDestroy(stop), "Destroying events");
	for (int i = 0; i < UsedPointers; i++)
	{
		CheckError(cudaFree(Pointers[i]), "Freeing device memory");
	}

	system("pause");
	return 0;
}