CUDA一維紋理內存

2019-11-11 04:25:28

字體：大中小

供稿：網友

紋理一詞來源于GPU圖形世界，GPU通用并行計算“盜用”了紋理一詞，定義了一個紋理內存的概念。紋理內存緩存在設備上，在某些情況下能減少對內存的請求并降低內存帶寬的使用，是專門為那些在內存訪問模式中存在大量空間局部性的圖形應用而設計，意味著一個線程讀取的位置可能與鄰近線程讀取的位置“非常接近”。對于GPU內核而言，紋理內存是只讀內存，并且只有通過特殊的紋理API才能對其訪問。

紋理內存分為一維紋理內存和二維紋理內存，理解紋理內存最好的方式是丟掉“紋理”兩個字，紋理內存本質上是一塊內存，是GPU在特定應用中對一維、二維變量的特殊聲明定義以及特殊使用，這種特殊使用能夠減少內存流量，提升運算性能。

紋理變量（引用）必須聲明為文件作用域內的全局變量，這里先探討一下一維紋理內存的使用方法。一維紋理內存的關鍵操作如下：

1、用texture<類型>類型聲明。

如聲明一個unsigned char 型的一維紋理tex1，格式為：

texture<unsigned char,1,cudaReadModeElementType> tex1;

2、通過cudaBindTexture()綁定到紋理內存中，并關聯到對應的數據上。

如將unsigned char類型的dev_A綁定到一維紋理tex1上，格式為：

cudaBindTexture(0,tex1,dev_A);

注意綁定本身并不拷貝數據：被綁定的數據必須已經位于設備顯存中（例如先用cudaMalloc()分配、再用cudaMemcpy()傳輸到設備）。一旦將數據綁定到一個紋理內存上，核函數就可以通過紋理引用（經由紋理緩存）直接訪問該數據，不再需要把指針作為參數額外傳入。

3、通過tex1Dfetch()來讀取紋理內存中的數據。

紋理內存是一種特殊的內存，需要使用特定的紋理API來訪問其中的數據。如訪問tex1數組的第3個元素（下標從0開始，故下標為2），格式為：

tex1Dfetch(tex1,2);

4、通過cudaUnbindTexture()取消綁定紋理內存。

紋理內存使用完之后需要取消綁定，釋放空間，如解除紋理tex1的綁定，格式為：

cudaUnbindTexture(tex1);

考慮一個簡單的應用，把一個長度是100的向量A中的數據拷貝到一個向量B中，使用普通CPU編程實現如下：

#include <cstdio>
#include <iostream>

using namespace std;

#define _length 100

/**
 * Copy an array element by element on the CPU.
 *
 * @param listSource  Array to read from; must hold at least `length` elements.
 * @param listTarget  Array to write to; must hold at least `length` elements.
 * @param length      Number of elements to copy. Nothing is copied when it
 *                    is zero or negative.
 */
void Copy_CPU(unsigned int * listSource, unsigned int * listTarget, int length)
{
    for (int i = 0; i < length; i++)
    {
        listTarget[i] = listSource[i];
    }
}

/**
 * Fill a source array with 0.._length-1, copy it with Copy_CPU and print
 * both arrays, then wait for a key press before exiting.
 */
int main()
{
    unsigned int * listSource = new unsigned int[_length];
    unsigned int * listTarget = new unsigned int[_length];

    // Initialise the source with its own indices so the copy is easy to verify.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Run the CPU copy.
    Copy_CPU(listSource, listTarget, _length);

    cout << "原始數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }

    cout << endl << endl << "通過CPU拷貝的數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until the user presses a key.
    getchar();

    // Release the host buffers (the original sample leaked both).
    delete[] listSource;
    delete[] listTarget;
    return 0;
}
// Output (運行結果)：
使用GPU編程，普通變量編程實現：
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

#define _length 100

using namespace std;

// Abort with file/line and the CUDA error string when a runtime call fails.
// Without this, an early failure silently poisons every later call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            abort();                                                       \
        }                                                                  \
    } while (0)

// Host wrapper that copies listSource to listTarget through the GPU.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

/**
 * Fill a host array with 0.._length-1, copy it through the GPU with
 * Copy_GPU and print both arrays, then wait for a key press.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    unsigned int *listSource = new unsigned int[_length];
    unsigned int *listTarget = new unsigned int[_length];

    // Initialise the source with its own indices so the copy is easy to verify.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU uploads the data, launches the kernel and downloads the result.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }

    cout << endl << endl << "通過GPU普通內存拷貝的數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until the user presses a key.
    getchar();

    // Release the host buffers (the original sample leaked both).
    delete[] listSource;
    delete[] listTarget;
    return 0;
}

/**
 * Kernel: each thread copies one element from listSource to listTarget.
 *
 * Launch layout: 1D grid of 1D blocks; any configuration with at least
 * `size` threads in total works because out-of-range threads do nothing.
 * Uses no shared memory.
 *
 * @param listSource  Device pointer to the input elements (read only).
 * @param listTarget  Device pointer to the output elements.
 * @param size        Number of elements to copy.
 */
__global__ void Blending_Texture(const unsigned int* __restrict__ listSource,
                                 unsigned int* __restrict__ listTarget, int size)
{
    // Flat global thread index doubles as the array index.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
        listTarget[index] = listSource[index];
}

/**
 * Copy `length` elements from listSource to listTarget by round-tripping
 * them through device memory.
 *
 * @param listSource  Host array to read from (at least `length` elements).
 * @param listTarget  Host array to write to (at least `length` elements).
 * @param length      Number of elements; nothing happens when it is <= 0.
 *
 * Any CUDA failure prints a diagnostic and aborts the process.
 */
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    // A grid with zero blocks is an invalid launch configuration.
    if (length <= 0)
        return;

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = NULL;
    unsigned int *dev_Target = NULL;

    // Allocate device buffers.
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Upload the input from host to device.
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Integer ceil-division: the original `ceil(_length / 10)` truncated
    // before ceil() ran, so a length that is not a multiple of the block
    // size would have left the tail uncopied.
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Source, dev_Target, length);
    CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors

    // Download the result; this blocking copy also surfaces kernel faults.
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Release device buffers.
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
運行結果：
使用GPU編程，一維紋理變量編程實現：
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

#define _length 100

using namespace std;

// Abort with file/line and the CUDA error string when a runtime call fails.
// Without this, an early failure silently poisons every later call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            abort();                                                       \
        }                                                                  \
    } while (0)

// File-scope texture reference; it is only a handle until device memory is
// bound to it with cudaBindTexture().
// NOTE: the texture-reference API used in this sample is deprecated and was
// removed in CUDA 12.0, so this program needs an older toolkit; newer code
// should use texture objects (cudaCreateTextureObject) instead.
texture<unsigned int, 1, cudaReadModeElementType> rT1;

// Host wrapper that copies listSource to listTarget through texture memory.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

/**
 * Fill a host array with 0.._length-1, copy it through the GPU with
 * Copy_GPU and print both arrays, then wait for a key press.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    unsigned int *listSource = new unsigned int[_length];
    unsigned int *listTarget = new unsigned int[_length];

    // Initialise the source with its own indices so the copy is easy to verify.
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU uploads the data, binds the texture, launches the kernel and
    // downloads the result.
    Copy_GPU(listSource, listTarget, _length);

    cout << "原始數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }

    cout << endl << endl << "通過GPU紋理內存拷貝的數據： ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until the user presses a key.
    getchar();

    // Release the host buffers (the original sample leaked both).
    delete[] listSource;
    delete[] listTarget;
    return 0;
}

/**
 * Kernel: each thread fetches one element through the texture reference
 * rT1 and stores it in listTarget.
 *
 * Launch layout: 1D grid of 1D blocks; any configuration with at least
 * `size` threads in total works because out-of-range threads do nothing.
 * Precondition: rT1 is bound to a device buffer of at least `size`
 * elements. Uses no shared memory.
 *
 * @param listTarget  Device pointer to the output elements.
 * @param size        Number of elements to copy.
 */
__global__ void Blending_Texture(unsigned int* listTarget, int size)
{
    // Flat global thread index doubles as the array index.
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    // The source is not a kernel argument: it is read via the bound texture.
    if (index < size)
        listTarget[index] = tex1Dfetch(rT1, index);
}

/**
 * Copy `length` elements from listSource to listTarget by uploading them to
 * the device, reading them back through a 1D texture and downloading the
 * result.
 *
 * @param listSource  Host array to read from (at least `length` elements).
 * @param listTarget  Host array to write to (at least `length` elements).
 * @param length      Number of elements; nothing happens when it is <= 0.
 *
 * Any CUDA failure prints a diagnostic and aborts the process.
 */
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    // A grid with zero blocks is an invalid launch configuration.
    if (length <= 0)
        return;

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = NULL;
    unsigned int *dev_Target = NULL;

    // Allocate device buffers.
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Upload the input from host to device.
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Bind the uploaded buffer to the texture reference. Passing the byte
    // size limits the binding to exactly this allocation.
    CUDA_CHECK(cudaBindTexture(0, rT1, dev_Source, data_size));

    // Integer ceil-division: the original `ceil(_length / 10)` truncated
    // before ceil() ran, so a length that is not a multiple of the block
    // size would have left the tail uncopied.
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Target, length);
    CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors

    // Download the result; this blocking copy also surfaces kernel faults.
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Unbind before freeing the memory the texture refers to.
    CUDA_CHECK(cudaUnbindTexture(rT1));

    // Release device buffers.
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
// Output (運行結果)：
再舉一個使用CUDA+OpenCV編程，實現復制一幅圖像的例子：
#include "cuda_runtime.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>

#include <highgui/highgui.hpp>
#include <imgproc/imgproc.hpp>

#define DIM 512   // edge length, in pixels, of the square working image

using namespace std;
using namespace cv;

// Abort with file/line and the CUDA error string when a runtime call fails.
// Without this, an early failure silently poisons every later call.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            abort();                                                       \
        }                                                                  \
    } while (0)

// Kernel_Copy has no bounds check, so the 16x16 blocks must tile the image
// exactly.
static_assert(DIM % 16 == 0, "DIM must be a multiple of the 16x16 block size");

// File-scope 1D texture reference over the source image bytes.
// NOTE: the texture-reference API used in this sample is deprecated and was
// removed in CUDA 12.0, so this program needs an older toolkit; newer code
// should use texture objects (cudaCreateTextureObject) instead.
texture<unsigned char, 1, cudaReadModeElementType> rT1;

/**
 * Kernel: each thread copies the three channel bytes of one pixel from the
 * texture rT1 into imageTarget.
 *
 * Launch layout: 2D grid of 2D blocks whose total extent equals the image
 * size exactly; there is no bounds check, so extra threads would write out
 * of range. Precondition: rT1 is bound to a buffer holding 3 bytes per
 * pixel for every launched thread. Uses no shared memory.
 *
 * @param imageTarget  Device pointer to the output buffer, 3 bytes per pixel.
 */
__global__ void Kernel_Copy(unsigned char* imageTarget)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    // Row-major pixel index; the row width is the total thread count in x.
    int offset = x + y * blockDim.x * gridDim.x;

    // Copy the three interleaved channel bytes of this pixel.
    imageTarget[offset * 3 + 2] = tex1Dfetch(rT1, offset * 3 + 2);
    imageTarget[offset * 3 + 1] = tex1Dfetch(rT1, offset * 3 + 1);
    imageTarget[offset * 3 + 0] = tex1Dfetch(rT1, offset * 3 + 0);
}

/**
 * Load an image, resize it to DIM x DIM, copy it on the GPU through a 1D
 * texture and display the copy.
 *
 * Returns 1 when the image file cannot be read, 0 on success. Any CUDA
 * failure prints a diagnostic and aborts the process.
 */
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    Mat image = imread("D://lena.jpg");
    if (image.empty())
    {
        // imread signals failure with an empty Mat; resize() would fail on it.
        fprintf(stderr, "Failed to load image D://lena.jpg\n");
        return 1;
    }

    Mat imageSource;
    resize(image, imageSource, Size(DIM, DIM)); // scale to the working size
    Mat imageTarget = Mat(Size(DIM, DIM), CV_8UC3, Scalar::all(0));

    // Three bytes (channels) per pixel.
    const size_t imageBytes = 3 * static_cast<size_t>(imageSource.rows) * imageSource.cols;

    // Allocate device buffers.
    unsigned char *dev_imageSource = NULL;
    unsigned char *dev_imageTarget = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_imageSource, imageBytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_imageTarget, imageBytes));

    // Upload only the source; the kernel overwrites every target byte, so
    // uploading the zeroed target (as the original did) is unnecessary.
    CUDA_CHECK(cudaMemcpy(dev_imageSource, imageSource.data, imageBytes, cudaMemcpyHostToDevice));

    // Bind the source buffer to the texture, limited to its own size.
    CUDA_CHECK(cudaBindTexture(0, rT1, dev_imageSource, imageBytes));

    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);

    Kernel_Copy<<<grids, threads>>>(dev_imageTarget);
    CUDA_CHECK(cudaGetLastError());  // catches launch-configuration errors

    // Download the result; this blocking copy also surfaces kernel faults.
    CUDA_CHECK(cudaMemcpy(imageTarget.data, dev_imageTarget, imageBytes, cudaMemcpyDeviceToHost));

    imshow("CUDA紋理內存使用示例", imageTarget);
    waitKey();

    // Unbind before freeing the memory the texture refers to.
    CUDA_CHECK(cudaUnbindTexture(rT1));

    // The original freed dev_imageSource twice and leaked dev_imageTarget.
    CUDA_CHECK(cudaFree(dev_imageSource));
    CUDA_CHECK(cudaFree(dev_imageTarget));
    return 0;
}
// Output (運行結果)：