紋理一詞來源于GPU圖形世界,GPU通用并行計算“盜用”了紋理一詞,定義了一個紋理內存的概念。紋理內存緩存在設備上,在某些情況下能減少對內存的請求并降低內存帶寬的使用,是專門為那些在內存訪問模式中存在大量空間局部性的圖形應用而設計,意味著一個線程讀取的位置可能與鄰近線程讀取的位置“非常接近”。對于GPU內核而言,紋理內存是只讀內存,并且只有通過特殊的紋理API才能對其訪問。
紋理內存分為一維紋理內存和二維紋理內存,理解紋理內存最好的方式是丟掉“紋理”兩個字,紋理內存本質上是一塊內存,是GPU在特定應用中對一維、二維變量的特殊聲明定義以及特殊使用,這種特殊使用能夠減少內存流量,提升運算性能。
紋理變量(引用)必須聲明為文件作用域內的全局變量,這里先探討一下一維紋理內存的使用方法。一維紋理內存的關鍵操作如下:
1、用texture<類型>類型聲明。如聲明一個unsigned char 型的一維紋理tex1,格式為:
// 1D texture reference over unsigned char elements; cudaReadModeElementType returns the stored value unconverted.
texture<unsigned char, 1, cudaReadModeElementType> tex1;
2、通過cudaBindTexture()綁定到紋理內存中,并關聯到對應的數據上。如將unsigned char類型的dev_A綁定到一維紋理tex1上,格式為:
cudaBindTexture(0,tex1,dev_A);
注意一旦將數據綁定到一個紋理內存上,該數據就已經傳輸到了設備緩存上,在核函數中就可以直接訪問,不再需要額外傳入。
3、 通過tex1Dfetch()來讀取紋理內存中的數據。紋理內存是一種特殊的內存,需要使用特定的紋理API來訪問其中的數據。如訪問tex1數組的第3個元素,格式為:
tex1Dfetch(tex1,2);
4、 通過cudaUnbindTexture()取消綁定紋理內存。紋理內存使用完之后需要取消綁定,釋放空間,如解除紋理tex1的綁定,格式為:
cudaUnbindTexture(tex1);
考慮一個簡單的應用,把一個長度是100的向量A中的數據拷貝到一個向量B中,使用普通CPU編程實現如下:
// CPU baseline: copies a 100-element array element by element and prints
// the source and the copy.
#include <cstdio>
#include <iostream>
#include <vector>
using namespace std;

#define _length 100

// Copies `length` elements from listSource into listTarget on the CPU.
//   listSource: input array holding at least `length` elements.
//   listTarget: output array with room for at least `length` elements.
//   length:     number of elements to copy; values <= 0 copy nothing.
// Null pointers are ignored instead of being dereferenced.
void Copy_CPU(unsigned int * listSource, unsigned int * listTarget, int length)
{
    if (!listSource || !listTarget)
    {
        return;
    }
    for (int i = 0; i < length; i++)
    {
        listTarget[i] = listSource[i];
    }
}

int main()
{
    // std::vector owns the buffers, so they are released on every exit path
    // (the raw new[] buffers were never deleted before).
    vector<unsigned int> listSource(_length);
    vector<unsigned int> listTarget(_length);

    // Fill the source with 0, 1, 2, ...
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Run the CPU copy.
    Copy_CPU(listSource.data(), listTarget.data(), _length);

    cout << "原始數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過CPU拷貝的數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}
// Run result:
使用GPU編程,普通變量編程實現:
// GPU version using ordinary global memory: copies a 100-element array on
// the device and prints the source and the copy.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#define _length 100
using namespace std;

// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            abort();                                                         \
        }                                                                    \
    } while (0)

// Host wrapper defined below; it launches the kernel.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

int main(int argc, char** argv)
{
    // std::vector owns the host buffers so they are released on exit.
    vector<unsigned int> listSource(_length);
    vector<unsigned int> listTarget(_length);

    // Fill the source with 0, 1, 2, ...
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU uploads the data, runs the kernel and downloads the result.
    Copy_GPU(listSource.data(), listTarget.data(), _length);

    cout << "原始數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU普通內存拷貝的數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}

// Device kernel: each thread copies one element from listSource to listTarget.
// Launch layout: 1D grid of 1D blocks; any grid covering at least `size`
// threads works because out-of-range threads do nothing.
__global__ void Blending_Texture(unsigned int* listSource, unsigned int* listTarget, int size)
{
    // Flat global index derived from the block and thread ids.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < size)
        listTarget[index] = listSource[index];
}

// Copies `length` unsigned ints from host array listSource to host array
// listTarget by round-tripping through device memory.
//   listSource / listTarget: host arrays of at least `length` elements.
//   length: element count; nothing is done when it is <= 0 or a pointer is null.
// Any CUDA failure is reported on stderr and aborts the process.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (!listSource || !listTarget || length <= 0)
    {
        return;
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = 0;
    unsigned int *dev_Target = 0;

    // Allocate device buffers.
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Upload the input from host to device.
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Integer ceil-division on the actual `length` so a partial last block is
    // still launched (ceil(_length / 10) truncated before rounding and
    // ignored the parameter).
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Source, dev_Target, length);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here

    // Download the result to the host; this blocking copy also surfaces
    // errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Release device memory.
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
// Run result:
使用GPU編程,一維紋理變量編程實現:
// GPU version using a 1D texture reference: the kernel reads the source
// through tex1Dfetch() instead of a pointer argument.
// NOTE: texture references (texture<>, cudaBindTexture, cudaUnbindTexture)
// are a legacy API that was deprecated and then removed in CUDA 12; newer
// toolkits require texture objects (cudaTextureObject_t) instead.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#define _length 100
using namespace std;

// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            abort();                                                         \
        }                                                                    \
    } while (0)

// File-scope texture reference; it acts as the handle that device memory is
// bound to before the kernel runs.
texture<unsigned int, 1, cudaReadModeElementType> rT1;

// Host wrapper defined below; it binds the texture and launches the kernel.
extern "C" void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length);

int main(int argc, char** argv)
{
    // std::vector owns the host buffers so they are released on exit.
    vector<unsigned int> listSource(_length);
    vector<unsigned int> listTarget(_length);

    // Fill the source with 0, 1, 2, ...
    for (int i = 0; i < _length; i++)
    {
        listSource[i] = i;
    }

    // Copy_GPU uploads the data, runs the kernel and downloads the result.
    Copy_GPU(listSource.data(), listTarget.data(), _length);

    cout << "原始數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listSource[i] << " ";
    }
    cout << endl << endl << "通過GPU紋理內存拷貝的數據: ";
    for (int i = 0; i < _length; i++)
    {
        cout << listTarget[i] << " ";
    }

    // Keep the console window open until a key is pressed.
    getchar();
    return 0;
}

// Device kernel: each thread fetches one element from the texture rT1 and
// stores it into listTarget. rT1 must be bound to at least `size` elements.
// Launch layout: 1D grid of 1D blocks; out-of-range threads do nothing.
__global__ void Blending_Texture(unsigned int* listTarget, int size)
{
    // Flat global index derived from the block and thread ids.
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    // Read through the texture fetch function rather than a raw pointer.
    if (index < size)
        listTarget[index] = tex1Dfetch(rT1, index);
}

// Copies `length` unsigned ints from host array listSource to host array
// listTarget, reading the device-side source through the texture rT1.
//   listSource / listTarget: host arrays of at least `length` elements.
//   length: element count; nothing is done when it is <= 0 or a pointer is null.
// Any CUDA failure is reported on stderr and aborts the process.
void Copy_GPU(unsigned int* listSource, unsigned int* listTarget, int length)
{
    if (!listSource || !listTarget || length <= 0)
    {
        return;
    }

    const size_t data_size = static_cast<size_t>(length) * sizeof(unsigned int);
    unsigned int *dev_Source = 0;
    unsigned int *dev_Target = 0;

    // Allocate device buffers.
    CUDA_CHECK(cudaMalloc((void**)&dev_Source, data_size));
    CUDA_CHECK(cudaMalloc((void**)&dev_Target, data_size));

    // Upload the input from host to device.
    CUDA_CHECK(cudaMemcpy(dev_Source, listSource, data_size, cudaMemcpyHostToDevice));

    // Bind the texture to the uploaded data; passing the byte size limits the
    // bound range to the allocation instead of the default maximum.
    CUDA_CHECK(cudaBindTexture(0, rT1, dev_Source, data_size));

    // Integer ceil-division on the actual `length` so a partial last block is
    // still launched (ceil(_length / 10) truncated before rounding and
    // ignored the parameter).
    const int threadsPerBlock = 128;
    const int blocks = (length + threadsPerBlock - 1) / threadsPerBlock;
    Blending_Texture<<<blocks, threadsPerBlock>>>(dev_Target, length);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here

    // Download the result to the host; this blocking copy also surfaces
    // errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(listTarget, dev_Target, data_size, cudaMemcpyDeviceToHost));

    // Unbind the texture before freeing the memory it refers to.
    CUDA_CHECK(cudaUnbindTexture(rT1));

    // Release device memory.
    CUDA_CHECK(cudaFree(dev_Source));
    CUDA_CHECK(cudaFree(dev_Target));
}
// Run result:
再舉一個使用CUDA+OpenCv編程,實現復制一幅圖像的例子:
// CUDA + OpenCV example: copies a DIM x DIM 3-channel image on the GPU,
// reading the source pixels through a 1D texture reference.
// NOTE: texture references (texture<>, cudaBindTexture, cudaUnbindTexture)
// are a legacy API that was deprecated and then removed in CUDA 12; newer
// toolkits require texture objects (cudaTextureObject_t) instead.
#include "cuda_runtime.h"
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <highgui/highgui.hpp>
#include <imgproc/imgproc.hpp>

#define DIM 512 // image edge length in pixels
#if (DIM % 16) != 0
#error "DIM must be a multiple of 16: Kernel_Copy has no bounds check and is launched with 16x16 blocks"
#endif
using namespace std;
using namespace cv;

// Aborts with file/line and the CUDA error string when a runtime call fails.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            abort();                                                         \
        }                                                                    \
    } while (0)

// 1D texture reference over the raw bytes of the source image.
texture<unsigned char, 1, cudaReadModeElementType> rT1;

// Device kernel: each thread copies the three channel bytes of one pixel from
// the texture rT1 into imageTarget.
// Launch layout: 2D grid of 2D blocks whose total extent equals the image
// size exactly; there is no bounds check, so the grid must not overshoot.
// The row stride is taken from blockDim.x * gridDim.x.
__global__ void Kernel_Copy(unsigned char* imageTarget)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    int offset = x + y * blockDim.x * gridDim.x;

    // Copy the pixel, one byte per channel.
    imageTarget[offset * 3 + 2] = tex1Dfetch(rT1, offset * 3 + 2);
    imageTarget[offset * 3 + 1] = tex1Dfetch(rT1, offset * 3 + 1);
    imageTarget[offset * 3 + 0] = tex1Dfetch(rT1, offset * 3 + 0);
}

int main(int argc, char** argv)
{
    Mat image = imread("D://lena.jpg");
    if (image.empty())
    {
        // resize() on an empty Mat would fail; report the problem instead.
        cerr << "Failed to load image D://lena.jpg" << endl;
        return 1;
    }

    Mat imageSource;
    resize(image, imageSource, Size(DIM, DIM)); // scale to DIM x DIM
    Mat imageTarget = Mat(Size(DIM, DIM), CV_8UC3, Scalar::all(0));

    // Three bytes per pixel, matching the CV_8UC3 target and the kernel.
    const size_t imageBytes =
        3 * static_cast<size_t>(imageSource.rows) * static_cast<size_t>(imageSource.cols);

    // Allocate device buffers.
    unsigned char *dev_imageSource = 0;
    unsigned char *dev_imageTarget = 0;
    CUDA_CHECK(cudaMalloc((void**)&dev_imageSource, imageBytes));
    CUDA_CHECK(cudaMalloc((void**)&dev_imageTarget, imageBytes));

    // Upload only the source; the kernel overwrites every byte of the target,
    // so uploading the blank target image is unnecessary.
    CUDA_CHECK(cudaMemcpy(dev_imageSource, imageSource.data, imageBytes, cudaMemcpyHostToDevice));

    // Bind the texture to the uploaded source bytes.
    CUDA_CHECK(cudaBindTexture(0, rT1, dev_imageSource, imageBytes));

    dim3 grids(DIM / 16, DIM / 16);
    dim3 threads(16, 16);

    Kernel_Copy<<<grids, threads>>>(dev_imageTarget);
    CUDA_CHECK(cudaGetLastError()); // launch-configuration errors surface here

    // Download the result to the host; this blocking copy also surfaces
    // errors raised while the kernel ran.
    CUDA_CHECK(cudaMemcpy(imageTarget.data, dev_imageTarget, imageBytes, cudaMemcpyDeviceToHost));

    // Unbind the texture, then free BOTH device buffers (the source buffer
    // used to be freed twice while the target buffer leaked).
    CUDA_CHECK(cudaUnbindTexture(rT1));
    CUDA_CHECK(cudaFree(dev_imageSource));
    CUDA_CHECK(cudaFree(dev_imageTarget));

    imshow("CUDA紋理內存使用示例", imageTarget);
    waitKey();
    return 0;
}
// Run result:
新聞熱點
疑難解答