首先要搞明白 CUDA、CUDA Toolkit 等之间的关系
安装CUDA(cudatoolkit好像会顺带安上)
安装Pytorch
在 PyTorch 官网根据电脑环境生成合适的安装命令再下载;直接 pip install torch
会错装成 CPU 版(报错 Torch not compiled with CUDA enabled,
当时我看了官方站的问答才明白)
验证
>>> import torch
>>> torch.cuda.is_available()
True
>>> torch.cuda.device_count()
1
>>>
共享存储器的应用:矩阵乘法
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
/**
 * Tiled matrix multiplication Pd = Md * Nd using shared memory.
 *
 * All three matrices are indexed as square Width x Width arrays in row-major
 * order (element (r, c) lives at r * Width + c).
 *
 * Launch layout expected by the indexing below:
 *   blockDim = (TILE_WIDTH, TILE_WIDTH)
 *   gridDim  = (ceil(Width / TILE_WIDTH), ceil(Width / TILE_WIDTH))
 * Static shared memory: 2 * TILE_WIDTH * TILE_WIDTH floats per block.
 * TILE_WIDTH must be a compile-time constant visible at this point.
 *
 * Width does not have to be a multiple of TILE_WIDTH: tile elements that fall
 * outside the matrices are loaded as zero and out-of-range threads skip the
 * final store.
 *
 * @param Md    left operand  (device pointer, Width * Width floats)
 * @param Nd    right operand (device pointer, Width * Width floats)
 * @param Pd    result        (device pointer, Width * Width floats)
 * @param Width number of rows/columns of each matrix
 */
__global__ void MatrixMulKernel(float* Md, float* Nd, float* Pd, int Width)
{
    __shared__ float Mds[TILE_WIDTH][TILE_WIDTH];
    __shared__ float Nds[TILE_WIDTH][TILE_WIDTH];

    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Output element owned by this thread.
    int Row = by * TILE_WIDTH + ty;
    int Col = bx * TILE_WIDTH + tx;

    float Pvalue = 0.0f;

    // Ceil-div so a trailing partial tile is still processed.
    int numTiles = (Width + TILE_WIDTH - 1) / TILE_WIDTH;

    for (int m = 0; m < numTiles; ++m) {
        int mCol = m * TILE_WIDTH + tx;  // column of Md read by this thread
        int nRow = m * TILE_WIDTH + ty;  // row of Nd read by this thread

        // Each thread stages one element of each tile; out-of-range elements
        // are zero so they contribute nothing to the dot product.
        Mds[ty][tx] = (Row < Width && mCol < Width) ? Md[Row * Width + mCol] : 0.0f;
        Nds[ty][tx] = (nRow < Width && Col < Width) ? Nd[nRow * Width + Col] : 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_WIDTH; ++k)
            Pvalue += Mds[ty][k] * Nds[k][tx];

        // All threads must finish reading before the next iteration
        // overwrites the tiles. Kept outside any branch so every thread in
        // the block reaches the barrier.
        __syncthreads();
    }

    if (Row < Width && Col < Width)
        Pd[Row * Width + Col] = Pvalue;
}
|