Thực hiện song song cho nhiều SVD bằng CUDA

Tôi mới làm quen với lập trình song song trên GPU nên xin lỗi nếu câu hỏi quá rộng hoặc mơ hồ. Tôi biết thư viện CULA có một số hàm SVD song song, nhưng nên dùng chiến lược nào nếu tôi có một số lượng lớn ma trận tương đối nhỏ cần phân tích? Ví dụ: tôi có n ma trận kích thước d, với n lớn và d nhỏ. Làm thế nào để song song hóa quá trình này? Ai có thể cho tôi một gợi ý không? Thực hiện song song cho nhiều SVD bằng CUDA

Nguồn

2013-07-01 Logan Yang

Bạn có thể xem bài đăng Batched Operations trên blog CULA để tham khảo phần thảo luận về vấn đề của bạn.

EDIT

Theo những gì tôi hiểu từ nhận xét của bạn bên dưới, bạn muốn mỗi luồng (thread) tính một SVD riêng biệt. Như vậy, về cơ bản, mỗi luồng sẽ thực hiện một thuật toán SVD tuần tự, tiêu chuẩn. Một số tài liệu tham khảo có thể hữu ích cho việc này:

Numerical Recipes

Golub, Van Loan, Matrix Computations

Tuy nhiên, nếu dùng cách tiếp cận này, tôi e rằng bạn sẽ không còn dùng được cuBLAS nữa, vì đó là các hàm host không gọi được từ device (trừ khi GPU của bạn có compute capability >3.5, xem ví dụ simpleDevLibCUBLAS). Nhưng về cơ bản, theo cách này tôi nghĩ bạn đang tự mình hiện thực khái niệm xử lý theo lô (batch).

Nếu bạn quyết định chọn một cách hiện thực song song trên GPU theo hướng tiêu chuẩn hơn, tài liệu tham khảo dưới đây có thể đáng quan tâm:

Singular Value Decomposition on GPU using CUDA

Nguồn

2013-07-01 13:02:41 JackOLantern

Tương tự như mã giải hệ phương trình/nghịch đảo ma trận theo lô (batched) được đăng trên trang web dành cho nhà phát triển CUDA đã đăng ký, bạn có thể cân nhắc cách tiếp cận mỗi luồng (thread) một ma trận hoặc mỗi khối luồng (thread-block) một ma trận. Cách này hoạt động tốt nếu kích thước lô lớn và các ma trận rất nhỏ. Các giá trị điển hình của n và d trong trường hợp của bạn là bao nhiêu? – njuffa

Chế độ lô BLAS chỉ có phép nhân ma trận, phải không? Làm thế nào tôi có thể sử dụng nó cho SVD? Và bạn có thể cho tôi một ví dụ về cách phân chia các luồng hoặc các khối trong GPU và cho phép mỗi đơn vị thực hiện một SVD song song không? Ví dụ: nếu n = 500 d = 20. Cảm ơn! –

Tôi đã chỉnh sửa bài đăng của mình. Tôi hy vọng nó sẽ hữu ích. – JackOLantern

Câu trả lời trước của tôi hiện đã lỗi thời. Tính đến tháng 2 năm 2015, CUDA 7 (hiện đang ở phiên bản release candidate) cung cấp đầy đủ khả năng SVD trong thư viện cuSOLVER. Dưới đây là một ví dụ tính phân tích giá trị kỳ dị (SVD) bằng CUDA cuSOLVER.

Về vấn đề cụ thể bạn đang nêu (tính SVD của nhiều ma trận kích thước nhỏ), bạn nên điều chỉnh ví dụ bên dưới bằng cách sử dụng các stream. Để gắn một stream cho mỗi tác vụ, bạn có thể sử dụng

cudaStreamCreate()

và

cusolverDnSetStream()

kernel.cu

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include<iostream> 
#include<iomanip> 
#include<stdlib.h> 
#include<stdio.h> 
#include<assert.h> 
#include<math.h> 

#include <cusolverDn.h> 
#include <cuda_runtime_api.h> 

#include "Utilities.cuh" 

/********/ 
/* MAIN */ 
/********/ 
/*
 * Computes the full SVD (A = U * S * V^T) of a single Nrows x Ncols matrix
 * in double precision with cuSOLVER's dense gesvd routine, then prints the
 * singular values, the left singular vectors and the right singular vectors.
 *
 * All matrices are stored in column-major order. Errors reported by the CUDA
 * runtime or by cuSOLVER are routed through gpuErrchk / cusolveSafeCall.
 *
 * Returns 0 on completion, EXIT_FAILURE if a host allocation fails.
 */
int main(){

    // --- gesvd only supports Nrows >= Ncols
    // --- column major memory ordering

    const int Nrows = 7;
    const int Ncols = 5;
    const int Nsv   = (Nrows < Ncols) ? Nrows : Ncols;   // number of singular values

    // --- cuSOLVE input/output parameters/arrays
    int work_size = 0;
    int *devInfo;   gpuErrchk(cudaMalloc(&devInfo,   sizeof(int)));

    // --- CUDA solver initialization
    cusolverDnHandle_t solver_handle;
    cusolveSafeCall(cusolverDnCreate(&solver_handle));

    // --- host side input matrix and SVD results space
    double *h_A = (double *)malloc(Nrows * Ncols * sizeof(double));
    double *h_U = (double *)malloc(Nrows * Nrows * sizeof(double));
    double *h_V = (double *)malloc(Ncols * Ncols * sizeof(double));
    double *h_S = (double *)malloc(Nsv * sizeof(double));
    if (h_A == NULL || h_U == NULL || h_V == NULL || h_S == NULL) {
        fprintf(stderr, "Host memory allocation failed\n");
        return EXIT_FAILURE;
    }

    // --- Setting the host, Nrows x Ncols matrix (element (j, i) lives at j + i*Nrows)
    for(int j = 0; j < Nrows; j++)
        for(int i = 0; i < Ncols; i++)
            h_A[j + i*Nrows] = (i + j*j) * sqrt((double)(i + j));

    // --- Setting the device matrix and moving the host matrix to the device
    double *d_A;   gpuErrchk(cudaMalloc(&d_A,  Nrows * Ncols * sizeof(double)));
    gpuErrchk(cudaMemcpy(d_A, h_A, Nrows * Ncols * sizeof(double), cudaMemcpyHostToDevice));

    // --- device side SVD workspace and matrices
    double *d_U;   gpuErrchk(cudaMalloc(&d_U, Nrows * Nrows  * sizeof(double)));
    double *d_V;   gpuErrchk(cudaMalloc(&d_V, Ncols * Ncols  * sizeof(double)));
    double *d_S;   gpuErrchk(cudaMalloc(&d_S, Nsv * sizeof(double)));

    // --- CUDA SVD initialization
    cusolveSafeCall(cusolverDnDgesvd_bufferSize(solver_handle, Nrows, Ncols, &work_size));
    double *work; gpuErrchk(cudaMalloc(&work, work_size * sizeof(double)));

    // --- CUDA SVD execution ('A', 'A': compute all columns of U and all rows of V^T)
    cusolveSafeCall(cusolverDnDgesvd(solver_handle, 'A', 'A', Nrows, Ncols, d_A, Nrows, d_S, d_U, Nrows, d_V, Ncols, work, work_size, NULL, devInfo));
    int devInfo_h = 0; gpuErrchk(cudaMemcpy(&devInfo_h, devInfo, sizeof(int), cudaMemcpyDeviceToHost));
    if (devInfo_h != 0) std::cout << "Unsuccessful SVD execution\n\n";

    // --- Moving the results from device to host
    gpuErrchk(cudaMemcpy(h_S, d_S, Nsv * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_U, d_U, Nrows * Nrows  * sizeof(double), cudaMemcpyDeviceToHost));
    gpuErrchk(cudaMemcpy(h_V, d_V, Ncols * Ncols  * sizeof(double), cudaMemcpyDeviceToHost));

    std::cout << "Singular values\n";
    for(int i = 0; i < Nsv; i++)
        std::cout << "d_S["<<i<<"] = " << std::setprecision(15) << h_S[i] << std::endl;

    std::cout << "\nLeft singular vectors - For y = A * x, the columns of U span the space of y\n";
    for(int j = 0; j < Nrows; j++) {
        printf("\n");
        for(int i = 0; i < Nrows; i++)
            printf("U[%i,%i]=%f\n",i,j,h_U[j*Nrows + i]);
    }

    // gesvd writes V^T (not V) into its VT output, column-major with leading
    // dimension Ncols, so V(i,j) = V^T(j,i) is stored at h_V[i*Ncols + j].
    std::cout << "\nRight singular vectors - For y = A * x, the columns of V span the space of x\n";
    for(int i = 0; i < Ncols; i++) {
        printf("\n");
        for(int j = 0; j < Ncols; j++)
            printf("V[%i,%i]=%f\n",i,j,h_V[i*Ncols + j]);
    }

    // --- Releasing the solver handle and every device/host buffer
    cusolveSafeCall(cusolverDnDestroy(solver_handle));

    gpuErrchk(cudaFree(work));
    gpuErrchk(cudaFree(d_S));
    gpuErrchk(cudaFree(d_V));
    gpuErrchk(cudaFree(d_U));
    gpuErrchk(cudaFree(d_A));
    gpuErrchk(cudaFree(devInfo));

    free(h_S);
    free(h_V);
    free(h_U);
    free(h_A);

    return 0;

}

Utilities.cuh

#ifndef UTILITIES_CUH
#define UTILITIES_CUH

// Include the headers that define the types used in the prototypes below, so
// this header compiles regardless of the include order in the including file.
#include <cuda_runtime.h>   // cudaError_t
#include <cusolverDn.h>     // cusolverStatus_t

// Integer division of the first argument by the second, rounded up when the
// division leaves a remainder.
extern "C" int iDivUp(int, int);

// Checks the status of a CUDA runtime call; reports and exits on failure.
extern "C" void gpuErrchk(cudaError_t);

// Checks the status of a cuSOLVER call; reports and terminates on failure.
extern "C" void cusolveSafeCall(cusolverStatus_t);

#endif

Utilities.cu

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

#include "cuda_runtime.h"
#include <cuda.h>

#include <cusolverDn.h>

/*******************/ 
/* iDivUp FUNCTION */ 
/*******************/ 
// Divides a by b using integer arithmetic and rounds the result up by one
// whenever the division leaves a non-zero remainder.
extern "C" int iDivUp(int a, int b)
{
    const int quotient  = a / b;
    const int remainder = a % b;

    if (remainder != 0)
        return quotient + 1;

    return quotient;
}

/********************/ 
/* CUDA ERROR CHECK */ 
/********************/ 
// --- Credit to http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api 
/*
 * Reports a failed CUDA runtime call on stderr and optionally terminates.
 *
 *   code  - status returned by a CUDA runtime API call
 *   file  - source file name printed in the message
 *   line  - source line number printed in the message
 *   abort - when true (the default), the process exits with `code` as its
 *           exit status after the message is printed
 *
 * Does nothing when code == cudaSuccess.
 *
 * `file` is a pointer to const because callers pass __FILE__, a string
 * literal, which must not be bound to a non-const char pointer.
 */
void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
    if (code != cudaSuccess)
    {
        fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) { exit(code); }
    }
}

// Checks the status of a CUDA runtime call by forwarding it to gpuAssert with
// the default abort behaviour.
// NOTE(review): because this is a function and not a macro, __FILE__ and
// __LINE__ expand here, so the reported location is always this line of this
// file rather than the call site that produced the error.
extern "C" void gpuErrchk(cudaError_t ans) { gpuAssert((ans), __FILE__, __LINE__); } 

/**************************/ 
/* CUSOLVE ERROR CHECKING */ 
/**************************/ 
// Maps a cuSOLVER status code to a printable name. Codes not listed below
// map to "<unknown>".
static const char *_cudaGetErrorEnum(cusolverStatus_t error)
{
    const char *name = "<unknown>";

    switch (error)
    {
        case CUSOLVER_STATUS_SUCCESS:                   name = "CUSOLVER_SUCCESS";                          break;
        case CUSOLVER_STATUS_NOT_INITIALIZED:           name = "CUSOLVER_STATUS_NOT_INITIALIZED";           break;
        case CUSOLVER_STATUS_ALLOC_FAILED:              name = "CUSOLVER_STATUS_ALLOC_FAILED";              break;
        case CUSOLVER_STATUS_INVALID_VALUE:             name = "CUSOLVER_STATUS_INVALID_VALUE";             break;
        case CUSOLVER_STATUS_ARCH_MISMATCH:             name = "CUSOLVER_STATUS_ARCH_MISMATCH";             break;
        case CUSOLVER_STATUS_EXECUTION_FAILED:          name = "CUSOLVER_STATUS_EXECUTION_FAILED";          break;
        case CUSOLVER_STATUS_INTERNAL_ERROR:            name = "CUSOLVER_STATUS_INTERNAL_ERROR";            break;
        case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: name = "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; break;
        default:                                        break;
    }

    return name;
}

/*
 * Reports a failed cuSOLVER call on stderr and terminates the program.
 *
 *   err  - status returned by a cuSOLVER API call
 *   file - source file name printed in the message
 *   line - source line number printed in the message
 *
 * Does nothing when err == CUSOLVER_STATUS_SUCCESS. On failure it prints the
 * location, the numeric status and its name, resets the device, and then
 * terminates.
 */
inline void __cusolveSafeCall(cusolverStatus_t err, const char *file, const int line)
{
    if (CUSOLVER_STATUS_SUCCESS == err) return;

    // One argument per conversion: the caller-supplied location, the numeric
    // status (cast so it matches %d) and its symbolic name.
    fprintf(stderr, "CUSOLVE error in file '%s', line %d\nerror %d: %s\nterminating!\n",
            file, line, (int)err, _cudaGetErrorEnum(err));

    cudaDeviceReset();
    assert(0);
    // assert compiles to nothing when NDEBUG is defined; exit explicitly so
    // release builds do not keep running after a failed cuSOLVER call.
    exit(EXIT_FAILURE);
}

// Checks the status of a cuSOLVER call by forwarding it to __cusolveSafeCall.
// NOTE(review): because this is a function and not a macro, __FILE__ and
// __LINE__ expand here, so the location passed on is this wrapper's own file
// and line rather than the call site that produced the error.
extern "C" void cusolveSafeCall(cusolverStatus_t err) { __cusolveSafeCall(err, __FILE__, __LINE__); }

Nguồn

2015-02-07 08:19:57 JackOLantern

Bạn nghĩ gì về phương pháp này so với sử dụng MAGMA? –

@AndreasYankopolus Tôi chưa so sánh hai thư viện, xin lỗi. – JackOLantern

Thực hiện song song cho nhiều SVD bằng CUDA

Trả lời

Các vấn đề liên quan