I have following matlab code;
tempx = full(sum(X.^2, 2));
tempc = full(sum(C.^2, 2).');
D = -2*(X * C.');
D = bsxfun(@plus, D, tempx);
D = bsxfun(@plus, D, tempc);
where X is nxm and W is kxm matrices realtively. One is the data and the other is the weight matrix. I find the distance matrix D with the given code. I am watching an efficient Cublas or Thrust implementation of this operations. I succeeded the line D = -2*(X * C.');
by cublas but the residual part is still a question as a newbie? Can anybody help with a snippet or give suggestions?
Here is what I have so far: Edit: I add some more code and I need bsxfun like implementation of summation. Sum vector V with all the columns and sum V2 with all the rows as a final step.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/system_error.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#define N 4
#define K 4
#define M 3
template <typename T>
struct square_op{
__host__ __device__ T operator()(const T& x)const{
return x*x;
}
};
int main (void){
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
// Fill with random data
thrust::host_vector<float> C_h(N*K);
thrust::host_vector<float> A_h(N*M); //data matrix
thrust::host_vector<float> B_h(K*M); //weight matrix
thrust::sequence(A_h.begin(),A_h.end());
thrust::sequence(B_h.begin(),B_h.end());
// std::generate(A_h.begin(), A_h.end(), rand);
// std::generate(B_h.begin(), B_h.end(), rand);
thrust::device_vector<float> A_d = A_h;
thrust::device_vector<float> B_d = B_h;
thrust::device_vector<float> C_d(N*K);
thrust::device_vector<float> dummy_x(M,1);
thrust::device_vector<float> A_sum_vec_d(N,0);
thrust::device_vector<float> B_sum_vec_d(K,0);
// TEST variables
thrust::host_vector<float> A_sum_vec_h(N,0);
thrust::host_vector<float> B_sum_vec_h(K,0);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",A_h[i*M+j]);
}
printf("\n");
}
printf("\n");
for (int i = 0; i < K; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",B_h[i*M+j]);
}
printf("\n");
}
printf("\n");
std::cout<< "Starting GPU run" <<std::endl; //add this line
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//************************************
// Calculate Square Elements
//************************************
square_op<float> unary_op = square_op<float>();
thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op);
thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op);
// TEST
thrust::copy(A_d.begin(),A_d.end(), A_h.begin());
printf("Matrix A after square!!\n");
for (int i = 0; i < N; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",A_h[i*M+j]);
}
printf("\n");
}
printf("\n");
thrust::copy(B_d.begin(),B_d.end(), B_h.begin());
printf("Matrix B after square!!\n");
for (int i = 0; i < K; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",B_h[i*M+j]);
}
printf("\n");
}
printf("\n");
//************************************
// Sum of the Rows
//************************************
float alpha = 1.0f;
float beta = 0.0f;
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("1 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("2 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
// TEST
thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin());
printf("A_vec after row sum!!\n");
for (int j = 0; j < N; ++j) {
printf("%f ",A_sum_vec_h[j]);
}
printf("\n \n");
thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin());
printf("B_vec after row sum!!\n");
for (int j = 0; j < K; ++j) {
printf("%f ",B_sum_vec_h[j]);
}
printf("\n \n");
//************************************
// Matrix Multiplication
//************************************
alpha = 2.0f;
beta = 0.0f;
//alpha*(A*B')+beta in row_major_order
stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
float totalTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
totalTime = elapsedTime/1000;
std::cout<<"Elapsed_time:"<< totalTime<<std::endl;
//copy back data
thrust::copy(C_d.begin(),C_d.end(),C_h.begin());
for (int i = 0; i < N; ++i) {
for (int j = 0; j < K; ++j) {
printf("%f ",C_h[i*K+j]);
}
printf("\n");
}
//************************************
// Final summation
//************************************
//.... NEED CODE
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
printf("Execution ends!!\n");
return EXIT_SUCCESS;
}