Scaling of vectorized code which is limited by memory access
1 view (last 30 days)
Show older comments
Dear all,
what possibilities are there to make a vectorized code faster, where the speed is limited by memory access?
I have attached two runs on two different machines with the same vectorized MATLAB code. Here var_z_analyse is a huge array of fixed size, which will be accessed/querried by a long list of indices which may vary in size and content for each call. May parfor be a solution in this context?
For example the first line does not benefit from 4 to 18 cores, whereas the last two lines scale very well.
- first machine has 4 cores on one node
- seconde machine has 18 cores on one node
Regards
11 Comments
Accepted Answer
Bruno Luong
on 11 Aug 2019
Edited: Bruno Luong
on 11 Aug 2019
Here is my C-mex implementation, on my laptop it accelerates by 20 fold from your original MATLAB code (compiled with MSVS 2017, 1 thread)
N=19012;
M=130000;
u_xi = rand(1,M);
u_eta = rand(1,M);
a_I = ceil(rand(1,M)*N);
var_z_analyse = rand(3,3,N);
tic
for i=1:1000
V_x = zeros(3,1,M);
V_y = zeros(1,3,M);
A = var_z_analyse(:,:,a_I);
V_x(2,1,:) = u_xi(:);
V_x(1,1,:) = 1;
V_x(3,1,:) = V_x(2,1,:).*V_x(2,1,:);
V_y(1,2,:) = u_eta(:);
V_y(1,1,:) = 1;
V_y(1,3,:) = V_y(1,2,:).*V_y(1,2,:);
outer_product = bsxfun(@times,V_x,V_y);
u_z = sum(sum(outer_product.*A,1),2);
end
toc % 21.871430 seconds.
tic
for i=1:1000
u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
end
toc % 1.057965 seconds.
norm(u_z_mex(:) - u_z(:)) / norm(u_z(:)) % 7.5515e-17
The hash.c C-mex code (quick and dirty without checking correctness of the input)
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int i,j,k,M;
double xi_k_pow[3], eta_k, xi_k, etap, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
xi_k_pow[0] = 1;
for (k=M; k--;)
{
Ak = var_z_analyse + 9*((int)(*a_I++)-1);
eta_k = *u_eta++;
xi_k = *u_xi++;
xi_k_pow[1] = xi_k;
xi_k_pow[2] = xi_k*xi_k;
etap = 1.0;
s = 0.0;
for (i=3; i--;)
{
ss = 0.0;
for (j=0; j<3; j++) ss += xi_k_pow[j] * *(Ak++);
s += etap * ss;
etap *= eta_k;
}
*uz++ = s;
}
}
6 Comments
Bruno Luong
on 11 Aug 2019
Folding for-loop variant code, 10% faster still
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int k, M;
double eta_k, xi_k, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
for (k=M; k--;)
{
Ak = var_z_analyse + 9*(int)(*a_I++);
eta_k = *u_eta++;
xi_k = *u_xi++;
s = *(--Ak);
s = s*xi_k + *(--Ak);
s = s*xi_k + *(--Ak);
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
s = s*eta_k + ss;
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
*uz++ = s*eta_k + ss;
}
}
More Answers (0)
See Also
Categories
Find more on MATLAB Compiler in Help Center and File Exchange
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!