Scaling of vectorized code which is limited by memory access

1 view (last 30 days)
Dear all,
what possibilities are there to make a vectorized code faster, where the speed is limited by memory access?
I have attached two runs on two different machines with the same vectorized MATLAB code. Here var_z_analyse is a huge array of fixed size, which will be accessed/querried by a long list of indices which may vary in size and content for each call. May parfor be a solution in this context?
For example the first line does not benefit from 4 to 18 cores, whereas the last two lines scale very well.
  • first machine has 4 cores on one node
4.png
  • seconde machine has 18 cores on one node
18.png
Regards
  11 Comments
ConvexHull
ConvexHull on 11 Aug 2019
Thank you Bruno,
i already have optimized the fortran code for single performance. There's nothing more to get.
Regards
ConvexHull
ConvexHull on 11 Aug 2019
Edited: ConvexHull on 11 Aug 2019
Back to the roots. The question is really simple.
We have a fixed list of data B and a list of ids C which may vary in size and content for each call.
A=B(:,:,C(:));
How to handle such stuff in Matlab/C/Fortran with or without openmp in context of speed/scaling for a medium size problem?

Sign in to comment.

Accepted Answer

Bruno Luong
Bruno Luong on 11 Aug 2019
Edited: Bruno Luong on 11 Aug 2019
Here is my C-mex implementation, on my laptop it accelerates by 20 fold from your original MATLAB code (compiled with MSVS 2017, 1 thread)
N=19012;
M=130000;
u_xi = rand(1,M);
u_eta = rand(1,M);
a_I = ceil(rand(1,M)*N);
var_z_analyse = rand(3,3,N);
tic
for i=1:1000
V_x = zeros(3,1,M);
V_y = zeros(1,3,M);
A = var_z_analyse(:,:,a_I);
V_x(2,1,:) = u_xi(:);
V_x(1,1,:) = 1;
V_x(3,1,:) = V_x(2,1,:).*V_x(2,1,:);
V_y(1,2,:) = u_eta(:);
V_y(1,1,:) = 1;
V_y(1,3,:) = V_y(1,2,:).*V_y(1,2,:);
outer_product = bsxfun(@times,V_x,V_y);
u_z = sum(sum(outer_product.*A,1),2);
end
toc % 21.871430 seconds.
tic
for i=1:1000
u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
end
toc % 1.057965 seconds.
norm(u_z_mex(:) - u_z(:)) / norm(u_z(:)) % 7.5515e-17
The hash.c C-mex code (quick and dirty without checking correctness of the input)
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int i,j,k,M;
double xi_k_pow[3], eta_k, xi_k, etap, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
xi_k_pow[0] = 1;
for (k=M; k--;)
{
Ak = var_z_analyse + 9*((int)(*a_I++)-1);
eta_k = *u_eta++;
xi_k = *u_xi++;
xi_k_pow[1] = xi_k;
xi_k_pow[2] = xi_k*xi_k;
etap = 1.0;
s = 0.0;
for (i=3; i--;)
{
ss = 0.0;
for (j=0; j<3; j++) ss += xi_k_pow[j] * *(Ak++);
s += etap * ss;
etap *= eta_k;
}
*uz++ = s;
}
}
  6 Comments
Bruno Luong
Bruno Luong on 11 Aug 2019
Folding for-loop variant code, 10% faster still
/*************************************************
* CALLING:
* >> u_z_mex = hash(u_xi,u_eta,var_z_analyse,a_I);
* COMPILING:
* >> mex -O hash.c
************************************************/
#include "mex.h"
#include "matrix.h"
#define XI prhs[0]
#define ETA prhs[1]
#define VAR_Z_ANALYSE prhs[2]
#define A_I prhs[3]
#define UZ plhs[0]
void mexFunction(int nlhs, mxArray *plhs[],
int nrhs, const mxArray *prhs[])
{
int k, M;
double eta_k, xi_k, s, ss;
double *Ak, *u_eta, *u_xi, *a_I, *uz, *var_z_analyse;
var_z_analyse = (double*)mxGetData(VAR_Z_ANALYSE);
u_eta = (double*)mxGetData(ETA);
u_xi = (double*)mxGetData(XI);
a_I = (double*)mxGetData(A_I);
M = (int)mxGetNumberOfElements(A_I);
UZ = mxCreateDoubleMatrix(1,M,mxREAL);
uz = (double*)mxGetData(UZ);
for (k=M; k--;)
{
Ak = var_z_analyse + 9*(int)(*a_I++);
eta_k = *u_eta++;
xi_k = *u_xi++;
s = *(--Ak);
s = s*xi_k + *(--Ak);
s = s*xi_k + *(--Ak);
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
s = s*eta_k + ss;
ss = *(--Ak);
ss = ss*xi_k + *(--Ak);
ss = ss*xi_k + *(--Ak);
*uz++ = s*eta_k + ss;
}
}

Sign in to comment.

More Answers (0)

Categories

Find more on MATLAB Compiler in Help Center and File Exchange

Tags

Community Treasure Hunt

Find the treasures in MATLAB Central and discover how the community can help you!

Start Hunting!