How to vectorize the multiple loops
    6 views (last 30 days)
  
       Show older comments
    
    Life is Wonderful
 on 13 Jul 2023
  
    
    
    
    
    Commented: Life is Wonderful
 on 18 Jul 2023
            Hi there,
I'd like some advice on vectorizing the multiple for loop. might you provide me any solid suggestions on how I might try to optimise the loop?
I would want the loop performance to be approximately 5 seconds or less; at the moment, this is a far too large amount.
Thank you very much
% set up the initialization params
a_b = 4;% max 8
h = 1920;% max upto 7680
w = 1080;% max upto 4320
u1 = zeros(1936,1096);
u0 = zeros(1936,1096);
x = int16(zeros(h/a_b,w/a_b));
y = int16(zeros(h/a_b,w/a_b));
% loop over
tic
for r = a_b:a_b:h
    rb = floor(r/a_b);
    for c = a_b:a_b:w
        cb = floor(c/a_b);
        c_l = 1e+10;
        for u = -a_b:a_b
            for v = -a_b:a_b
                cv = u1(r+1:r+a_b,c+1:c+ ...
                    a_b)-u0(r+u+1:r+u+a_b, ...
                    c+v+1:c+v+a_b);
                cv = sum(abs(cv(:)));
                if cv < c_l
                    c_l=cv;
                    x(rb,cb) = v; y(rb,cb) = u;
                end
            end
        end
    end
end
toc;
7 Comments
  Bruno Luong
      
      
 on 16 Jul 2023
				
      Edited: Bruno Luong
      
      
 on 16 Jul 2023
  
			@Life is Wonderful are you asking for registration using correlation? I haven't mentioned any code in my previous comment.
I dig out this  demo file from an old discussion
PatternMatching()
function PatternMatching(GPU)
%% Test data
n = 0; % 0, 100, 500, 1000
switch n
    case 0,
        im1=imread('bigsur1.jpg');
        im2=imread('bigsur2.jpg');
        A = sum(double([im1; im2]),3);
        A1 = A(1:size(im1,1),:);
        A2 = A(size(im1,1)+1:end,:);
    case 100,
        % Shift between two images (actually of their upper/right corners)
        dx = 5; dy = 7;
        % Generate cropped images
        idx1 = 5:96;
        idy1 = 4:84;
        idx2 = idx1(1)+dx:64;
        idy2 = idy1(1)+dy:90;
    case 500,
         dx = -13; dy = 27;
         % Generate crops
         idx1 = 20:460;
         idy1 = 15:449;
         idx2 = idx1+dx;
         idy2 = idy1+dy;
    case 1000,       
        % Shift between two images (actually of their upper/right corners)
        dx = -13; dy = 27;
        % Generate cropped images
        idx1 = 50:960;
        idy1 = 35:849;
        idx2 = idx1(1)+dx:640;
        idy2 = idy1(1)+dy:900;
    otherwise
        error('Please setup crop-size for n=%d', n)
end
if n>0
    fullimg = peaks(n);
    noiseptv = 1;
    % Crop
    A1 = fullimg(idy1,idx1);
    A2 = fullimg(idy2,idx2);
    % Add noise (uniform pdf)
    A1 = A1 + noiseptv*(rand(size(A1))-0.5);
    A2 = A2 + noiseptv*(rand(size(A2))-0.5);
    % clean up
    clear fullimg dx dy idx1 idx2 idy1 idy2
    maxshift = [];
else
    maxshift = [] %[100 700];
end
% Use GPU flag
if nargin<1
    GPU = true;
end
%%
tic
% should return the same input as above
[dx dy] = pmatch(A1, A2, maxshift, GPU);
toc
fprintf('Shift found [pixel] is (%d,%d)\n', dy, dx);
%%
% Graphic check
fig=figure(1);
clf(fig);
ax = axes('Parent',fig);
x1 = 1:size(A1,2);
y1 = 1:size(A1,1);
hold(ax,'on');
if n==0
    A1=im1;
    A2=im2;
end
imagesc(x1,y1,A1,'Parent',ax);
plot3(ax,x1([1 end end 1 1])+[-1 1 1 -1 -1]/2,...
         y1([1 1 end end 1])+[-1 -1 1 1 -1]/2,...
         ones(1,5),'k');
x2 = dx + (1:size(A2,2));
y2 = dy + 1:size(A2,1);
imagesc(x2,y2,A2,'Parent',ax);
plot3(ax,x2([1 end end 1 1])+[-1 1 1 -1 -1]/2,...
         y2([1 1 end end 1])+[-1 -1 1 1 -1]/2,...
          ones(1,5),'k');
% Intersection
x = intersect(x1,x2);
y = intersect(y1,y2);
plot3(ax,x([1 end end 1 1])+[-1 1 1 -1 -1]/2,...
         y([1 1 end end 1])+[-1 -1 1 1 -1]/2,...
          ones(1,5),'r','LineWidth',1);
set(ax,'Ydir','reverse');
axis(ax,'equal')
end % PatternMatching
%%
function [dx dy] = pmatch(A1, A2, maxshift, GPU)
% function [dx dy] = pmatch(A1, A2, maxshift, GPU)
% Pattern matching engine
% Use GPU
if nargin<3 || isempty(maxshift)
    % We don't look for shift larger than this (see ImageAnalyst's post)
    % half of the size of A1 and A2
    maxshift = ceil(0.8*min(size(A1),size(A2)))
end
% Use GPU
if nargin<4 || isempty(GPU)
    GPU = true;
end
if isscalar(maxshift)
    % common margin duplicated for both dimensions
    maxshift = maxshift([1 1]);
end
% Select 2D convolution engine
if ~isempty(which('convnfft'))
    % http://www.mathworks.com/matlabcentral/fileexchange/24504
    convfun = @convnfft;
    convarg = {[], GPU};
    fprintf('GPU/jacket flag = %i\n', GPU);
else
    % This one will last almost forever
    convfun = @conv2;
    convarg = {};
    fprintf('Slow Matlab CONV2...\n');
end
% Correlation engine
A2f = A2(end:-1:1,end:-1:1);
C = convfun(A1, A2f, 'full', convarg{:});
V1 = convfun(A1.^2, ones(size(A2f)), 'full', convarg{:});
V2 = convfun(ones(size(A1)), A2f.^2, 'full', convarg{:});
C2 = C.^2 ./ (V1.*V2);
center = size(A2f);
C2 = C2(center(1)+(-maxshift(1):maxshift(1)), ...
        center(2)+(-maxshift(2):maxshift(2)));
[cmax ilin] = max(C2(:));
fprintf('Correlation = %g\n', sqrt(cmax))
[iy ix] = ind2sub(size(C2),ilin);
dx = ix - (maxshift(2)+1);
dy = iy - (maxshift(1)+1);
end % pmatch
Accepted Answer
  Bruno Luong
      
      
 on 17 Jul 2023
        
      Edited: Bruno Luong
      
      
 on 18 Jul 2023
  
      Warning code not tested for correctness:
EDIT correctness is now check
% set up the initialization params
a_b = 4;% max 8
h = 1920;% max upto 7680
w = 1080;% max upto 4320
u1 = rand(1936,1096);
u0 = rand(1936,1096);
x = zeros(h/a_b,w/a_b);
y = zeros(h/a_b,w/a_b);
% loop over
tic
for r = a_b:a_b:h
    rb = floor(r/a_b);
    for c = a_b:a_b:w
        cb = floor(c/a_b);
        c_l = 1e+10;
        for u = -a_b:a_b
            for v = -a_b:a_b
                cv = u1(r+1:r+a_b,c+1:c+ ...
                    a_b)-u0(r+u+1:r+u+a_b, ...
                    c+v+1:c+v+a_b);
                cv = sum(abs(cv(:)));
                if cv < c_l
                    c_l=cv;
                    x(rb,cb) = v; y(rb,cb) = u;
                end
            end
        end
    end
end
toc; % Elapsed time is 27.587217 seconds.
tic
r = a_b:a_b:h;
c = a_b:a_b:w;
r1 = r(1)+1:r(end)+a_b;
c1 = c(1)+1:c(end)+a_b;
U1 = u1(r1,c1);
[m,n] = size(U1);
m = m/a_b;
n = n/a_b;
uv = -a_b:a_b;
p = 2*a_b + 1; % length(uv)
CV = zeros([m n p p]);
for i = 1:p
    u = uv(i);
    r0 = r1+u;
    u00 = u0(r0,:);
    for j = 1:p
        v = uv(j);
        c0 = c1+v;
        U0 = u00(:,c0);
        DU = abs(U1-U0);
        DU = reshape(DU, a_b, m, a_b, n);
        DU = sum(DU, [1 3]);
        CV(:,:,i,j) = reshape(DU, [m n]);
    end
end
[mincv,locmin] = min(CV, [], [3 4], 'linear');
% Edit correct bug here
[~,~,iy,ix] = ind2sub(size(CV),locmin);
y_v = uv(iy);
x_v = uv(ix);
toc % Elapsed time is 1.022439 seconds.
isequal(x,x_v)
isequal(y,y_v)
3 Comments
  Bruno Luong
      
      
 on 18 Jul 2023
				
      Edited: Bruno Luong
      
      
 on 18 Jul 2023
  
			"Is there any chance of additional optimisation?"
Yes there is a chance, code the algo in assembly language with Nvidia card.
Look, as it is it's 27-30 times faster than your code.
In TMW online server the tic/toc is "Elapsed time is 0.570510 seconds." and you said in the question "I would want the loop performance to be approximately 5 seconds"
it's 8 faster than your goal.
May be at some point you should not be hungry and stop to ask for more.
More Answers (0)
See Also
Categories
				Find more on Conway's Game of Life in Help Center and File Exchange
			
	Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!