Comparing Strings and Finding Differencies
1 view (last 30 days)
Show older comments
I have done this so far but i cant get the output "mutation" and cant make a table
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
results=zeros(1,N);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
for i=1:N
if strcmp(Ref(i),str_var(i))
result (i)=0;
else
result(i)=1;
end
if nnz(result)==0
pos_mut=0;
else
pos_mut=find(result==1);
end
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end
0 Comments
Accepted Answer
Voss
on 18 Apr 2022
Edited: Voss
on 18 Apr 2022
Your calculation of the positions of mutations pos_mut is ok, but it should be done only after result is completely determined; otherwise you are doing unnecessary operations.
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
result=zeros(1,N);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
for i=1:N
if strcmp(Ref(i),str_var(i))
result(i)=0;
else
result(i)=1;
end
end
% do the calculation of 'pos_mut' only after 'result'
% has been completely calculated:
if nnz(result)==0
pos_mut=0;
else
pos_mut=find(result==1);
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end
But it can be simplified:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N=length(Ref);
n=0;
for k=1:2:30
str_var= gene_sequences(k:k+1);
str_var=char(str_var(2));
pos_mut = find(Ref ~= str_var); % you can compare elements of two char arrays like this, if they are the same length (just like any other type of array)
if isempty(pos_mut)
pos_mut = 0;
end
n=n+1;
fprintf ('%s\t patient %d\t no mutation\t %d\n',str_var, n, pos_mut)
end
And I guess you'd rather use the first row of gene_sequences instead of assuming that patient number always goes 1, 2, 3, ..., and also have the result say "no mutation" only when there is no mutation:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
for k = 1:size(gene_sequences,2) % loop over columns of gene_sequences
str_var = gene_sequences(:,k); % use both elements of column k: first element is patient number
pos_mut = find(Ref ~= str_var{2}); % second element is nucleotide sequence
if isempty(pos_mut)
pos_mut = 0;
str_mut = 'no mutation';
else
str_mut = 'mutation';
end
fprintf('%s\t patient %d\t %-12s\t %d\n', str_var{2}, str_var{1}, str_mut, pos_mut)
end
And put the results into a cell array instead of fprintf-ing them to the command line:
Ref= 'accatttacggttagtcctg';
gene_sequences={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15; 'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
N = size(gene_sequences,2);
result = cell(N,4);
result(:,1) = gene_sequences(2,:);
result(:,2) = sprintfc('patient %d',[gene_sequences{1,:}]);
for k = 1:N
str_var = gene_sequences(:,k); % use both elements of column k: first element is patient number
pos_mut = find(Ref ~= str_var{2}); % second element is nucleotide sequence
if isempty(pos_mut)
pos_mut = 0;
str_mut = 'no mutation';
else
str_mut = 'mutation';
end
result(k,[3 4]) = {str_mut pos_mut};
end
result
More Answers (1)
Stephen23
on 18 Apr 2022
Edited: Stephen23
on 18 Apr 2022
Your current approach and other simple code based on STRCMP or comparing entire character vectors is not robust to insertions, deletions, and strings of different lengths. A much more robust approach uses the Bioinformatics Toolbox, for example https://www.mathworks.com/help/bioinfo/ref/nwalign.html to get a robust global sequence match.
ref = 'accatttacggttagtcctg';
inp = {'accgtttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacagttagtcctg','accatttacggttagtcctg','accatttacggtcagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagttctg','accatttacggttagtcctg','accatttacggttagtcctg','accatttacggttagtcctg'};
vec = 1:numel(inp);
[idx,msg] = cellfun(@(s)myfun(s,ref),inp(:),'uni',0);
tmp = compose('patient %u',1:numel(inp));
out = [inp(:),tmp(:),msg,idx]
function [idx,msg] = myfun(st1,st2)
[~,aln] = nwalign(st1,st2); % requires Bioinformatics Toolbox.
idx = find(aln(2,:)~='|');
msg = 'mutation';
if isempty(idx)
msg = 'no mutation';
idx = 0;
end
end
0 Comments
See Also
Categories
Find more on Text Files in Help Center and File Exchange
Community Treasure Hunt
Find the treasures in MATLAB Central and discover how the community can help you!
Start Hunting!