rbmatlab  1.16.09
 All Classes Namespaces Files Functions Variables Modules Pages
detect_duplicate_rows.m
1 function [un_id, dupl_id, un_id_of_dupl] = ...
2  detect_duplicate_rows(matrix, epsilon)
3 %function [un_id, dupl_id, un_id_of_dupl] = ...
4 % detect_duplicate_rows(matrix [, epsilon] );
5 %
6 % checks all rows of the matrix on uniqueness by determining, whether
7 % l2-norm of difference is smaller than epsilon (default 1e-6 if not
8 % specified)
9 %
10 % The vector un_id contains all row indices, which belong to unique rows
11 % and the first occurence of duplicated rows.
12 % ALl other indices are set into the vector of duplicated row indices
13 % dupl_id.
14 % The vector un_id_of_dupl gives the indices of the corresponding first
15 % occurence of the duplicates, e.g.
16 % the row with index dupl_id(1) is identical to vector un_id_of_dupl(1)
17 %
18 % e.g. matrix = [1 2; 2 3; 2 3; 0 0 ; 4 5 ; 2 3] results in
19 % un_id = [1 2 4 5], dupl_id = [3 6], un_id_of_dupl = [2 2]
20 %
21 % A simple loop-version is implemented, which also works for large
22 % matrices and rounding errors in large aspect-ratio case are prevented
23 % by this
24 
25 % Bernard Haasdonk 15.3.2007
26 
27 if nargin < 2
28  epsilon = 1e-6;
29 end;
30 
31 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
32 %% only perform vertorized method up to 1000 rows
33 %if (size(matrix,1) <= 1000)
34 %
35 %% construct l2 distance matrix of all vectors by binomial formula
36 %
37 %K1 = repmat(sum(matrix.*matrix,2),1,size(matrix,1));
38 %K = matrix * matrix';
39 %Dsqr = K1 + K1' - 2* K;
40 %dupl_mask = (Dsqr < epsilon^2);
41 %
42 %% a vector is unique, if there is no 1 in its row before the diagonal element
43 %% So: delete upper triangle matrix including diagonal
44 %% find true zero rows => uniques.
45 %% nonzero rows => dupl_ids
46 %% take column of first 1 in its row as un_id_of_dupl
47 %
48 %[r,c] = ind2sub(size(dupl_mask),find(dupl_mask));
49 %i = find(c>=r);
50 %rd = r(i);
51 %cd = c(i);
52 %s = sub2ind(size(dupl_mask), rd, cd);
53 %dupl_mask(s) = 0;
54 
55 %%% test, if upper triangle is really deleted
56 %%[r,c] = ind2sub(size(dupl_mask),find(dupl_mask));
57 %%i = find(c>=r);
58 %%if length(i>0)
59 %% error('deletion in dupl_mask not successful!');
60 %%else
61 %% disp('deletion in dupl_mask successful, remove test!!');
62 %%end,
63 
64 %m = max(dupl_mask,[],2);
65 
66 %un_id = find(m==0);
67 %dupl_id = find(m>0);
68 %[dummy, un_id_of_dupl] = max(dupl_mask(dupl_id,:),[],2);
69 
70 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
71 %else % in number of rows larger than 1000, implement trivial loop search
72 % warning(['Duplicate rows search by non-vectorized version for > 1000 ',...
73 % 'rows may take some time!!!']);
74 
75  un_id = [1];
76  dupl_id = [];
77  un_id_of_dupl = [];
78 
79  for r = 2:size(matrix,1)
80  % search row r among previous ones
81  diff = sum((matrix(1:(r-1),:)-repmat(matrix(r,:),r-1,1)).^2,2);
82  diff_mask = (diff < epsilon^2);
83  %keyboard;
84  [m,i] = max(diff_mask);
85  if (m==0) % vector is unique
86  un_id = [un_id, r];
87  else % vector is duplicate
88  dupl_id = [dupl_id, r];
89  un_id_of_dupl = [un_id_of_dupl, i];
90  end;
91  end;
92 
93  un_id = un_id(:);
94  dupl_id = dupl_id(:);
95  un_id_of_dupl = un_id_of_dupl(:);
96 
97 %end;
98 %keyboard;
99 
100 
101 % TO BE ADJUSTED TO NEW SYNTAX
102 %| \docupdate