001 /*--------------------------------------------------------------------------+
002 $Id: Diff.java 26283 2010-02-18 11:18:57Z juergens $
003 | |
004 | Copyright 2005-2010 Technische Universitaet Muenchen |
005 | |
006 | Licensed under the Apache License, Version 2.0 (the "License"); |
007 | you may not use this file except in compliance with the License. |
008 | You may obtain a copy of the License at |
009 | |
010 | http://www.apache.org/licenses/LICENSE-2.0 |
011 | |
012 | Unless required by applicable law or agreed to in writing, software |
013 | distributed under the License is distributed on an "AS IS" BASIS, |
014 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
015 | See the License for the specific language governing permissions and |
016 | limitations under the License. |
017 +--------------------------------------------------------------------------*/
018 package edu.tum.cs.commons.algo;
019
020 import java.util.ArrayList;
021 import java.util.Arrays;
022 import java.util.List;
023
024 import edu.tum.cs.commons.assertion.CCSMPre;
025 import edu.tum.cs.commons.string.StringUtils;
026
027 /**
028 * Implementation of the diff algorithm described in: E.W. Myers: "An O(ND)
029 * Difference Algorithm and Its Variations".
030 * <p>
031 * Let N be the sum of the concatenated input strings and D the size of the
032 * delta (i.e. the number of changes required to transform one string into the
033 * other). Then the time complexity is O(ND) and the space complexity is O(D^2).
034 *
035 * @param <T>
036 * The type of objects for which the diff is constructed.
037 *
038 * @author hummelb
039 * @author $Author: juergens $
040 * @version $Rev: 26283 $
041 * @levd.rating GREEN Hash: 2B0182DC254FAFBACB0727A1A353E51C
042 */
043 public class Diff<T> {
044
045 /** The first list of objects. */
046 private final List<T> a;
047
048 /** The second list of objects. */
049 private final List<T> b;
050
051 /** Length of {@link #a}. */
052 private final int n;
053
054 /** Length of {@link #b}. */
055 private final int m;
056
057 /** The maximal possible difference between {@link #a} and {@link #b}. */
058 private final int max;
059
060 /**
061 * The array for storing the positions on each diagonal. This is an
062 * "unrolled" version compared to the original paper, i.e. we create a new
063 * array for each iteration of the main loop.
064 */
065 private final int[][] v;
066
067 /**
068 * This array stores from where we came during the
069 * {@link #calculateDeltaSize()} method. Its structure is the same as
070 * {@link #v}.
071 */
072 private final boolean[][] from;
073
074 /**
075 * Hidden constructor. Use one of the {@link #computeDelta(List, List)} or
076 * {@link #computeDelta(Object[], Object[])} methods instead.
077 */
078 private Diff(List<T> a, List<T> b) {
079 this.a = a;
080 this.b = b;
081
082 n = a.size();
083 m = b.size();
084 max = n + m;
085 v = new int[max + 1][];
086 from = new boolean[max + 1][];
087 }
088
089 /** Performs the actual computations. */
090 private Delta<T> computeDelta() {
091 return constructDelta(calculateDeltaSize());
092 }
093
094 /** Constructs the actual delta. */
095 private Delta<T> constructDelta(int size) {
096 int d = size;
097 int k = -size;
098 while (v[size][size + k] < n || v[size][d + k] - k < m) {
099 ++k;
100 }
101
102 Delta<T> delta = new Delta<T>(size, n, m);
103
104 int difference = n - m;
105 while (d > 0) {
106 if (from[d][d + k]) {
107 ++k;
108 } else {
109 --k;
110 }
111 --d;
112
113 int x = v[d][d + k];
114 int y = x - k;
115
116 int newDifference = x - y;
117 if (newDifference > difference) {
118 delta.position[d] = y + 1;
119 delta.t[d] = b.get(y);
120 } else {
121 delta.position[d] = -x - 1;
122 delta.t[d] = a.get(x);
123 }
124 difference = newDifference;
125 }
126 return delta;
127 }
128
129 /**
130 * Calculates the size of the delta (i.e. the number of additions and
131 * deletions. Additionally the {@link #v} and {@link #from} arrays are
132 * filled.
133 */
134 private int calculateDeltaSize() {
135 int size = -1;
136 for (int d = 0; size < 0 && d <= max; ++d) {
137 v[d] = new int[2 * d + 1];
138 from[d] = new boolean[2 * d + 1];
139 for (int k = -d; k <= d; k += 2) {
140 int x = 0;
141 if (d > 0) {
142 if (k == -d
143 || k != d
144 && v[d - 1][d - 1 + k - 1] < v[d - 1][d - 1 + k + 1]) {
145 x = v[d - 1][d - 1 + k + 1];
146 from[d][d + k] = true;
147 } else {
148 x = v[d - 1][d - 1 + k - 1] + 1;
149 from[d][d + k] = false;
150 }
151 }
152 int y = x - k;
153 while (x < n && y < m && a.get(x).equals(b.get(y))) {
154 ++x;
155 ++y;
156 }
157 v[d][d + k] = x;
158 if (x >= n && y >= m) {
159 size = d;
160 }
161 }
162 }
163 return size;
164 }
165
166 /**
167 * Applies the diff algorithm on the supplied arrays and returns the delta
168 * between them.
169 *
170 * @param a
171 * the first "word", i.e., array of objects to produce a delta
172 * for.
173 * @param b
174 * the second "word", i.e., array of objects to produce a delta
175 * for.
176 * @return a delta containing the differences between a and b.
177 */
178 public static <T> Delta<T> computeDelta(T[] a, T[] b) {
179 return computeDelta(Arrays.asList(a), Arrays.asList(b));
180 }
181
182 /**
183 * Applies the diff algorithm on the supplied lists and returns the delta
184 * between them.
185 *
186 * @param a
187 * the first "word", i.e., list of objects to produce a delta
188 * for.
189 * @param b
190 * the second "word", i.e., list of objects to produce a delta
191 * for.
192 * @return a delta containing the differences between a and b.
193 */
194 public static <T> Delta<T> computeDelta(List<T> a, List<T> b) {
195 return new Diff<T>(a, b).computeDelta();
196 }
197
198 /**
199 * Objects of this class describe the additions and deletions used to
200 * transform between two words.
201 */
202 public static class Delta<T> {
203
204 /** The size of the first word. */
205 private final int n;
206
207 /** The size of the second word. */
208 private final int m;
209
210 /**
211 * This array stores the position at which a string is changed. If it is
212 * positive, it indicates an addition (i.e. the position is for the
213 * second string). Otherwise it is a deletion (i.e. the (negated)
214 * position is for the first string). To allow storing a sign for
215 * position 0, all positions are incremented before (so this has to be
216 * compensated for).
217 */
218 private final int[] position;
219
220 /**
221 * This array stores the characters which are added or deleted
222 * (interpretation depends on {@link #position}).
223 */
224 private final T[] t;
225
226 /** Create new delta of given size. */
227 @SuppressWarnings("unchecked")
228 private Delta(int size, int n, int m) {
229 this.n = n;
230 this.m = m;
231 position = new int[size];
232 t = (T[]) new Object[size];
233 }
234
235 /**
236 * Returns the size of the delta, i.e. the number of additions and
237 * deletions.
238 */
239 public int getSize() {
240 return position.length;
241 }
242
243 /** Returns the size of the first word the delta was created for. */
244 public int getN() {
245 return n;
246 }
247
248 /** Returns the size of the second word the delta was created for. */
249 public int getM() {
250 return m;
251 }
252
253 /** Returns the i-th element stored as addition or deletion. */
254 public T getT(int i) {
255 return t[i];
256 }
257
258 /**
259 * Returns the i-th element of the change positions. If it is positive,
260 * it indicates an addition (i.e. the position is for the second
261 * string). Otherwise it is a deletion (i.e. the (negated) position is
262 * for the first string). To allow storing a sign for position 0, all
263 * positions are incremented before (so this has to be compensated for).
264 */
265 public int getPosition(int i) {
266 return position[i];
267 }
268
269 /**
270 * Applies the forward patch, i.e. if the first string is inserted, then
271 * the second string is returned. The input word must be of length n,
272 * the output word will be of length m.
273 */
274 public List<T> forwardPatch(List<T> a) {
275 CCSMPre.isTrue(a.size() == n, "Input word must be of size " + n);
276 return doPatch(a, new ArrayList<T>(m), 1);
277 }
278
279 /**
280 * Applies the backward patch, i.e. if the second string is inserted,
281 * then the first string is returned. The input word must be of length
282 * m, the output word will be of length n.
283 */
284 public List<T> backwardPatch(List<T> b) {
285 CCSMPre.isTrue(b.size() == m, "Input word must be of size " + m);
286 return doPatch(b, new ArrayList<T>(n), -1);
287 }
288
289 /**
290 * Performs the patching from a to b put pre-multiplying the positions
291 * with the given factor.
292 */
293 private List<T> doPatch(List<T> a, List<T> b, int positionFactor) {
294 int posA = 0;
295 int posB = 0;
296
297 for (int j = 0; j < position.length; ++j) {
298 int k = position[j] * positionFactor;
299 if (k > 0) {
300 // add character
301 k = k - 1;
302 while (posB < k) {
303 b.add(a.get(posA));
304 ++posA;
305 ++posB;
306 }
307 b.add(t[j]);
308 ++posB;
309 } else {
310 // delete character
311 k = -k - 1;
312 while (posA < k) {
313 b.add(a.get(posA));
314 ++posA;
315 ++posB;
316 }
317 ++posA;
318 }
319 }
320 while (posA < a.size()) {
321 b.add(a.get(posA));
322 ++posA;
323 ++posB;
324 }
325
326 return b;
327 }
328
329 /** {@inheritDoc} */
330 @Override
331 public String toString() {
332 StringBuilder sb = new StringBuilder();
333 for (int i = 0; i < position.length; ++i) {
334 sb.append(Math.abs(position[i]) - 1);
335 if (position[i] > 0) {
336 sb.append("+ ");
337 } else {
338 sb.append("- ");
339 }
340 sb.append(t[i] + StringUtils.CR);
341 }
342 return sb.toString();
343 }
344 }
345 }