1 | namespace my.utils { |
2 | using System; |
3 | using System.Collections; |
4 | using System.Text; |
5 | using System.Text.RegularExpressions; |
6 | |
7 | /// <summary> |
8 | /// This Class implements the Difference Algorithm published in |
9 | /// "An O(ND) Difference Algorithm and its Variations" by Eugene Myers |
10 | /// Algorithmica Vol. 1 No. 2, 1986, p 251. |
11 | /// |
12 | /// There are many C, Java, Lisp implementations public available but they all seem to come |
13 | /// from the same source (diffutils) that is under the (unfree) GNU public License |
14 | /// and cannot be reused as a sourcecode for a commercial application. |
15 | /// There are very old C implementations that use other (worse) algorithms. |
16 | /// Microsoft also published sourcecode of a diff-tool (windiff) that uses some tree data. |
17 | /// Also, a direct transfer from a C source to C# is not easy because there is a lot of pointer |
18 | /// arithmetic in the typical C solutions and i need a managed solution. |
19 | /// These are the reasons why I implemented the original published algorithm from the scratch and |
20 | /// make it avaliable without the GNU license limitations. |
21 | /// I do not need a high performance diff tool because it is used only sometimes. |
22 | /// I will do some performace tweaking when needed. |
23 | /// |
24 | /// The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents |
25 | /// each line is converted into a (hash) number. See DiffText(). |
26 | /// |
27 | /// Some chages to the original algorithm: |
28 | /// The original algorithm was described using a recursive approach and comparing zero indexed arrays. |
29 | /// Extracting sub-arrays and rejoining them is very performance and memory intensive so the same |
30 | /// (readonly) data arrays are passed arround together with their lower and upper bounds. |
31 | /// This circumstance makes the LCS and SMS functions more complicate. |
32 | /// I added some code to the LCS function to get a fast response on sub-arrays that are identical, |
33 | /// completely deleted or inserted. |
34 | /// |
35 | /// The result from a comparisation is stored in 2 arrays that flag for modified (deleted or inserted) |
36 | /// lines in the 2 data arrays. These bits are then analysed to produce a array of Item objects. |
37 | /// |
38 | /// Further possible optimizations: |
39 | /// (first rule: don't do it; second: don't do it yet) |
40 | /// The arrays DataA and DataB are passed as parameters, but are never changed after the creation |
41 | /// so they can be members of the class to avoid the paramter overhead. |
42 | /// In SMS is a lot of boundary arithmetic in the for-D and for-k loops that can be done by increment |
43 | /// and decrement of local variables. |
44 | /// The DownVector and UpVector arrays are alywas created and destroyed each time the SMS gets called. |
45 | /// It is possible to reuse tehm when transfering them to members of the class. |
46 | /// See TODO: hints. |
47 | /// |
48 | /// Changes: |
49 | /// 2002.09.20 There was a "hang" in some situations. |
50 | /// Now I undestand a little bit more of the SMS algorithm. |
51 | /// There have been overlapping boxes; that where analyzed partial differently. |
52 | /// One return-point is enough. |
53 | /// A assertion was added in CreateDiffs when in debug-mode, that counts the number of equal (no modified) lines in both arrays. |
54 | /// They must be identical. |
55 | /// |
56 | /// </summary> |
57 | |
58 | public class Diff { |
59 | |
60 | /// <summary>details of one difference.</summary> |
61 | public struct Item { |
62 | /// <summary>Start Line number in Data A.</summary> |
63 | public int StartA; |
64 | /// <summary>Start Line number in Data B.</summary> |
65 | public int StartB; |
66 | |
67 | /// <summary>Number of changes in Data A.</summary> |
68 | public int deletedA; |
69 | /// <summary>Number of changes in Data A.</summary> |
70 | public int insertedB; |
71 | } // Item |
72 | |
73 | /// <summary> |
74 | /// Shortest Middle Snake Return Data |
75 | /// </summary> |
76 | private struct SMSRD { |
77 | internal int x, y; |
internal int u, v; | |
78 | // internal int u, v; // 2002.09.20: no need for 2 points |
79 | } |
80 | |
81 | /// <summary>The A-Version of the data (original data) to be compared.</summary> |
82 | private DiffData DataA; |
83 | |
84 | /// <summary>The B-Version of the data (modified data) to be compared.</summary> |
85 | private DiffData DataB; |
86 | |
87 | #region self-Test |
88 | |
89 | #if (false) |
90 | /// <summary> |
91 | /// start a self- / box-test for some diff cases and report to the debug output. |
92 | /// </summary> |
93 | /// <param name="args">not used</param> |
94 | /// <returns>always 0</returns> |
95 | public static int Main(string[] args) { |
96 | Diff d = new Diff(); |
97 | StringBuilder ret = new StringBuilder(); |
98 | string a, b; |
99 | |
100 | Debug.setDebug(true); |
// Debug.setDebugLevel(0); | |
101 | Debug.setDebugLevel(0); |
102 | |
103 | // test all changes |
104 | a = "a,b,c,d,e,f,g,h,i,j,k,l".Replace(',', '\n'); |
105 | b = "0,1,2,3,4,5,6,7,8,9".Replace(',', '\n'); |
Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) == "12.10.0.0*", | |
106 | Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) |
107 | == "12.10.0.0*", |
108 | "Diff-Selftest", "all-changes test failed."); |
109 | |
110 | // test all same |
111 | a = "a,b,c,d,e,f,g,h,i,j,k,l".Replace(',', '\n'); |
112 | b = a; |
Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) == "", | |
113 | Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) |
114 | == "", |
115 | "Diff-Selftest", "all-same test failed."); |
116 | |
117 | // test snake |
118 | a = "a,b,c,d,e,f".Replace(',', '\n'); |
119 | b = "b,c,d,e,f,x".Replace(',', '\n'); |
120 | Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) |
121 | == "1.0.0.0*0.1.6.5*", |
122 | "Diff-Selftest", "snake test failed."); |
123 | |
124 | // 2002.09.20 - repro |
125 | a = "c1,a,c2,b,c,d,e,g,h,i,j,c3,k,l".Replace(',', '\n'); |
126 | b = "C1,a,C2,b,c,d,e,I1,e,g,h,i,j,C3,k,I2,l".Replace(',', '\n'); |
127 | Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) |
128 | == "1.1.0.0*1.1.2.2*0.2.7.7*1.1.11.13*0.1.13.15*", |
129 | "Diff-Selftest", "snake test failed."); |
130 | |
131 | // test some differences |
132 | a = "a,b,-,c,d,e,f,f".Replace(',', '\n'); |
b = "la,b,x,c,e,f".Replace(',', '\n'); | |
Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) == "1.1.2.2*1.0.4.4*1.0.6.5*", | |
133 | b = "a,b,x,c,e,f".Replace(',', '\n'); |
134 | Debug.Assert(TestHelper(d.DiffText(a, b, false, false, false)) |
135 | == "1.1.2.2*1.0.4.4*1.0.6.5*", |
136 | "Diff-Selftest", "some-changes test failed."); |
137 | return (0); |
138 | } |
#endif | |
139 | |
140 | |
141 | public static string TestHelper(Item []f) { |
142 | StringBuilder ret = new StringBuilder(); |
143 | for (int n = 0; n < f.Length; n++) { |
144 | ret.Append(f[n].deletedA.ToString() + "." + f[n].insertedB.ToString() + "." + f[n].StartA.ToString() + "." + f[n].StartB.ToString() + "*"); |
145 | } |
146 | return (ret.ToString()); |
147 | } |
148 | #endif |
149 | #endregion |
150 | |
151 | /// <summary> |
152 | /// Find the difference in 2 texts, comparing by textlines. |
153 | /// </summary> |
154 | /// <param name="TextA">A-version of the text (usualy the old one)</param> |
155 | /// <param name="TextB">B-version of the text (usualy the new one)</param> |
156 | public Item [] DiffText(string TextA, string TextB) { |
157 | return(DiffText(TextA, TextB, false, false, false)); |
158 | } // DiffText |
159 | |
160 | |
161 | /// <summary> |
162 | /// Find the difference in 2 text documents, comparing by textlines. |
163 | /// The algorithm itself is comparing 2 arrays of numbers so when comparing 2 text documents |
164 | /// each line is converted into a (hash) number. This hash-value is computed by storing all |
165 | /// textlines into a common hashtable so i can find dublicates in there, and generating a |
166 | /// new number each time a new textline is inserted. |
167 | /// </summary> |
168 | /// <param name="TextA">A-version of the text (usualy the old one)</param> |
169 | /// <param name="TextB">B-version of the text (usualy the new one)</param> |
170 | /// <param name="trimSpace">When set to true, all leading and trailing whitespace characters are stripped out before the comparation is done.</param> |
171 | /// <param name="ignoreSpace">When set to true, all whitespace characters are converted to a single space character before the comparation is done.</param> |
172 | /// <param name="ignoreCase">When set to true, all characters are converted to their lowercase equivivalence before the comparation is done.</param> |
173 | /// <returns></returns> |
174 | public Item [] DiffText(string TextA, string TextB, bool trimSpace, bool ignoreSpace, bool ignoreCase) { |
175 | // prepare the input-text and convert to comparable numbers. |
176 | Hashtable h = new Hashtable(TextA.Length + TextB.Length); |
177 | DataA = new DiffData(DiffCodes(TextA, h, trimSpace, ignoreSpace, ignoreCase)); |
178 | DataB = new DiffData(DiffCodes(TextB, h, trimSpace, ignoreSpace, ignoreCase)); |
179 | h = null; // free up hashtable memory (maybe) |
180 | |
181 | LCS(DataA, 0, DataA.Length, DataB, 0, DataB.Length); |
182 | return CreateDiffs(); |
183 | } // DiffText |
184 | |
185 | |
186 | /// <summary> |
187 | /// This function converts all textlines of the text into unique numbers for every unique textline |
188 | /// so further work can work only with simple numbers. |
189 | /// </summary> |
190 | /// <param name="aText">the input text</param> |
191 | /// <param name="h">This extern initialized hashtable is used for storing all ever used textlines.</param> |
192 | /// <param name="trimSpace">ignore leading and trailing space characters</param> |
193 | /// <returns>a array of integers.</returns> |
194 | private int[] DiffCodes(string aText, Hashtable h, bool trimSpace, bool ignoreSpace, bool ignoreCase) { |
195 | // get all codes of the text |
196 | string []Lines; |
197 | int []Codes; |
198 | int lastUsedCode = h.Count; |
199 | object aCode; |
200 | string s; |
201 | |
202 | // strip off all cr, only use lf as textline separator. |
203 | aText = aText.Replace("\r", ""); |
204 | Lines = aText.Split('\n'); |
205 | |
206 | Codes = new int[Lines.Length]; |
207 | |
208 | for (int i = 0; i < Lines.Length; ++i) { |
209 | s = Lines[i]; |
210 | if (trimSpace) |
211 | s = s.Trim(); |
212 | |
213 | if (ignoreSpace) { |
s = Regex.Replace(s, "\\s", " "); // TODO: optimization: faster blank removal. | |
214 | s = Regex.Replace(s, "\\s+", " "); // TODO: optimization: faster blank removal. |
215 | } |
216 | |
217 | if (ignoreCase) |
218 | s = s.ToLower(); |
219 | |
220 | aCode = h[s]; |
221 | if (aCode == null) { |
222 | lastUsedCode++; |
223 | h[s] = lastUsedCode; |
224 | Codes[i] = lastUsedCode; |
225 | } else { |
226 | Codes[i] = (int)aCode; |
227 | } // if |
228 | } // for |
229 | return(Codes); |
230 | } // DiffCodes |
231 | |
232 | |
233 | /// <summary> |
234 | /// This is the algorithm to find the Shortest Middle Snake (SMS). |
235 | /// </summary> |
236 | /// <param name="DataA">sequence A</param> |
237 | /// <param name="LowerA">lower bound of the actual range in DataA</param> |
238 | /// <param name="UpperA">upper bound of the actual range in DataA (exclusive)</param> |
239 | /// <param name="DataB">sequence B</param> |
240 | /// <param name="LowerB">lower bound of the actual range in DataB</param> |
241 | /// <param name="UpperB">upper bound of the actual range in DataB (exclusive)</param> |
242 | /// <returns>a MiddleSnakeData record containing x,y and u,v</returns> |
243 | private SMSRD SMS(DiffData DataA, int LowerA, int UpperA, DiffData DataB, int LowerB, int UpperB) { |
244 | SMSRD ret; |
245 | int MAX = DataA.Length + DataB.Length; |
246 | |
247 | /// vector for the (0,0) to (x,y) search |
248 | int[] DownVector = new int[2* MAX + 2]; |
249 | |
250 | /// vector for the (u,v) to (N,M) search |
251 | int[] UpVector = new int[2 * MAX + 2]; |
252 | |
253 | // The vectors in the publication accepts negative indexes. the vectors implemented here are 0-based |
254 | // and are access using a specific offset: vOffset |
255 | int vOffset = MAX; |
256 | |
257 | int DownK = LowerA - LowerB; // the k-line to start the forward search |
258 | int UpK = UpperA - UpperB; // the k-line to start the reverse search |
259 | |
260 | int Delta = (UpperA - LowerA) - (UpperB - LowerB); |
261 | bool oddDelta = (Delta & 1) != 0; |
262 | |
263 | int MaxD = ((UpperA - LowerA + UpperB - LowerB) / 2) + 1; |
264 | |
265 | // Debug.Write(2, "SMS", String.Format("Search the box: A[{0}-{1}] to B[{2}-{3}]", LowerA, UpperA, LowerB, UpperB)); |
266 | |
267 | // init vectors |
268 | DownVector[vOffset + DownK + 1] = LowerA; |
269 | UpVector[vOffset + UpK - 1] = UpperA; |
270 | |
271 | for (int D = 0; D <= MaxD; D++) { |
272 | |
273 | // Extend the forward path. |
274 | for (int k = DownK - D; k <= DownK + D; k += 2) { |
275 | // Debug.Write(0, "SMS", "extend forward path " + k.ToString()); |
276 | |
277 | // find the only or better starting point |
278 | int x, y; |
279 | if (k == DownK - D) { |
280 | x = DownVector[vOffset + k+1]; |
281 | } else { |
282 | x = DownVector[vOffset + k-1] + 1; |
283 | if ((k < DownK + D) && (DownVector[vOffset + k+1] >= x)) |
284 | x = DownVector[vOffset + k+1]; |
285 | } |
286 | y = x - k; |
287 | |
288 | // find the end of the furthest reaching forward D-path in diagonal k. |
289 | while ((x < UpperA) && (y < UpperB) && (DataA.data[x] == DataB.data[y])) { |
290 | x++; y++; |
291 | } |
292 | DownVector[vOffset + k] = x; |
293 | |
294 | // overlap ? |
295 | if (oddDelta && (UpK-D < k) && (k < UpK+D)) { |
296 | if (UpVector[vOffset + k] <= DownVector[vOffset + k]) { |
297 | ret.x = DownVector[vOffset + k]; |
298 | ret.y = DownVector[vOffset + k] - k; |
ret.u = UpVector[vOffset + k]; | |
ret.v = UpVector[vOffset + k] - k; | |
299 | // ret.u = UpVector[vOffset + k]; // 2002.09.20: no need for 2 points |
300 | // ret.v = UpVector[vOffset + k] - k; |
301 | return (ret); |
302 | } // if |
303 | } // if |
304 | |
305 | } // for k |
306 | |
307 | // Extend the reverse path. |
308 | for (int k = UpK - D; k <= UpK + D; k += 2) { |
309 | // Debug.Write(0, "SMS", "extend reverse path " + k.ToString()); |
310 | |
311 | // find the only or better starting point |
312 | int x, y; |
313 | if (k == UpK + D) { |
314 | x = UpVector[vOffset + k-1]; // up |
315 | } else { |
316 | x = UpVector[vOffset + k+1] - 1; // left |
317 | if ((k > UpK - D) && (UpVector[vOffset + k-1] < x)) |
318 | x = UpVector[vOffset + k-1]; // up |
319 | } // if |
320 | y = x - k; |
321 | |
322 | while ((x > LowerA) && (y > LowerB) && (DataA.data[x-1] == DataB.data[y-1])) { |
323 | x--; y--; // diagonal |
324 | } |
325 | UpVector[vOffset + k] = x; |
326 | |
327 | // overlap ? |
328 | if (! oddDelta && (DownK-D <= k) && (k <= DownK+D)) { |
329 | if (UpVector[vOffset + k] <= DownVector[vOffset + k]) { |
330 | ret.x = DownVector[vOffset + k]; |
331 | ret.y = DownVector[vOffset + k] - k; |
ret.u = UpVector[vOffset + k]; | |
ret.v = UpVector[vOffset + k] - k; | |
332 | // ret.u = UpVector[vOffset + k]; // 2002.09.20: no need for 2 points |
333 | // ret.v = UpVector[vOffset + k] - k; |
334 | return (ret); |
335 | } // if |
336 | } // if |
337 | |
338 | } // for k |
339 | |
340 | } // for D |
341 | |
342 | throw new ApplicationException("the algorithm should never come here."); |
343 | } // SMS |
344 | |
345 | |
346 | /// <summary> |
347 | /// This is the divide-and-conquer implementation of the longes common-subsequence (LCS) |
348 | /// algorithm. |
349 | /// The published algorithm passes recursively parts of the A and B sequences. |
350 | /// To avoid copying these arrays the lower and upper bounds are passed while the sequences stay constant. |
351 | /// </summary> |
352 | /// <param name="DataA">sequence A</param> |
353 | /// <param name="LowerA">lower bound of the actual range in DataA</param> |
354 | /// <param name="UpperA">upper bound of the actual range in DataA (exclusive)</param> |
355 | /// <param name="DataB">sequence B</param> |
356 | /// <param name="LowerB">lower bound of the actual range in DataB</param> |
357 | /// <param name="UpperB">upper bound of the actual range in DataB (exclusive)</param> |
private void LCS(DiffData DataA, int LowerA, int UpperA, DiffData DataB, int LowerB, int UpperB) | |
{ | |
358 | private void LCS(DiffData DataA, int LowerA, int UpperA, DiffData DataB, int LowerB, int UpperB) { |
359 | // Debug.Write(2, "LCS", String.Format("Analyse the box: A[{0}-{1}] to B[{2}-{3}]", LowerA, UpperA, LowerB, UpperB)); |
360 | |
361 | // Fast walkthrough equal lines at the start |
362 | while (LowerA < UpperA && LowerB < UpperB && DataA.data[LowerA] == DataB.data[LowerB]) { |
363 | LowerA++; LowerB++; |
364 | } |
365 | |
366 | // Fast walkthrough equal lines at the end |
while (UpperA > LowerA && UpperB > LowerB && DataA.data[UpperA - 1] == DataB.data[UpperB - 1]) { | |
367 | while (LowerA < UpperA && LowerB < UpperB && DataA.data[UpperA-1] == DataB.data[UpperB-1]) { |
368 | --UpperA; --UpperB; |
369 | } |
370 | |
371 | if (LowerA == UpperA) { |
372 | // mark as inserted lines. |
373 | while (LowerB < UpperB) |
374 | DataB.modified[LowerB++] = true; |
375 | |
376 | } else if (LowerB == UpperB) { |
377 | // mark as deleted lines. |
378 | while (LowerA < UpperA) |
379 | DataA.modified[LowerA++] = true; |
380 | |
381 | } else { |
382 | // Find the middle snakea and length of an optimal path for A and B |
383 | SMSRD smsrd = SMS(DataA, LowerA, UpperA, DataB, LowerB, UpperB); |
// Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1} to {2},{3}", smsrd.x, smsrd.y, smsrd.u, smsrd.v)); | |
384 | // Debug.Write(2, "MiddleSnakeData", String.Format("{0},{1}", smsrd.x, smsrd.y)); |
385 | |
// The path is from (x,y) to (u,v) | |
386 | // The path is from LowerX to (x,y) and (x,y) ot UpperX |
387 | LCS(DataA, LowerA, smsrd.x, DataB, LowerB, smsrd.y); |
LCS(DataA, smsrd.u, UpperA, DataB, smsrd.v, UpperB); | |
388 | LCS(DataA, smsrd.x, UpperA, DataB, smsrd.y, UpperB); // 2002.09.20: no need for 2 points |
389 | } |
390 | } // LCS() |
391 | |
392 | |
393 | /// <summary>Scan the tables of which lines are inserted and deleted, |
394 | /// producing an edit script in forward order. |
395 | /// </summary> |
396 | /// dynamic array |
397 | private Item [] CreateDiffs() { |
398 | ArrayList a = new ArrayList(); |
399 | Item aItem; |
400 | Item []result; |
401 | |
402 | int StartA, StartB; |
int LineA = 0; | |
int LineB = 0; | |
403 | int LineA, LineB; |
404 | |
405 | LineA = 0; |
406 | LineB = 0; |
407 | while (LineA < DataA.Length || LineB < DataB.Length) { |
if ((! DataA.modified[LineA]) && (! DataB.modified[LineB])) { | |
408 | if ((LineA < DataA.Length) && (! DataA.modified[LineA]) |
409 | && (LineB < DataB.Length) && (! DataB.modified[LineB])) { |
410 | // equal lines |
411 | LineA++; |
412 | LineB++; |
413 | |
414 | } else { |
415 | // maybe deleted and/or inserted lines |
416 | StartA = LineA; |
417 | StartB = LineB; |
418 | |
while (LineA < DataA.Length && DataA.modified[LineA]) | |
419 | while (LineA < DataA.Length && (LineB >= DataB.Length || DataA.modified[LineA])) |
420 | // while (LineA < DataA.Length && DataA.modified[LineA]) |
421 | LineA++; |
422 | |
while (LineB < DataB.Length && DataB.modified[LineB]) | |
423 | while (LineB < DataB.Length && (LineA >= DataA.Length || DataB.modified[LineB])) |
424 | // while (LineB < DataB.Length && DataB.modified[LineB]) |
425 | LineB++; |
426 | |
427 | if ((StartA < LineA) || (StartB < LineB)) { |
428 | // store a new difference-item |
429 | aItem = new Item(); |
430 | aItem.StartA = StartA; |
431 | aItem.StartB = StartB; |
432 | aItem.deletedA = LineA - StartA; |
433 | aItem.insertedB = LineB - StartB; |
434 | a.Add(aItem); |
435 | } // if |
436 | } // if |
437 | } // while |
438 | |
439 | result = new Item[a.Count]; |
440 | a.CopyTo(result); |
441 | |
442 | return (result); |
443 | } |
444 | |
445 | } // class Diff |
446 | |
447 | /// <summary>Data on one input file being compared. |
448 | /// </summary> |
449 | internal class DiffData { |
450 | |
451 | /// <summary>Number of elements (lines).</summary> |
452 | internal int Length; |
453 | |
454 | /// <summary>Buffer of numbers that will be compared.</summary> |
455 | internal int[] data; |
456 | |
457 | /// <summary> |
458 | /// Array of booleans that flag for modified data. |
459 | /// This is the result of the diff. |
460 | /// This means deletedA in the first Data or inserted in the second Data. |
461 | /// </summary> |
462 | internal bool[] modified; |
463 | |
464 | /// <summary> |
465 | /// Initialize the Diff-Data buffer. |
466 | /// </summary> |
467 | /// <param name="data">reference to the buffer</param> |
468 | internal DiffData(int[] initData) { |
469 | data = initData; |
470 | Length = initData.Length; |
471 | modified = new bool[Length + 2]; |
472 | } // DiffData |
473 | |
474 | } // class DiffData |
475 | |
476 | } // namespace |
This page is part of the http://www.mathertel.de/ web site.