Mercurial > hg4j
comparison src/org/tmatesoft/hg/internal/DiffHelper.java @ 551:4ea0351ca878
Better (precise) name for diff facility, tests
| author | Artem Tikhomirov <tikhomirov.artem@gmail.com> | 
|---|---|
| date | Wed, 20 Feb 2013 18:19:52 +0100 | 
| parents | src/org/tmatesoft/hg/internal/PatchGenerator.java@83afa680555d | 
| children | e55f17a7a195 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 550:c1478cc31f45 | 551:4ea0351ca878 | 
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2013 TMate Software Ltd | |
| 3 * | |
| 4 * This program is free software; you can redistribute it and/or modify | |
| 5 * it under the terms of the GNU General Public License as published by | |
| 6 * the Free Software Foundation; version 2 of the License. | |
| 7 * | |
| 8 * This program is distributed in the hope that it will be useful, | |
| 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 * GNU General Public License for more details. | |
| 12 * | |
| 13 * For information on how to redistribute this software under | |
| 14 * the terms of a license other than GNU General Public License | |
| 15 * contact TMate Software at support@hg4j.com | |
| 16 */ | |
| 17 package org.tmatesoft.hg.internal; | |
| 18 | |
| 19 import java.util.ArrayList; | |
| 20 import java.util.HashMap; | |
| 21 import java.util.Map; | |
| 22 | |
| 23 import org.tmatesoft.hg.repo.HgInvalidStateException; | |
| 24 | |
| 25 /** | |
| 26 * Mercurial cares about changes only up to the line level, e.g. a simple file version dump in manifest looks like (RevlogDump output): | |
| 27 * | |
| 28 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 | |
| 29 * <PATCH>: | |
| 30 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 | |
| 31 * | |
| 32 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded | |
| 33 * in the patch. | |
| 34 * | |
| 35 * Mercurial paper describes reasons for choosing this approach to delta generation, too. | |
| 36 * | |
| 37 * | |
| 38 * @author Artem Tikhomirov | |
| 39 * @author TMate Software Ltd. | |
| 40 */ | |
| 41 public class DiffHelper<T extends DiffHelper.ChunkSequence<?>> { | |
| 42 | |
| 43 private Map<Object, IntVector> chunk2UseIndex; | |
| 44 private T seq1, seq2; | |
| 45 | |
| 46 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively | |
| 47 private int matchStartS1, matchStartS2; | |
| 48 | |
| 49 private MatchInspector<T> matchInspector; | |
| 50 | |
| 51 public void init(T s1, T s2) { | |
| 52 seq1 = s1; | |
| 53 seq2 = s2; | |
| 54 prepare(s2); | |
| 55 } | |
| 56 | |
| 57 public void init(T s1) { | |
| 58 if (seq2 == null) { | |
| 59 throw new IllegalStateException("Use this #init() only when target sequence shall be matched against different origin"); | |
| 60 } | |
| 61 seq1 = s1; | |
| 62 } | |
| 63 | |
| 64 | |
| 65 private void prepare(T s2) { | |
| 66 chunk2UseIndex = new HashMap<Object, IntVector>(); | |
| 67 for (int i = 0, len = s2.chunkCount(); i < len; i++) { | |
| 68 Object bc = s2.chunk(i); | |
| 69 IntVector loc = chunk2UseIndex.get(bc); | |
| 70 if (loc == null) { | |
| 71 chunk2UseIndex.put(bc, loc = new IntVector()); | |
| 72 } | |
| 73 loc.add(i); | |
| 74 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect | |
| 75 // in this case need to find the only ByteChain to keep indexes | |
| 76 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) | |
| 77 // or kept within only one of them | |
| 78 } | |
| 79 } | |
| 80 | |
| 81 public void findMatchingBlocks(MatchInspector<T> insp) { | |
| 82 insp.begin(seq1, seq2); | |
| 83 matchInspector = insp; | |
| 84 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); | |
| 85 insp.end(); | |
| 86 } | |
| 87 | |
| 88 /** | |
| 89 * implementation based on Python's difflib.py and SequenceMatcher | |
| 90 */ | |
| 91 public int longestMatch(int startS1, int endS1, int startS2, int endS2) { | |
| 92 matchStartS1 = matchStartS2 = 0; | |
| 93 int maxLength = 0; | |
| 94 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 95 for (int i = startS1; i < endS1; i++) { | |
| 96 Object bc = seq1.chunk(i); | |
| 97 IntVector occurencesInS2 = chunk2UseIndex.get(bc); | |
| 98 if (occurencesInS2 == null) { | |
| 99 chunkIndex2MatchCount.clear(); | |
| 100 continue; | |
| 101 } | |
| 102 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 103 for (int j : occurencesInS2.toArray()) { | |
| 104 // s1[i] == s2[j] | |
| 105 if (j < startS2) { | |
| 106 continue; | |
| 107 } | |
| 108 if (j >= endS2) { | |
| 109 break; | |
| 110 } | |
| 111 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; | |
| 112 int k = prevChunkMatches + 1; | |
| 113 newChunkIndex2MatchCount.put(j, k); | |
| 114 if (k > maxLength) { | |
| 115 matchStartS1 = i-k+1; | |
| 116 matchStartS2 = j-k+1; | |
| 117 maxLength = k; | |
| 118 } | |
| 119 } | |
| 120 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
| 121 } | |
| 122 return maxLength; | |
| 123 } | |
| 124 | |
| 125 private void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { | |
| 126 int matchLength = longestMatch(startS1, endS1, startS2, endS2); | |
| 127 if (matchLength > 0) { | |
| 128 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; | |
| 129 if (startS1 < matchStartS1 && startS2 < matchStartS2) { | |
| 130 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); | |
| 131 } | |
| 132 matchInspector.match(saveStartS1, saveStartS2, matchLength); | |
| 133 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { | |
| 134 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); | |
| 135 } | |
| 136 } | |
| 137 } | |
| 138 | |
| 139 public interface MatchInspector<T extends ChunkSequence<?>> { | |
| 140 void begin(T s1, T s2); | |
| 141 void match(int startSeq1, int startSeq2, int matchLength); | |
| 142 void end(); | |
| 143 } | |
| 144 | |
| 145 static class MatchDumpInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
| 146 private int matchCount; | |
| 147 | |
| 148 public void begin(T s1, T s2) { | |
| 149 matchCount = 0; | |
| 150 } | |
| 151 | |
| 152 public void match(int startSeq1, int startSeq2, int matchLength) { | |
| 153 matchCount++; | |
| 154 System.out.printf("match #%d: from line #%d and line #%d of length %d\n", matchCount, startSeq1, startSeq2, matchLength); | |
| 155 } | |
| 156 | |
| 157 public void end() { | |
| 158 if (matchCount == 0) { | |
| 159 System.out.println("NO MATCHES FOUND!"); | |
| 160 } | |
| 161 } | |
| 162 } | |
| 163 | |
| 164 /** | |
| 165 * Matcher implementation that translates "match/equal" notification to a delta-style "added/removed/changed". | |
| 166 */ | |
| 167 public static class DeltaInspector<T extends ChunkSequence<?>> implements MatchInspector<T> { | |
| 168 protected int changeStartS1, changeStartS2; | |
| 169 protected T seq1, seq2; | |
| 170 | |
| 171 public void begin(T s1, T s2) { | |
| 172 seq1 = s1; | |
| 173 seq2 = s2; | |
| 174 changeStartS1 = changeStartS2 = 0; | |
| 175 } | |
| 176 | |
| 177 public void match(int startSeq1, int startSeq2, int matchLength) { | |
| 178 reportDeltaElement(startSeq1, startSeq2, matchLength); | |
| 179 changeStartS1 = startSeq1 + matchLength; | |
| 180 changeStartS2 = startSeq2 + matchLength; | |
| 181 } | |
| 182 | |
| 183 public void end() { | |
| 184 if (changeStartS1 < seq1.chunkCount()-1 || changeStartS2 < seq2.chunkCount()-1) { | |
| 185 reportDeltaElement(seq1.chunkCount()-1, seq2.chunkCount()-1, 0); | |
| 186 } | |
| 187 } | |
| 188 | |
| 189 protected void reportDeltaElement(int matchStartSeq1, int matchStartSeq2, int matchLength) { | |
| 190 if (changeStartS1 < matchStartSeq1) { | |
| 191 if (changeStartS2 < matchStartSeq2) { | |
| 192 changed(changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2); | |
| 193 } else { | |
| 194 assert changeStartS2 == matchStartSeq2; | |
| 195 deleted(matchStartSeq2, changeStartS1, matchStartSeq1); | |
| 196 } | |
| 197 } else { | |
| 198 assert changeStartS1 == matchStartSeq1; | |
| 199 if(changeStartS2 < matchStartSeq2) { | |
| 200 added(changeStartS1, changeStartS2, matchStartSeq2); | |
| 201 } else { | |
| 202 assert changeStartS2 == matchStartSeq2; | |
| 203 if (matchStartSeq1 > 0 || matchStartSeq2 > 0) { | |
| 204 // FIXME perhaps, exception is too much for the case | |
| 205 // once diff is covered with tests, replace with assert false : msg; | |
| 206 throw new HgInvalidStateException(String.format("adjustent equal blocks %d, %d and %d,%d", changeStartS1, matchStartSeq1, changeStartS2, matchStartSeq2)); | |
| 207 } | |
| 208 } | |
| 209 } | |
| 210 if (matchLength > 0) { | |
| 211 unchanged(matchStartSeq1, matchStartSeq2, matchLength); | |
| 212 } | |
| 213 } | |
| 214 | |
| 215 /** | |
| 216 * [s1From..s1To) replaced with [s2From..s2To) | |
| 217 */ | |
| 218 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
| 219 // NO-OP | |
| 220 } | |
| 221 | |
| 222 protected void deleted(int s2DeletePoint, int s1From, int s1To) { | |
| 223 // NO-OP | |
| 224 } | |
| 225 | |
| 226 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
| 227 // NO-OP | |
| 228 } | |
| 229 | |
| 230 protected void unchanged(int s1From, int s2From, int length) { | |
| 231 // NO-OP | |
| 232 } | |
| 233 } | |
| 234 | |
| 235 static class DeltaDumpInspector<T extends ChunkSequence<?>> extends DeltaInspector<T> { | |
| 236 | |
| 237 @Override | |
| 238 protected void changed(int s1From, int s1To, int s2From, int s2To) { | |
| 239 System.out.printf("changed [%d..%d) with [%d..%d)\n", s1From, s1To, s2From, s2To); | |
| 240 } | |
| 241 | |
| 242 @Override | |
| 243 protected void deleted(int s2DeletionPoint, int s1From, int s1To) { | |
| 244 System.out.printf("deleted [%d..%d)\n", s1From, s1To); | |
| 245 } | |
| 246 | |
| 247 @Override | |
| 248 protected void added(int s1InsertPoint, int s2From, int s2To) { | |
| 249 System.out.printf("added [%d..%d) at %d\n", s2From, s2To, s1InsertPoint); | |
| 250 } | |
| 251 | |
| 252 @Override | |
| 253 protected void unchanged(int s1From, int s2From, int length) { | |
| 254 System.out.printf("same [%d..%d) and [%d..%d)\n", s1From, s1From + length, s2From, s2From + length); | |
| 255 } | |
| 256 } | |
| 257 | |
| 258 /** | |
| 259 * Generic sequence of chunk, where chunk is anything comparable to another chunk, e.g. a string or a single char | |
| 260 * Sequence diff algorithm above doesn't care about sequence nature. | |
| 261 */ | |
| 262 public interface ChunkSequence<T> { | |
| 263 public T chunk(int index); | |
| 264 public int chunkCount(); | |
| 265 } | |
| 266 | |
| 267 public static final class LineSequence implements ChunkSequence<LineSequence.ByteChain> { | |
| 268 | |
| 269 private final byte[] input; | |
| 270 private ArrayList<ByteChain> lines; | |
| 271 | |
| 272 public LineSequence(byte[] data) { | |
| 273 input = data; | |
| 274 } | |
| 275 | |
| 276 public static LineSequence newlines(byte[] array) { | |
| 277 return new LineSequence(array).splitByNewlines(); | |
| 278 } | |
| 279 | |
| 280 // sequence ends with fake, empty line chunk | |
| 281 public LineSequence splitByNewlines() { | |
| 282 lines = new ArrayList<ByteChain>(); | |
| 283 int lastStart = 0; | |
| 284 for (int i = 0; i < input.length; i++) { | |
| 285 if (input[i] == '\n') { | |
| 286 lines.add(new ByteChain(lastStart, i+1)); | |
| 287 lastStart = i+1; | |
| 288 } else if (input[i] == '\r') { | |
| 289 if (i+1 < input.length && input[i+1] == '\n') { | |
| 290 i++; | |
| 291 } | |
| 292 lines.add(new ByteChain(lastStart, i+1)); | |
| 293 lastStart = i+1; | |
| 294 } | |
| 295 } | |
| 296 if (lastStart < input.length) { | |
| 297 lines.add(new ByteChain(lastStart, input.length)); | |
| 298 } | |
| 299 // empty chunk to keep offset of input end | |
| 300 lines.add(new ByteChain(input.length)); | |
| 301 return this; | |
| 302 } | |
| 303 | |
| 304 public ByteChain chunk(int index) { | |
| 305 return lines.get(index); | |
| 306 } | |
| 307 | |
| 308 public int chunkCount() { | |
| 309 return lines.size(); | |
| 310 } | |
| 311 | |
| 312 public byte[] data(int chunkFrom, int chunkTo) { | |
| 313 if (chunkFrom == chunkTo) { | |
| 314 return new byte[0]; | |
| 315 } | |
| 316 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); | |
| 317 byte[] rv = new byte[to - from]; | |
| 318 System.arraycopy(input, from, rv, 0, rv.length); | |
| 319 return rv; | |
| 320 } | |
| 321 | |
| 322 | |
| 323 final class ByteChain { | |
| 324 private final int start, end; | |
| 325 private final int hash; | |
| 326 | |
| 327 /** | |
| 328 * construct a chunk with a sole purpose to keep | |
| 329 * offset of the data end | |
| 330 */ | |
| 331 ByteChain(int offset) { | |
| 332 start = end = offset; | |
| 333 // ensure this chunk doesn't match trailing chunk of another sequence | |
| 334 hash = System.identityHashCode(this); | |
| 335 } | |
| 336 | |
| 337 ByteChain(int s, int e) { | |
| 338 start = s; | |
| 339 end = e; | |
| 340 hash = calcHash(input, s, e); | |
| 341 } | |
| 342 | |
| 343 /** | |
| 344 * byte offset of the this ByteChain inside ChainSequence | |
| 345 */ | |
| 346 public int getOffset() { | |
| 347 return start; | |
| 348 } | |
| 349 | |
| 350 public byte[] data() { | |
| 351 byte[] rv = new byte[end - start]; | |
| 352 System.arraycopy(input, start, rv, 0, rv.length); | |
| 353 return rv; | |
| 354 } | |
| 355 | |
| 356 @Override | |
| 357 public boolean equals(Object obj) { | |
| 358 if (obj == null || obj.getClass() != ByteChain.class) { | |
| 359 return false; | |
| 360 } | |
| 361 ByteChain other = (ByteChain) obj; | |
| 362 if (other.hash != hash || other.end - other.start != end - start) { | |
| 363 return false; | |
| 364 } | |
| 365 return other.match(input, start); | |
| 366 } | |
| 367 | |
| 368 private boolean match(byte[] oi, int from) { | |
| 369 for (int i = start, j = from; i < end; i++, j++) { | |
| 370 if (LineSequence.this.input[i] != oi[j]) { | |
| 371 return false; | |
| 372 } | |
| 373 } | |
| 374 return true; | |
| 375 } | |
| 376 | |
| 377 @Override | |
| 378 public int hashCode() { | |
| 379 return hash; | |
| 380 } | |
| 381 | |
| 382 @Override | |
| 383 public String toString() { | |
| 384 return String.format("[@%d\"%s\"]", start, new String(data())); | |
| 385 } | |
| 386 } | |
| 387 | |
| 388 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array | |
| 389 static int calcHash(byte[] data, int from, int to) { | |
| 390 int result = 1; | |
| 391 for (int i = from; i < to; i++) { | |
| 392 result = 31 * result + data[i]; | |
| 393 } | |
| 394 return result; | |
| 395 } | |
| 396 } | |
| 397 } | 
