Mercurial > jhg
comparison src/org/tmatesoft/hg/internal/PatchGenerator.java @ 533:e6f72c9829a6
Generate patches using diff algorithm
| author | Artem Tikhomirov <tikhomirov.artem@gmail.com> | 
|---|---|
| date | Wed, 30 Jan 2013 15:48:36 +0100 | 
| parents | |
| children | dd4f6311af52 | 
   comparison
  equal
  deleted
  inserted
  replaced
| 532:688c1ab113bb | 533:e6f72c9829a6 | 
|---|---|
| 1 /* | |
| 2 * Copyright (c) 2013 TMate Software Ltd | |
| 3 * | |
| 4 * This program is free software; you can redistribute it and/or modify | |
| 5 * it under the terms of the GNU General Public License as published by | |
| 6 * the Free Software Foundation; version 2 of the License. | |
| 7 * | |
| 8 * This program is distributed in the hope that it will be useful, | |
| 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 11 * GNU General Public License for more details. | |
| 12 * | |
| 13 * For information on how to redistribute this software under | |
| 14 * the terms of a license other than GNU General Public License | |
| 15 * contact TMate Software at support@hg4j.com | |
| 16 */ | |
| 17 package org.tmatesoft.hg.internal; | |
| 18 | |
| 19 import java.util.ArrayList; | |
| 20 import java.util.HashMap; | |
| 21 import java.util.Map; | |
| 22 | |
| 23 import org.tmatesoft.hg.repo.HgDataFile; | |
| 24 import org.tmatesoft.hg.repo.HgLookup; | |
| 25 import org.tmatesoft.hg.repo.HgRepository; | |
| 26 | |
| 27 /** | |
| 28 * Mercurial cares about changes only up to the line level, e.g. a simple file version bump in manifest looks like (RevlogDump output): | |
| 29 * | |
| 30 * 522: 233748 0 103 17438 433 522 521 -1 756073cf2321df44d3ed0585f2a5754bc8a1b2f6 | |
| 31 * <PATCH>: | |
| 32 * 3487..3578, 91:src/org/tmatesoft/hg/core/HgIterateDirection.java\00add61a8a665c5d8f092210767f812fe0d335ac8 | |
| 33 * | |
| 34 * I.e. for the {fname}{revision} entry format of manifest, not only {revision} is changed, but the whole line, with unchanged {fname} is recorded | |
| 35 * in the patch. | |
| 36 * | |
| 37 * Mercurial paper describes reasons for choosing this approach to delta generation, too. | |
| 38 * | |
| 39 * | |
| 40 * @author Artem Tikhomirov | |
| 41 * @author TMate Software Ltd. | |
| 42 */ | |
| 43 public class PatchGenerator { | |
| 44 | |
| 45 private Map<ChunkSequence.ByteChain, IntVector> chunk2UseIndex; | |
| 46 private ChunkSequence seq1, seq2; | |
| 47 | |
| 48 // get filled by #longestMatch, track start of common sequence in seq1 and seq2, respectively | |
| 49 private int matchStartS1, matchStartS2; | |
| 50 // get filled by #findMatchingBlocks, track start of changed/unknown sequence in seq1 and seq2 | |
| 51 private int changeStartS1, changeStartS2; | |
| 52 | |
| 53 public void init(byte[] data1, byte[] data2) { | |
| 54 seq1 = new ChunkSequence(data1); | |
| 55 seq1.splitByNewlines(); | |
| 56 seq2 = new ChunkSequence(data2); | |
| 57 seq2.splitByNewlines(); | |
| 58 prepare(seq2); | |
| 59 } | |
| 60 | |
| 61 private void prepare(ChunkSequence s2) { | |
| 62 chunk2UseIndex = new HashMap<ChunkSequence.ByteChain, IntVector>(); | |
| 63 for (int i = 0, len = s2.chunkCount(); i < len; i++) { | |
| 64 ChunkSequence.ByteChain bc = s2.chunk(i); | |
| 65 IntVector loc = chunk2UseIndex.get(bc); | |
| 66 if (loc == null) { | |
| 67 chunk2UseIndex.put(bc, loc = new IntVector()); | |
| 68 } | |
| 69 loc.add(i); | |
| 70 // bc.registerUseIn(i) - BEWARE, use of bc here is incorrect | |
| 71 // in this case need to find the only ByteChain to keep indexes | |
| 72 // i.e. when there are few equal ByteChain instances, notion of "usedIn" shall be either shared (reference same vector) | |
| 73 // or kept within only one of them | |
| 74 } | |
| 75 // for (ChunkSequence.ByteChain bc : chunk2UseIndex.keySet()) { | |
| 76 // System.out.printf("%s: {", new String(bc.data())); | |
| 77 // for (int x : chunk2UseIndex.get(bc).toArray()) { | |
| 78 // System.out.printf(" %d,", x); | |
| 79 // } | |
| 80 // System.out.println("}"); | |
| 81 // } | |
| 82 } | |
| 83 | |
| 84 public void findMatchingBlocks() { | |
| 85 changeStartS1 = changeStartS2 = 0; | |
| 86 findMatchingBlocks(0, seq1.chunkCount(), 0, seq2.chunkCount()); | |
| 87 if (changeStartS1 < seq1.chunkCount() || changeStartS2 < seq2.chunkCount()) { | |
| 88 reportDeltaElement(seq1.chunkCount(), seq2.chunkCount()); | |
| 89 } | |
| 90 } | |
| 91 | |
| 92 /** | |
| 93 * implementation based on Python's difflib.py and SequenceMatcher | |
| 94 */ | |
| 95 public int longestMatch(int startS1, int endS1, int startS2, int endS2) { | |
| 96 matchStartS1 = matchStartS2 = 0; | |
| 97 int maxLength = 0; | |
| 98 IntMap<Integer> chunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 99 for (int i = startS1; i < endS1; i++) { | |
| 100 ChunkSequence.ByteChain bc = seq1.chunk(i); | |
| 101 IntMap<Integer> newChunkIndex2MatchCount = new IntMap<Integer>(8); | |
| 102 IntVector occurencesInS2 = chunk2UseIndex.get(bc); | |
| 103 if (occurencesInS2 == null) { | |
| 104 // chunkIndex2MatchCount.clear(); // TODO need clear instead of new instance | |
| 105 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
| 106 continue; | |
| 107 } | |
| 108 for (int j : occurencesInS2.toArray()) { | |
| 109 // s1[i] == s2[j] | |
| 110 if (j < startS2) { | |
| 111 continue; | |
| 112 } | |
| 113 if (j >= endS2) { | |
| 114 break; | |
| 115 } | |
| 116 int prevChunkMatches = chunkIndex2MatchCount.containsKey(j-1) ? chunkIndex2MatchCount.get(j-1) : 0; | |
| 117 int k = prevChunkMatches + 1; | |
| 118 newChunkIndex2MatchCount.put(j, k); | |
| 119 if (k > maxLength) { | |
| 120 matchStartS1 = i-k+1; | |
| 121 matchStartS2 = j-k+1; | |
| 122 maxLength = k; | |
| 123 } | |
| 124 } | |
| 125 chunkIndex2MatchCount = newChunkIndex2MatchCount; | |
| 126 } | |
| 127 return maxLength; | |
| 128 } | |
| 129 | |
| 130 public void findMatchingBlocks(int startS1, int endS1, int startS2, int endS2) { | |
| 131 int matchLength = longestMatch(startS1, endS1, startS2, endS2); | |
| 132 if (matchLength > 0) { | |
| 133 final int saveStartS1 = matchStartS1, saveStartS2 = matchStartS2; | |
| 134 if (startS1 < matchStartS1 && startS2 < matchStartS2) { | |
| 135 findMatchingBlocks(startS1, matchStartS1, startS2, matchStartS2); | |
| 136 } | |
| 137 reportDeltaElement(saveStartS1, saveStartS2); | |
| 138 changeStartS1 = saveStartS1 + matchLength; | |
| 139 changeStartS2 = saveStartS2 + matchLength; | |
| 140 // System.out.printf("match: from line #%d and line #%d of length %d\n", saveStartS1, saveStartS2, matchLength); | |
| 141 if (saveStartS1+matchLength < endS1 && saveStartS2+matchLength < endS2) { | |
| 142 findMatchingBlocks(saveStartS1 + matchLength, endS1, saveStartS2 + matchLength, endS2); | |
| 143 } | |
| 144 } | |
| 145 } | |
| 146 | |
| 147 private Patch deltaCollector; | |
| 148 | |
| 149 private void reportDeltaElement(int i, int j) { | |
| 150 if (changeStartS1 < i) { | |
| 151 if (changeStartS2 < j) { | |
| 152 System.out.printf("changed [%d..%d) with [%d..%d)\n", changeStartS1, i, changeStartS2, j); | |
| 153 } else { | |
| 154 assert changeStartS2 == j; | |
| 155 System.out.printf("deleted [%d..%d)\n", changeStartS1, i); | |
| 156 } | |
| 157 if (deltaCollector != null) { | |
| 158 int from = seq1.chunk(changeStartS1).getOffset(); | |
| 159 int to = seq1.chunk(i).getOffset(); | |
| 160 byte[] data = seq2.data(changeStartS2, j); | |
| 161 deltaCollector.add(from, to, data); | |
| 162 } | |
| 163 } else { | |
| 164 assert changeStartS1 == i; | |
| 165 if(changeStartS2 < j) { | |
| 166 System.out.printf("added [%d..%d)\n", changeStartS2, j); | |
| 167 } else { | |
| 168 assert changeStartS2 == j; | |
| 169 System.out.printf("adjustent equal blocks %d, %d and %d,%d\n", changeStartS1, i, changeStartS2, j); | |
| 170 } | |
| 171 if (deltaCollector != null) { | |
| 172 int insPoint = seq1.chunk(changeStartS1).getOffset(); | |
| 173 byte[] data = seq2.data(changeStartS2, j); | |
| 174 deltaCollector.add(insPoint, insPoint, data); | |
| 175 } | |
| 176 } | |
| 177 } | |
| 178 | |
| 179 public static void main(String[] args) throws Exception { | |
| 180 HgRepository repo = new HgLookup().detectFromWorkingDir(); | |
| 181 HgDataFile df = repo.getFileNode("cmdline/org/tmatesoft/hg/console/Main.java"); | |
| 182 ByteArrayChannel bac1, bac2; | |
| 183 df.content(80, bac1 = new ByteArrayChannel()); | |
| 184 df.content(81, bac2 = new ByteArrayChannel()); | |
| 185 // String s1 = "line 1\nline 2\r\nline 3\n\nline 1\nline 2"; | |
| 186 // String s2 = "abc\ncdef\r\nline 2\r\nline 3\nline 2"; | |
| 187 PatchGenerator pg = new PatchGenerator(); | |
| 188 pg.init(bac1.toArray(), bac2.toArray()); | |
| 189 pg.findMatchingBlocks(); | |
| 190 } | |
| 191 | |
| 192 public Patch delta(byte[] prev, byte[] content) { | |
| 193 deltaCollector = new Patch(); | |
| 194 init(prev, content); | |
| 195 findMatchingBlocks(); | |
| 196 return deltaCollector; | |
| 197 } | |
| 198 | |
| 199 private static class ChunkSequence { | |
| 200 | |
| 201 private final byte[] input; | |
| 202 private ArrayList<ByteChain> lines; | |
| 203 | |
| 204 public ChunkSequence(byte[] data) { | |
| 205 input = data; | |
| 206 } | |
| 207 | |
| 208 public void splitByNewlines() { | |
| 209 lines = new ArrayList<ByteChain>(); | |
| 210 int lastStart = 0; | |
| 211 for (int i = 0; i < input.length; i++) { | |
| 212 if (input[i] == '\n') { | |
| 213 lines.add(new ByteChain(lastStart, i+1)); | |
| 214 lastStart = i+1; | |
| 215 } else if (input[i] == '\r') { | |
| 216 if (i+1 < input.length && input[i+1] == '\n') { | |
| 217 i++; | |
| 218 } | |
| 219 lines.add(new ByteChain(lastStart, i+1)); | |
| 220 lastStart = i+1; | |
| 221 } | |
| 222 } | |
| 223 if (lastStart < input.length) { | |
| 224 lines.add(new ByteChain(lastStart, input.length)); | |
| 225 } | |
| 226 } | |
| 227 | |
| 228 public ByteChain chunk(int index) { | |
| 229 return lines.get(index); | |
| 230 } | |
| 231 | |
| 232 public int chunkCount() { | |
| 233 return lines.size(); | |
| 234 } | |
| 235 | |
| 236 public byte[] data(int chunkFrom, int chunkTo) { | |
| 237 if (chunkFrom == chunkTo) { | |
| 238 return new byte[0]; | |
| 239 } | |
| 240 int from = chunk(chunkFrom).getOffset(), to = chunk(chunkTo).getOffset(); | |
| 241 byte[] rv = new byte[to - from]; | |
| 242 System.arraycopy(input, from, rv, 0, rv.length); | |
| 243 return rv; | |
| 244 } | |
| 245 | |
| 246 | |
| 247 final class ByteChain { | |
| 248 private final int start, end; | |
| 249 private final int hash; | |
| 250 | |
| 251 ByteChain(int s, int e) { | |
| 252 start = s; | |
| 253 end = e; | |
| 254 hash = calcHash(input, s, e); | |
| 255 } | |
| 256 | |
| 257 /** | |
| 258 * byte offset of the this ByteChain inside ChainSequence | |
| 259 */ | |
| 260 public int getOffset() { | |
| 261 return start; | |
| 262 } | |
| 263 | |
| 264 public byte[] data() { | |
| 265 byte[] rv = new byte[end - start]; | |
| 266 System.arraycopy(input, start, rv, 0, rv.length); | |
| 267 return rv; | |
| 268 } | |
| 269 | |
| 270 @Override | |
| 271 public boolean equals(Object obj) { | |
| 272 if (obj == null || obj.getClass() != ByteChain.class) { | |
| 273 return false; | |
| 274 } | |
| 275 ByteChain other = (ByteChain) obj; | |
| 276 if (other.hash != hash || other.end - other.start != end - start) { | |
| 277 return false; | |
| 278 } | |
| 279 return other.match(input, start); | |
| 280 } | |
| 281 | |
| 282 private boolean match(byte[] oi, int from) { | |
| 283 for (int i = start, j = from; i < end; i++, j++) { | |
| 284 if (ChunkSequence.this.input[i] != oi[j]) { | |
| 285 return false; | |
| 286 } | |
| 287 } | |
| 288 return true; | |
| 289 } | |
| 290 | |
| 291 @Override | |
| 292 public int hashCode() { | |
| 293 return hash; | |
| 294 } | |
| 295 | |
| 296 @Override | |
| 297 public String toString() { | |
| 298 return String.format("[@%d\"%s\"]", start, new String(data())); | |
| 299 } | |
| 300 } | |
| 301 | |
| 302 // same as Arrays.hashCode(byte[]), just for a slice of a bigger array | |
| 303 static int calcHash(byte[] data, int from, int to) { | |
| 304 int result = 1; | |
| 305 for (int i = from; i < to; i++) { | |
| 306 result = 31 * result + data[i]; | |
| 307 } | |
| 308 return result; | |
| 309 } | |
| 310 } | |
| 311 } | 
