lib/model, lib/weakhash: Hash using adler32, add heuristic in puller

Adler32 is much faster, and the heuristic avoids the obvious cases where it will not help. GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/3872
2017-01-04 21:04:13 +00:00
parent 920274bce4
commit 29d010ec0e
14 changed files with 461 additions and 190 deletions
--- a/lib/weakhash/benchmark_test.go
+++ b/lib/weakhash/benchmark_test.go
@@ -9,9 +9,12 @@ package weakhash
 import (
 	"os"
 	"testing"
+
+	"github.com/chmduquesne/rollinghash/adler32"
 )

 const testFile = "../model/testdata/~syncthing~file.tmp"
+const size = 128 << 10

 func BenchmarkFind1MFile(b *testing.B) {
 	b.ReportAllocs()
@@ -21,10 +24,38 @@ func BenchmarkFind1MFile(b *testing.B) {
 		if err != nil {
 			b.Fatal(err)
 		}
-		_, err = Find(fd, []uint32{0, 1, 2}, 128<<10)
+		_, err = Find(fd, []uint32{0, 1, 2}, size)
 		if err != nil {
 			b.Fatal(err)
 		}
 		fd.Close()
 	}
 }
+
+// BenchmarkWeakHashAdler32 writes one zeroed size-byte buffer to a single adler32 hash b.N times.
+func BenchmarkWeakHashAdler32(b *testing.B) {
+	data := make([]byte, size)
+	hf := adler32.New()
+	for i := 0; i < b.N; i++ {
+		hf.Write(data)
+	}
+	// The sum is read once after the loop and discarded; SetBytes reports size bytes per iteration.
+	_ = hf.Sum32()
+	b.SetBytes(size)
+}
+
+// BenchmarkWeakHashAdler32Roll measures rolling single bytes through an adler32
+// hash that was first fed one size-byte Write; that setup is excluded from timing.
+func BenchmarkWeakHashAdler32Roll(b *testing.B) {
+	data := make([]byte, size)
+	hf := adler32.New()
+	hf.Write(data)
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		// Roll exactly size bytes per iteration so the work matches SetBytes(size).
+		for j := 0; j < size; j++ {
+			hf.Roll('a')
+		}
+	}
+	b.SetBytes(size)
+}
--- a/lib/weakhash/weakhash.go
+++ b/lib/weakhash/weakhash.go
@@ -8,22 +8,16 @@ package weakhash

 import (
 	"bufio"
-	"hash"
 	"io"
 	"os"
+
+	"github.com/chmduquesne/rollinghash/adler32"
 )

 const (
 	Size = 4
 )

-func NewHash(size int) hash.Hash32 {
-	return &digest{
-		buf:  make([]byte, size),
-		size: size,
-	}
-}
-
 // Find finds all the blocks of the given size within io.Reader that matches
 // the hashes provided, and returns a hash -> slice of offsets within reader
 // map, that produces the same weak hash.
@@ -33,7 +27,7 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er
 	}

 	r := bufio.NewReader(ir)
-	hf := NewHash(size)
+	hf := adler32.New()

 	n, err := io.CopyN(hf, r, int64(size))
 	if err == io.EOF {
@@ -66,56 +60,11 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er
 		} else if err != nil {
 			return offsets, err
 		}
-		hf.Write([]byte{bt})
+		hf.Roll(bt)
 	}
 	return offsets, nil
 }

-// Using this: http://tutorials.jenkov.com/rsync/checksums.html
-// Example implementations: https://gist.github.com/csabahenk/1096262/revisions
-// Alternative that could be used is adler32 http://blog.liw.fi/posts/rsync-in-python/#comment-fee8d5e07794fdba3fe2d76aa2706a13
-type digest struct {
-	buf  []byte
-	size int
-	a    uint16
-	b    uint16
-	j    int
-}
-
-func (d *digest) Write(data []byte) (int, error) {
-	for _, c := range data {
-		// TODO: Use this in Go 1.6
-		// d.a = d.a - uint16(d.buf[d.j]) + uint16(c)
-		// d.b = d.b - uint16(d.size)*uint16(d.buf[d.j]) + d.a
-		d.a -= uint16(d.buf[d.j])
-		d.a += uint16(c)
-		d.b -= uint16(d.size) * uint16(d.buf[d.j])
-		d.b += d.a
-
-		d.buf[d.j] = c
-		d.j = (d.j + 1) % d.size
-	}
-	return len(data), nil
-}
-
-func (d *digest) Reset() {
-	for i := range d.buf {
-		d.buf[i] = 0x0
-	}
-	d.a = 0
-	d.b = 0
-	d.j = 0
-}
-
-func (d *digest) Sum(b []byte) []byte {
-	r := d.Sum32()
-	return append(b, byte(r>>24), byte(r>>16), byte(r>>8), byte(r))
-}
-
-func (d *digest) Sum32() uint32 { return uint32(d.a) | (uint32(d.b) << 16) }
-func (digest) Size() int        { return Size }
-func (digest) BlockSize() int   { return 1 }
-
 func NewFinder(path string, size int, hashesToFind []uint32) (*Finder, error) {
 	file, err := os.Open(path)
 	if err != nil {
--- a/lib/weakhash/weakhash_test.go
+++ b/lib/weakhash/weakhash_test.go
@@ -18,129 +18,6 @@ import (
 )

 var payload = []byte("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz")
-var hashes = []uint32{
-	64225674,
-	64881038,
-	65536402,
-	66191766,
-	66847130,
-	67502494,
-	68157858,
-	68813222,
-	69468586,
-	70123950,
-	70779314,
-	71434678,
-	72090042,
-	72745406,
-	73400770,
-	74056134,
-	74711498,
-	75366862,
-	76022226,
-	76677590,
-	77332954,
-	77988318,
-	78643682,
-	77595084,
-	74842550,
-	70386080,
-	64225674,
-	64881038,
-	65536402,
-	66191766,
-	66847130,
-	67502494,
-	68157858,
-	68813222,
-	69468586,
-	70123950,
-	70779314,
-	71434678,
-	72090042,
-	72745406,
-	73400770,
-	74056134,
-	74711498,
-	75366862,
-	76022226,
-	76677590,
-	77332954,
-	77988318,
-	78643682,
-	77595084,
-	74842550,
-	70386080,
-	64225674,
-	64881038,
-	65536402,
-	66191766,
-	66847130,
-	67502494,
-	68157858,
-	68813222,
-	69468586,
-	70123950,
-	70779314,
-	71434678,
-	72090042,
-	72745406,
-	73400770,
-	74056134,
-	74711498,
-	75366862,
-	76022226,
-	76677590,
-	77332954,
-	77988318,
-	78643682,
-	77595084,
-	74842550,
-	70386080,
-	64225674,
-	64881038,
-	65536402,
-	66191766,
-	66847130,
-	67502494,
-	68157858,
-	68813222,
-	69468586,
-	70123950,
-	70779314,
-	71434678,
-	72090042,
-	72745406,
-	73400770,
-	74056134,
-	74711498,
-	75366862,
-	76022226,
-	76677590,
-	77332954,
-	77988318,
-	78643682,
-	71893365,
-	71893365,
-}
-
-// Tested using an alternative C implementation at https://gist.github.com/csabahenk/1096262
-func TestHashCorrect(t *testing.T) {
-	h := NewHash(Size)
-	pos := 0
-	for pos < Size {
-		h.Write([]byte{payload[pos]})
-		pos++
-	}
-
-	for i := 0; pos < len(payload); i++ {
-		if h.Sum32() != hashes[i] {
-			t.Errorf("mismatch at %d", i)
-		}
-		h.Write([]byte{payload[pos]})
-		pos++
-	}
-}

 func TestFinder(t *testing.T) {
 	f, err := ioutil.TempFile("", "")
@@ -154,7 +31,7 @@ func TestFinder(t *testing.T) {
 		t.Error(err)
 	}

-	hashes := []uint32{64881038, 65536402}
+	hashes := []uint32{65143183, 65798547}
 	finder, err := NewFinder(f.Name(), 4, hashes)
 	if err != nil {
 		t.Error(err)
@@ -162,8 +39,8 @@ func TestFinder(t *testing.T) {
 	defer finder.Close()

 	expected := map[uint32][]int64{
-		64881038: []int64{1, 27, 53, 79},
-		65536402: []int64{2, 28, 54, 80},
+		65143183: []int64{1, 27, 53, 79},
+		65798547: []int64{2, 28, 54, 80},
 	}
 	actual := make(map[uint32][]int64)