lib/model, lib/weakhash: Hash using adler32, add heuristic in puller

Adler32 is much faster, and the heuristic avoids the obvious cases where
weak hashing will not help.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/3872
This commit is contained in:
Audrius Butkevicius
2017-01-04 21:04:13 +00:00
committed by Jakob Borg
parent 920274bce4
commit 29d010ec0e
14 changed files with 461 additions and 190 deletions

View File

@@ -108,6 +108,7 @@ func TestDeviceConfig(t *testing.T) {
Versioning: VersioningConfiguration{
Params: map[string]string{},
},
WeakHashThresholdPct: 25,
},
}

View File

@@ -40,8 +40,8 @@ type FolderConfiguration struct {
DisableSparseFiles bool `xml:"disableSparseFiles" json:"disableSparseFiles"`
DisableTempIndexes bool `xml:"disableTempIndexes" json:"disableTempIndexes"`
Fsync bool `xml:"fsync" json:"fsync"`
DisableWeakHash bool `xml:"disableWeakHash" json:"disableWeakHash"`
Paused bool `xml:"paused" json:"paused"`
WeakHashThresholdPct int `xml:"weakHashThresholdPct" json:"weakHashThresholdPct"` // Use weak hash if more than X percent of the file has changed. Set to -1 to always use weak hash.
cachedPath string
@@ -146,6 +146,10 @@ func (f *FolderConfiguration) prepare() {
if f.Versioning.Params == nil {
f.Versioning.Params = make(map[string]string)
}
if f.WeakHashThresholdPct == 0 {
f.WeakHashThresholdPct = 25
}
}
func (f *FolderConfiguration) cleanedPath() string {

View File

@@ -47,6 +47,7 @@ type pullBlockState struct {
// copyBlocksState is the unit of work that handleFile sends on copyChan and
// that copierRoutine receives.
type copyBlocksState struct {
*sharedPullerState
// blocks is the list of blocks handed to the copier; copierRoutine collects
// the non-zero weak hashes of these blocks when it decides to use the weak
// hash finder.
blocks []protocol.BlockInfo
// have is the number of blocks that scanner.BlockDiff reported as already
// present in the current version of the file. copierRoutine uses it to
// compute the percentage of blocks that changed.
have int
}
// Which filemode bits to preserve
@@ -1003,7 +1004,9 @@ func (f *sendReceiveFolder) renameFile(source, target protocol.FileInfo) {
func (f *sendReceiveFolder) handleFile(file protocol.FileInfo, copyChan chan<- copyBlocksState, finisherChan chan<- *sharedPullerState) {
curFile, hasCurFile := f.model.CurrentFolderFile(f.folderID, file.Name)
if hasCurFile && len(curFile.Blocks) == len(file.Blocks) && scanner.BlocksEqual(curFile.Blocks, file.Blocks) {
have, need := scanner.BlockDiff(curFile.Blocks, file.Blocks)
if hasCurFile && len(need) == 0 {
// We are supposed to copy the entire file, and then fetch nothing. We
// are only updating metadata, so we don't actually *need* to make the
// copy.
@@ -1158,6 +1161,7 @@ func (f *sendReceiveFolder) handleFile(file protocol.FileInfo, copyChan chan<- c
cs := copyBlocksState{
sharedPullerState: &s,
blocks: blocks,
have: len(have),
}
copyChan <- cs
}
@@ -1216,7 +1220,12 @@ func (f *sendReceiveFolder) copierRoutine(in <-chan copyBlocksState, pullChan ch
f.model.fmut.RUnlock()
var weakHashFinder *weakhash.Finder
if !f.DisableWeakHash {
blocksPercentChanged := 0
if tot := len(state.file.Blocks); tot > 0 {
blocksPercentChanged = (tot - state.have) * 100 / tot
}
if blocksPercentChanged >= f.WeakHashThresholdPct {
hashesToFind := make([]uint32, 0, len(state.blocks))
for _, block := range state.blocks {
if block.WeakHash != 0 {

View File

@@ -322,7 +322,7 @@ func TestWeakHash(t *testing.T) {
go fo.copierRoutine(copyChan, pullChan, finisherChan)
// Test 1 - no weak hashing, file gets fully repulled (`expectBlocks` pulls).
fo.DisableWeakHash = true
fo.WeakHashThresholdPct = 101
fo.handleFile(desiredFile, copyChan, finisherChan)
var pulls []pullBlockState
@@ -350,7 +350,7 @@ func TestWeakHash(t *testing.T) {
}
// Test 2 - using weak hash, expectPulls blocks pulled.
fo.DisableWeakHash = false
fo.WeakHashThresholdPct = -1
fo.handleFile(desiredFile, copyChan, finisherChan)
pulls = pulls[:0]

View File

@@ -11,9 +11,9 @@ import (
"fmt"
"io"
"github.com/chmduquesne/rollinghash/adler32"
"github.com/syncthing/syncthing/lib/protocol"
"github.com/syncthing/syncthing/lib/sha256"
"github.com/syncthing/syncthing/lib/weakhash"
)
var SHA256OfNothing = []uint8{0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55}
@@ -26,7 +26,8 @@ type Counter interface {
func Blocks(r io.Reader, blocksize int, sizehint int64, counter Counter) ([]protocol.BlockInfo, error) {
hf := sha256.New()
hashLength := hf.Size()
whf := weakhash.NewHash(blocksize)
whf := adler32.New()
mhf := io.MultiWriter(hf, whf)
var blocks []protocol.BlockInfo
var hashes, thisHash []byte
@@ -46,7 +47,7 @@ func Blocks(r io.Reader, blocksize int, sizehint int64, counter Counter) ([]prot
var offset int64
for {
lr := io.LimitReader(r, int64(blocksize))
n, err := io.CopyBuffer(hf, io.TeeReader(lr, whf), buf)
n, err := io.CopyBuffer(mhf, lr, buf)
if err != nil {
return nil, err
}

View File

@@ -122,3 +122,25 @@ func TestDiff(t *testing.T) {
}
}
}
// TestDiffEmpty checks the lengths of the "have" and "need" lists returned
// by BlockDiff when one or both of the input block lists are nil.
func TestDiffEmpty(t *testing.T) {
	emptyCases := []struct {
		a    []protocol.BlockInfo
		b    []protocol.BlockInfo
		need int
		have int
	}{
		{nil, nil, 0, 0},
		{[]protocol.BlockInfo{{Offset: 3, Size: 1}}, nil, 0, 0},
		{nil, []protocol.BlockInfo{{Offset: 3, Size: 1}}, 1, 0},
	}
	for _, emptyCase := range emptyCases {
		h, n := BlockDiff(emptyCase.a, emptyCase.b)
		if len(h) != emptyCase.have {
			t.Errorf("incorrect have: %d != %d", len(h), emptyCase.have)
		}
		// Report the "need" values here; the message previously repeated
		// the "have" numbers, which made a failure misleading.
		if len(n) != emptyCase.need {
			t.Errorf("incorrect need: %d != %d", len(n), emptyCase.need)
		}
	}
}

View File

@@ -9,9 +9,12 @@ package weakhash
import (
"os"
"testing"
"github.com/chmduquesne/rollinghash/adler32"
)
const testFile = "../model/testdata/~syncthing~file.tmp"
const size = 128 << 10
func BenchmarkFind1MFile(b *testing.B) {
b.ReportAllocs()
@@ -21,10 +24,38 @@ func BenchmarkFind1MFile(b *testing.B) {
if err != nil {
b.Fatal(err)
}
_, err = Find(fd, []uint32{0, 1, 2}, 128<<10)
_, err = Find(fd, []uint32{0, 1, 2}, size)
if err != nil {
b.Fatal(err)
}
fd.Close()
}
}
func BenchmarkWeakHashAdler32(b *testing.B) {
data := make([]byte, size)
hf := adler32.New()
for i := 0; i < b.N; i++ {
hf.Write(data)
}
_ = hf.Sum32()
b.SetBytes(size)
}
// BenchmarkWeakHashAdler32Roll measures rolling the adler32 hash forward one
// byte at a time, size bytes per benchmark iteration.
func BenchmarkWeakHashAdler32Roll(b *testing.B) {
	data := make([]byte, size)
	hf := adler32.New()
	// Write one full block before the timed section so that the Roll calls
	// below operate on an established size-byte window.
	// NOTE(review): presumably the rollinghash library requires a Write
	// before Roll — confirm against its documentation.
	hf.Write(data)

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Roll exactly size bytes so the work done matches the byte count
		// reported through SetBytes (the loop previously ran size+1 times
		// and shadowed the outer loop variable).
		for j := 0; j < size; j++ {
			hf.Roll('a')
		}
	}

	b.SetBytes(size)
}

View File

@@ -8,22 +8,16 @@ package weakhash
import (
"bufio"
"hash"
"io"
"os"
"github.com/chmduquesne/rollinghash/adler32"
)
const (
Size = 4
)
// NewHash returns a hash.Hash32 backed by a digest whose internal buffer
// holds size bytes.
func NewHash(size int) hash.Hash32 {
	d := new(digest)
	d.size = size
	d.buf = make([]byte, size)
	return d
}
// Find finds all the blocks of the given size within io.Reader that matches
// the hashes provided, and returns a hash -> slice of offsets within reader
// map, that produces the same weak hash.
@@ -33,7 +27,7 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er
}
r := bufio.NewReader(ir)
hf := NewHash(size)
hf := adler32.New()
n, err := io.CopyN(hf, r, int64(size))
if err == io.EOF {
@@ -66,56 +60,11 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er
} else if err != nil {
return offsets, err
}
hf.Write([]byte{bt})
hf.Roll(bt)
}
return offsets, nil
}
// digest is a rolling weak checksum over the last size bytes written to it.
//
// Algorithm description: http://tutorials.jenkov.com/rsync/checksums.html
// Example implementations: https://gist.github.com/csabahenk/1096262/revisions
// Alternative that could be used is adler32 http://blog.liw.fi/posts/rsync-in-python/#comment-fee8d5e07794fdba3fe2d76aa2706a13
type digest struct {
	buf  []byte // the bytes currently inside the window, used as a ring
	size int    // window length in bytes
	a, b uint16 // the two 16-bit halves of the checksum
	j    int    // position in buf that the next written byte replaces
}

// Write pushes every byte of data through the window, dropping the
// contribution of the byte it overwrites. It always returns len(data) and a
// nil error.
func (d *digest) Write(data []byte) (int, error) {
	window := uint16(d.size)
	for i := range data {
		incoming := data[i]
		outgoing := uint16(d.buf[d.j])

		// All arithmetic is deliberately modulo 2^16.
		d.a = d.a - outgoing + uint16(incoming)
		d.b = d.b - window*outgoing + d.a

		d.buf[d.j] = incoming
		d.j = (d.j + 1) % d.size
	}
	return len(data), nil
}

// Reset zeroes the window contents and both checksum halves, and rewinds
// the write position.
func (d *digest) Reset() {
	for i := 0; i < len(d.buf); i++ {
		d.buf[i] = 0x0
	}
	d.a, d.b, d.j = 0, 0, 0
}

// Sum appends the current checksum to b as four bytes, most significant
// byte first, and returns the extended slice.
func (d *digest) Sum(b []byte) []byte {
	sum := d.Sum32()
	return append(b,
		byte(sum>>24),
		byte(sum>>16),
		byte(sum>>8),
		byte(sum),
	)
}

// Sum32 returns the checksum with b in the upper 16 bits and a in the lower
// 16 bits.
func (d *digest) Sum32() uint32 {
	return uint32(d.b)<<16 | uint32(d.a)
}

// Size returns the package-level Size constant.
func (digest) Size() int {
	return Size
}

// BlockSize returns 1.
func (digest) BlockSize() int {
	return 1
}
func NewFinder(path string, size int, hashesToFind []uint32) (*Finder, error) {
file, err := os.Open(path)
if err != nil {

View File

@@ -18,129 +18,6 @@ import (
)
// payload is the input fed byte by byte into the hash under test: the
// lowercase alphabet repeated four times.
var payload = []byte("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz")
// hashes holds the expected Sum32 values that TestHashCorrect compares
// against, one per window position as the hash advances a byte at a time
// through payload: hashes[i] is checked before payload[Size+i] is written.
var hashes = []uint32{
64225674,
64881038,
65536402,
66191766,
66847130,
67502494,
68157858,
68813222,
69468586,
70123950,
70779314,
71434678,
72090042,
72745406,
73400770,
74056134,
74711498,
75366862,
76022226,
76677590,
77332954,
77988318,
78643682,
77595084,
74842550,
70386080,
64225674,
64881038,
65536402,
66191766,
66847130,
67502494,
68157858,
68813222,
69468586,
70123950,
70779314,
71434678,
72090042,
72745406,
73400770,
74056134,
74711498,
75366862,
76022226,
76677590,
77332954,
77988318,
78643682,
77595084,
74842550,
70386080,
64225674,
64881038,
65536402,
66191766,
66847130,
67502494,
68157858,
68813222,
69468586,
70123950,
70779314,
71434678,
72090042,
72745406,
73400770,
74056134,
74711498,
75366862,
76022226,
76677590,
77332954,
77988318,
78643682,
77595084,
74842550,
70386080,
64225674,
64881038,
65536402,
66191766,
66847130,
67502494,
68157858,
68813222,
69468586,
70123950,
70779314,
71434678,
72090042,
72745406,
73400770,
74056134,
74711498,
75366862,
76022226,
76677590,
77332954,
77988318,
78643682,
71893365,
71893365,
}
// TestHashCorrect fills the hash with the first Size bytes of payload, then
// advances one byte at a time and compares Sum32 against the hashes table
// before each further byte is written.
//
// Tested using an alternative C implementation at https://gist.github.com/csabahenk/1096262
func TestHashCorrect(t *testing.T) {
	h := NewHash(Size)

	// Fill the initial window.
	h.Write(payload[:Size])

	// Slide across the rest of the payload, checking before each step.
	for i, next := range payload[Size:] {
		if got := h.Sum32(); got != hashes[i] {
			t.Errorf("mismatch at %d", i)
		}
		h.Write([]byte{next})
	}
}
func TestFinder(t *testing.T) {
f, err := ioutil.TempFile("", "")
@@ -154,7 +31,7 @@ func TestFinder(t *testing.T) {
t.Error(err)
}
hashes := []uint32{64881038, 65536402}
hashes := []uint32{65143183, 65798547}
finder, err := NewFinder(f.Name(), 4, hashes)
if err != nil {
t.Error(err)
@@ -162,8 +39,8 @@ func TestFinder(t *testing.T) {
defer finder.Close()
expected := map[uint32][]int64{
64881038: []int64{1, 27, 53, 79},
65536402: []int64{2, 28, 54, 80},
65143183: []int64{1, 27, 53, 79},
65798547: []int64{2, 28, 54, 80},
}
actual := make(map[uint32][]int64)