From 29d010ec0e7430089ed417581c31b2b503e57dd9 Mon Sep 17 00:00:00 2001
From: Audrius Butkevicius
Date: Wed, 4 Jan 2017 21:04:13 +0000
Subject: [PATCH] lib/model, lib/weakhash: Hash using adler32, add heuristic in puller

Adler32 is much faster, and the heuristic avoids the obvious cases where it
will not help.

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/3872
---
 lib/config/config_test.go                  |   1 +
 lib/config/folderconfiguration.go          |   6 +-
 lib/model/rwfolder.go                      |  13 +-
 lib/model/rwfolder_test.go                 |   4 +-
 lib/scanner/blocks.go                      |   7 +-
 lib/scanner/blocks_test.go                 |  22 +++
 lib/weakhash/benchmark_test.go             |  33 +++-
 lib/weakhash/weakhash.go                   |  59 +-------
 lib/weakhash/weakhash_test.go              | 129 +---------------
 .../rollinghash/adler32/adler32.go         |  97 ++++++++++++
 .../rollinghash/buzhash/buzhash.go         | 143 ++++++++++++++++++
 .../rollinghash/rabinkarp32/rabinkarp32.go |  89 +++++++++++
 .../chmduquesne/rollinghash/rollinghash.go |  40 +++++
 vendor/manifest                            |   8 +
 14 files changed, 461 insertions(+), 190 deletions(-)
 create mode 100644 vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go
 create mode 100644 vendor/github.com/chmduquesne/rollinghash/buzhash/buzhash.go
 create mode 100644 vendor/github.com/chmduquesne/rollinghash/rabinkarp32/rabinkarp32.go
 create mode 100644 vendor/github.com/chmduquesne/rollinghash/rollinghash.go

diff --git a/lib/config/config_test.go b/lib/config/config_test.go
index 2d3e094e..f286505a 100644
--- a/lib/config/config_test.go
+++ b/lib/config/config_test.go
@@ -108,6 +108,7 @@ func TestDeviceConfig(t *testing.T) {
 			Versioning: VersioningConfiguration{
 				Params: map[string]string{},
 			},
+			WeakHashThresholdPct: 25,
 		},
 	}
 
diff --git a/lib/config/folderconfiguration.go b/lib/config/folderconfiguration.go
index a4f0ffa6..093e0d82 100644
--- a/lib/config/folderconfiguration.go
+++ b/lib/config/folderconfiguration.go
@@ -40,8 +40,8 @@ type FolderConfiguration struct {
 	DisableSparseFiles   bool `xml:"disableSparseFiles" json:"disableSparseFiles"`
 	DisableTempIndexes   bool `xml:"disableTempIndexes" json:"disableTempIndexes"`
 	Fsync                bool `xml:"fsync" json:"fsync"`
-	DisableWeakHash      bool `xml:"disableWeakHash" json:"disableWeakHash"`
 	Paused               bool `xml:"paused" json:"paused"`
+	WeakHashThresholdPct int  `xml:"weakHashThresholdPct" json:"weakHashThresholdPct"` // Use weak hash if more than X percent of the file has changed. Set to -1 to always use weak hash.
cachedPath string @@ -146,6 +146,10 @@ func (f *FolderConfiguration) prepare() { if f.Versioning.Params == nil { f.Versioning.Params = make(map[string]string) } + + if f.WeakHashThresholdPct == 0 { + f.WeakHashThresholdPct = 25 + } } func (f *FolderConfiguration) cleanedPath() string { diff --git a/lib/model/rwfolder.go b/lib/model/rwfolder.go index fab982b9..93757127 100644 --- a/lib/model/rwfolder.go +++ b/lib/model/rwfolder.go @@ -47,6 +47,7 @@ type pullBlockState struct { type copyBlocksState struct { *sharedPullerState blocks []protocol.BlockInfo + have int } // Which filemode bits to preserve @@ -1003,7 +1004,9 @@ func (f *sendReceiveFolder) renameFile(source, target protocol.FileInfo) { func (f *sendReceiveFolder) handleFile(file protocol.FileInfo, copyChan chan<- copyBlocksState, finisherChan chan<- *sharedPullerState) { curFile, hasCurFile := f.model.CurrentFolderFile(f.folderID, file.Name) - if hasCurFile && len(curFile.Blocks) == len(file.Blocks) && scanner.BlocksEqual(curFile.Blocks, file.Blocks) { + have, need := scanner.BlockDiff(curFile.Blocks, file.Blocks) + + if hasCurFile && len(need) == 0 { // We are supposed to copy the entire file, and then fetch nothing. We // are only updating metadata, so we don't actually *need* to make the // copy. @@ -1158,6 +1161,7 @@ func (f *sendReceiveFolder) handleFile(file protocol.FileInfo, copyChan chan<- c cs := copyBlocksState{ sharedPullerState: &s, blocks: blocks, + have: len(have), } copyChan <- cs } @@ -1216,7 +1220,12 @@ func (f *sendReceiveFolder) copierRoutine(in <-chan copyBlocksState, pullChan ch f.model.fmut.RUnlock() var weakHashFinder *weakhash.Finder - if !f.DisableWeakHash { + blocksPercentChanged := 0 + if tot := len(state.file.Blocks); tot > 0 { + blocksPercentChanged = (tot - state.have) * 100 / tot + } + + if blocksPercentChanged >= f.WeakHashThresholdPct { hashesToFind := make([]uint32, 0, len(state.blocks)) for _, block := range state.blocks { if block.WeakHash != 0 { diff --git a/lib/model/rwfolder_test.go b/lib/model/rwfolder_test.go index f2890eb3..c40a1bc9 100644 --- a/lib/model/rwfolder_test.go +++ b/lib/model/rwfolder_test.go @@ -322,7 +322,7 @@ func TestWeakHash(t *testing.T) { go fo.copierRoutine(copyChan, pullChan, finisherChan) // Test 1 - no weak hashing, file gets fully repulled (`expectBlocks` pulls). - fo.DisableWeakHash = true + fo.WeakHashThresholdPct = 101 fo.handleFile(desiredFile, copyChan, finisherChan) var pulls []pullBlockState @@ -350,7 +350,7 @@ func TestWeakHash(t *testing.T) { } // Test 2 - using weak hash, expectPulls blocks pulled. 
-	fo.DisableWeakHash = false
+	fo.WeakHashThresholdPct = -1
 	fo.handleFile(desiredFile, copyChan, finisherChan)
 
 	pulls = pulls[:0]
diff --git a/lib/scanner/blocks.go b/lib/scanner/blocks.go
index 89a5da63..ec6d946c 100644
--- a/lib/scanner/blocks.go
+++ b/lib/scanner/blocks.go
@@ -11,9 +11,9 @@ import (
 	"fmt"
 	"io"
 
+	"github.com/chmduquesne/rollinghash/adler32"
 	"github.com/syncthing/syncthing/lib/protocol"
 	"github.com/syncthing/syncthing/lib/sha256"
-	"github.com/syncthing/syncthing/lib/weakhash"
 )
 
 var SHA256OfNothing = []uint8{0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55}
@@ -26,7 +26,8 @@ type Counter interface {
 func Blocks(r io.Reader, blocksize int, sizehint int64, counter Counter) ([]protocol.BlockInfo, error) {
 	hf := sha256.New()
 	hashLength := hf.Size()
-	whf := weakhash.NewHash(blocksize)
+	whf := adler32.New()
+	mhf := io.MultiWriter(hf, whf)
 
 	var blocks []protocol.BlockInfo
 	var hashes, thisHash []byte
@@ -46,7 +47,7 @@ func Blocks(r io.Reader, blocksize int, sizehint int64, counter Counter) ([]prot
 	var offset int64
 	for {
 		lr := io.LimitReader(r, int64(blocksize))
-		n, err := io.CopyBuffer(hf, io.TeeReader(lr, whf), buf)
+		n, err := io.CopyBuffer(mhf, lr, buf)
 		if err != nil {
 			return nil, err
 		}
diff --git a/lib/scanner/blocks_test.go b/lib/scanner/blocks_test.go
index 0bc208a1..533757e1 100644
--- a/lib/scanner/blocks_test.go
+++ b/lib/scanner/blocks_test.go
@@ -122,3 +122,25 @@ func TestDiff(t *testing.T) {
 		}
 	}
 }
+
+func TestDiffEmpty(t *testing.T) {
+	emptyCases := []struct {
+		a    []protocol.BlockInfo
+		b    []protocol.BlockInfo
+		need int
+		have int
+	}{
+		{nil, nil, 0, 0},
+		{[]protocol.BlockInfo{{Offset: 3, Size: 1}}, nil, 0, 0},
+		{nil, []protocol.BlockInfo{{Offset: 3, Size: 1}}, 1, 0},
+	}
+	for _, emptyCase := range emptyCases {
+		h, n := BlockDiff(emptyCase.a, emptyCase.b)
+		if len(h) != emptyCase.have {
+			t.Errorf("incorrect have: %d != %d", len(h), emptyCase.have)
+		}
+		if len(n) != emptyCase.need {
+			t.Errorf("incorrect need: %d != %d", len(n), emptyCase.need)
+		}
+	}
+}
diff --git a/lib/weakhash/benchmark_test.go b/lib/weakhash/benchmark_test.go
index 79f5ce9d..f5e04053 100644
--- a/lib/weakhash/benchmark_test.go
+++ b/lib/weakhash/benchmark_test.go
@@ -9,9 +9,12 @@ package weakhash
 import (
 	"os"
 	"testing"
+
+	"github.com/chmduquesne/rollinghash/adler32"
 )
 
 const testFile = "../model/testdata/~syncthing~file.tmp"
+const size = 128 << 10
 
 func BenchmarkFind1MFile(b *testing.B) {
 	b.ReportAllocs()
@@ -21,10 +24,38 @@ func BenchmarkFind1MFile(b *testing.B) {
 		if err != nil {
 			b.Fatal(err)
 		}
-		_, err = Find(fd, []uint32{0, 1, 2}, 128<<10)
+		_, err = Find(fd, []uint32{0, 1, 2}, size)
 		if err != nil {
 			b.Fatal(err)
 		}
 		fd.Close()
 	}
 }
+
+func BenchmarkWeakHashAdler32(b *testing.B) {
+	data := make([]byte, size)
+	hf := adler32.New()
+
+	for i := 0; i < b.N; i++ {
+		hf.Write(data)
+	}
+
+	_ = hf.Sum32()
+	b.SetBytes(size)
+}
+
+func BenchmarkWeakHashAdler32Roll(b *testing.B) {
+	data := make([]byte, size)
+	hf := adler32.New()
+	hf.Write(data)
+
+	b.ResetTimer()
+
+	for i := 0; i < b.N; i++ {
+		for i := 0; i <= size; i++ {
+			hf.Roll('a')
+		}
+	}
+
+	b.SetBytes(size)
+}
diff --git a/lib/weakhash/weakhash.go b/lib/weakhash/weakhash.go
index 5c210a52..1d3c0eb1 100644
--- a/lib/weakhash/weakhash.go
+++ b/lib/weakhash/weakhash.go
@@ -8,22 +8,16 @@ package weakhash
 
 import (
 	"bufio"
-	"hash"
 	"io"
 	"os"
+
+	"github.com/chmduquesne/rollinghash/adler32"
 )
 
 const
( Size = 4 ) -func NewHash(size int) hash.Hash32 { - return &digest{ - buf: make([]byte, size), - size: size, - } -} - // Find finds all the blocks of the given size within io.Reader that matches // the hashes provided, and returns a hash -> slice of offsets within reader // map, that produces the same weak hash. @@ -33,7 +27,7 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er } r := bufio.NewReader(ir) - hf := NewHash(size) + hf := adler32.New() n, err := io.CopyN(hf, r, int64(size)) if err == io.EOF { @@ -66,56 +60,11 @@ func Find(ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, er } else if err != nil { return offsets, err } - hf.Write([]byte{bt}) + hf.Roll(bt) } return offsets, nil } -// Using this: http://tutorials.jenkov.com/rsync/checksums.html -// Example implementations: https://gist.github.com/csabahenk/1096262/revisions -// Alternative that could be used is adler32 http://blog.liw.fi/posts/rsync-in-python/#comment-fee8d5e07794fdba3fe2d76aa2706a13 -type digest struct { - buf []byte - size int - a uint16 - b uint16 - j int -} - -func (d *digest) Write(data []byte) (int, error) { - for _, c := range data { - // TODO: Use this in Go 1.6 - // d.a = d.a - uint16(d.buf[d.j]) + uint16(c) - // d.b = d.b - uint16(d.size)*uint16(d.buf[d.j]) + d.a - d.a -= uint16(d.buf[d.j]) - d.a += uint16(c) - d.b -= uint16(d.size) * uint16(d.buf[d.j]) - d.b += d.a - - d.buf[d.j] = c - d.j = (d.j + 1) % d.size - } - return len(data), nil -} - -func (d *digest) Reset() { - for i := range d.buf { - d.buf[i] = 0x0 - } - d.a = 0 - d.b = 0 - d.j = 0 -} - -func (d *digest) Sum(b []byte) []byte { - r := d.Sum32() - return append(b, byte(r>>24), byte(r>>16), byte(r>>8), byte(r)) -} - -func (d *digest) Sum32() uint32 { return uint32(d.a) | (uint32(d.b) << 16) } -func (digest) Size() int { return Size } -func (digest) BlockSize() int { return 1 } - func NewFinder(path string, size int, hashesToFind []uint32) (*Finder, error) { file, err := os.Open(path) if err != nil { diff --git a/lib/weakhash/weakhash_test.go b/lib/weakhash/weakhash_test.go index 525d60dc..8df4e3d6 100644 --- a/lib/weakhash/weakhash_test.go +++ b/lib/weakhash/weakhash_test.go @@ -18,129 +18,6 @@ import ( ) var payload = []byte("abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz") -var hashes = []uint32{ - 64225674, - 64881038, - 65536402, - 66191766, - 66847130, - 67502494, - 68157858, - 68813222, - 69468586, - 70123950, - 70779314, - 71434678, - 72090042, - 72745406, - 73400770, - 74056134, - 74711498, - 75366862, - 76022226, - 76677590, - 77332954, - 77988318, - 78643682, - 77595084, - 74842550, - 70386080, - 64225674, - 64881038, - 65536402, - 66191766, - 66847130, - 67502494, - 68157858, - 68813222, - 69468586, - 70123950, - 70779314, - 71434678, - 72090042, - 72745406, - 73400770, - 74056134, - 74711498, - 75366862, - 76022226, - 76677590, - 77332954, - 77988318, - 78643682, - 77595084, - 74842550, - 70386080, - 64225674, - 64881038, - 65536402, - 66191766, - 66847130, - 67502494, - 68157858, - 68813222, - 69468586, - 70123950, - 70779314, - 71434678, - 72090042, - 72745406, - 73400770, - 74056134, - 74711498, - 75366862, - 76022226, - 76677590, - 77332954, - 77988318, - 78643682, - 77595084, - 74842550, - 70386080, - 64225674, - 64881038, - 65536402, - 66191766, - 66847130, - 67502494, - 68157858, - 68813222, - 69468586, - 70123950, - 70779314, - 71434678, - 72090042, - 72745406, - 73400770, - 74056134, - 74711498, - 75366862, - 
76022226, - 76677590, - 77332954, - 77988318, - 78643682, - 71893365, - 71893365, -} - -// Tested using an alternative C implementation at https://gist.github.com/csabahenk/1096262 -func TestHashCorrect(t *testing.T) { - h := NewHash(Size) - pos := 0 - for pos < Size { - h.Write([]byte{payload[pos]}) - pos++ - } - - for i := 0; pos < len(payload); i++ { - if h.Sum32() != hashes[i] { - t.Errorf("mismatch at %d", i) - } - h.Write([]byte{payload[pos]}) - pos++ - } -} func TestFinder(t *testing.T) { f, err := ioutil.TempFile("", "") @@ -154,7 +31,7 @@ func TestFinder(t *testing.T) { t.Error(err) } - hashes := []uint32{64881038, 65536402} + hashes := []uint32{65143183, 65798547} finder, err := NewFinder(f.Name(), 4, hashes) if err != nil { t.Error(err) @@ -162,8 +39,8 @@ func TestFinder(t *testing.T) { defer finder.Close() expected := map[uint32][]int64{ - 64881038: []int64{1, 27, 53, 79}, - 65536402: []int64{2, 28, 54, 80}, + 65143183: []int64{1, 27, 53, 79}, + 65798547: []int64{2, 28, 54, 80}, } actual := make(map[uint32][]int64) diff --git a/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go b/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go new file mode 100644 index 00000000..a05df02d --- /dev/null +++ b/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go @@ -0,0 +1,97 @@ +// Package rollinghash/adler32 implements a rolling version of hash/adler32 + +package adler32 + +import ( + vanilla "hash/adler32" + + "github.com/chmduquesne/rollinghash" +) + +const ( + mod = 65521 +) + +const Size = 4 + +type digest struct { + a, b uint32 + + // window is treated like a circular buffer, where the oldest element + // is indicated by d.oldest + window []byte + oldest int + n uint32 +} + +// Reset resets the Hash to its initial state. +func (d *digest) Reset() { + d.a = 1 + d.b = 0 + d.window = nil + d.oldest = 0 +} + +// New returns a new rollinghash.Hash32 computing the rolling Adler-32 +// checksum. The window is copied from the last Write(). This window is +// only used to determine which is the oldest element (leaving the +// window). The calls to Roll() do not recompute the whole checksum. +func New() rollinghash.Hash32 { + return &digest{a: 1, b: 0, window: nil, oldest: 0} +} + +// Size returns the number of bytes Sum will return. +func (d *digest) Size() int { return Size } + +// BlockSize returns the hash's underlying block size. +// The Write method must be able to accept any amount +// of data, but it may operate more efficiently if all +// writes are a multiple of the block size. +func (d *digest) BlockSize() int { return 1 } + +// Write (via the embedded io.Writer interface) adds more data to the +// running hash. It never returns an error. +func (d *digest) Write(p []byte) (int, error) { + // Copy the window + d.window = make([]byte, len(p)) + copy(d.window, p) + + // Piggy-back on the core implementation + h := vanilla.New() + h.Write(p) + s := h.Sum32() + d.a, d.b = s&0xffff, s>>16 + d.n = uint32(len(p)) % mod + return len(d.window), nil +} + +func (d *digest) Sum32() uint32 { + return d.b<<16 | d.a +} + +func (d *digest) Sum(b []byte) []byte { + v := d.Sum32() + return append(b, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) +} + +// Roll updates the checksum of the window from the leaving byte and the +// entering byte. 
See +// http://stackoverflow.com/questions/40985080/why-does-my-rolling-adler32-checksum-not-work-in-go-modulo-arithmetic +func (d *digest) Roll(b byte) { + if len(d.window) == 0 { + d.window = make([]byte, 1) + d.window[0] = b + } + // extract the entering/leaving bytes and update the circular buffer. + enter := uint32(b) + leave := uint32(d.window[d.oldest]) + d.window[d.oldest] = b + d.oldest += 1 + if d.oldest >= len(d.window) { + d.oldest = 0 + } + + // compute + d.a = (d.a + mod + enter - leave) % mod + d.b = (d.b + (d.n*leave/mod+1)*mod + d.a - (d.n * leave) - 1) % mod +} diff --git a/vendor/github.com/chmduquesne/rollinghash/buzhash/buzhash.go b/vendor/github.com/chmduquesne/rollinghash/buzhash/buzhash.go new file mode 100644 index 00000000..64d2a4df --- /dev/null +++ b/vendor/github.com/chmduquesne/rollinghash/buzhash/buzhash.go @@ -0,0 +1,143 @@ +// Package rollinghash/buzhash implements buzhash as described by +// https://en.wikipedia.org/wiki/Rolling_hash#Cyclic_polynomial + +package buzhash + +import rollinghash "github.com/chmduquesne/rollinghash" + +// 256 random integers generated with a dummy python script +var bytehash = [256]uint32{ + 0xa5659a00, 0x2dbfda02, 0xac29a407, 0xce942c08, 0x48513609, + 0x325f158, 0xb54e5e13, 0xa9063618, 0xa5793419, 0x554b081a, + 0xe5643dac, 0xfb50e41c, 0x2b31661d, 0x335da61f, 0xe702f7b0, + 0xe31c1424, 0x6dfed825, 0xd30cf628, 0xba626a2a, 0x74b9c22b, + 0xa5d1942d, 0xf364ae2f, 0x70d2e84c, 0x190ad208, 0x92e3b740, + 0xd7e9f435, 0x15763836, 0x930ecab4, 0x641ea65e, 0xc0b2eb0a, + 0x2675e03e, 0x1a24c63f, 0xeddbcbb7, 0x3ea42bb2, 0x815f5849, + 0xa55c284b, 0xbb30964c, 0x6f7acc4e, 0x74538a50, 0x66df9652, + 0x2bae8454, 0xfe9d8055, 0x8c866fd4, 0x82f0a63d, 0x8f26365e, + 0xe66c3460, 0x6423266, 0x60696abc, 0xf75de6d, 0xd20c86e, + 0x69f8c6f, 0x8ac0f470, 0x273aab68, 0x4e044c74, 0xb2ec7875, + 0xf642d676, 0xd719e877, 0xee557e78, 0xdd20be7a, 0xd252707e, + 0xfa507a7f, 0xee537683, 0x6aac7684, 0x340e3485, 0x1c291288, + 0xab89c8c, 0xbe6e6c8d, 0xf99cf2f7, 0x69c65890, 0xd3757491, + 0xfeb63895, 0x67067a96, 0xa0089b19, 0x6c449898, 0x4eca749a, + 0x1101229b, 0x6b86d29d, 0x9c21be9e, 0xc5904933, 0xe1e820a3, + 0x6bd524a6, 0xd4695ea7, 0xc3d007e0, 0xbed8e4a9, 0x1c49d8af, + 0xedbae4b1, 0x1d2af6b4, 0x79526b9, 0xbc1d5abb, 0x6a2eb8bc, + 0x611b3695, 0x745c3cc4, 0x81005276, 0x5f442c8, 0x42dc30ca, + 0x55e460cb, 0x47648cc, 0x20da7122, 0xc4eedccd, 0xc21c14d0, + 0x27b5dfa9, 0x7e961fce, 0x8d0296d6, 0xce3684d7, 0x28e96da, + 0xedf7dcdc, 0x6817a0df, 0x51caae0, 0x8f226e1, 0xa1a00ce3, + 0xf811c6e5, 0x13e96ee6, 0xd4d4e4d1, 0xab160ee9, 0xb2cf06ea, + 0xf4ab6eb, 0x998f56f1, 0x16974cf2, 0xd42438f5, 0xe00ba6f7, + 0xbf01b8f8, 0x7a8a00f9, 0xdded6a7f, 0xb0ce58fd, 0xe5d81901, + 0xcc823b03, 0xc962e704, 0x2b4aff05, 0x5bcb7181, 0xe7207108, + 0xf3c93109, 0x1ffb650a, 0x37a31ad7, 0xfe27322d, 0x15b16d11, + 0x51a70512, 0xb579d92e, 0x53658284, 0x91fedb1b, 0x2ef0b122, + 0x93966523, 0xfa66af26, 0xa7fac32b, 0x7a81692c, 0x4f8d7f2e, + 0xf9875730, 0xa5ab2331, 0x79db8333, 0x8be32937, 0xf900af39, + 0xd09d4f3a, 0x9b22053d, 0xd2053e1c, 0xd0deaa35, 0x4a975740, + 0xcb3706e0, 0x40aea6cd, 0x769fdd44, 0x7e3e4947, 0xc20ac949, + 0x3788c34b, 0x9b23f74c, 0xb33e441d, 0x705d8a8d, 0x6a5e3a84, + 0xb4f955e3, 0xf681a155, 0x7dec1b56, 0x7bf5df58, 0xd3fa255a, + 0x3797c15c, 0xbf511562, 0xb048d65, 0xcd04f367, 0xae3a8368, + 0x769c856d, 0xc7bb9d6f, 0xe43e1f71, 0xa24de03e, 0x7f8cb376, + 0x618b778, 0x19e02f33, 0x2f810eea, 0x2b1ce595, 0x4f2f7180, + 0x72903140, 0x26a44584, 0x6af97e96, 0xb08acb86, 0x4d25cd41, + 0x1d74fd89, 0xe0f5b277, 0xbad158c, 
0x5fed3b8d, 0x68b26794,
+	0xcbe58795, 0xc1180797, 0xa1352399, 0x71dacd9c, 0x42b5549a,
+	0xbf5371a0, 0x7ed41fa1, 0x6fe29a3, 0xa779fba5, 0x48a095a7,
+	0xc2cad5a8, 0x7d7f15a9, 0xccd195aa, 0x2a9047ac, 0x3ec66ef2,
+	0x252743ae, 0xdd8827af, 0x85fc5055, 0xb9d5c7b2, 0x5a224fb4,
+	0xec26e7b6, 0xe4d8f7b7, 0x6e5aa58d, 0xeff753b9, 0x6c391fbb,
+	0x989f65bc, 0x2fe4a7c1, 0x9d1d9bc3, 0xa09aadc6, 0x2df33fc8,
+	0x5ec27933, 0x5e7f41cb, 0xb920f7cd, 0xc1a603ce, 0xf0888fcf,
+	0xdc4ad1d1, 0x34b3dbd4, 0x170981d5, 0x22e5b5d6, 0x13049bd7,
+	0xf12a8b95, 0xff7e87d9, 0xabb74b84, 0x215cff4f, 0xaf24f7dc,
+	0xc87461d, 0x41a55e0, 0xfde9b9e1, 0x1d1956fb, 0x13d60de4,
+	0x435f93e5, 0xe0ab5de6, 0x5c1d3fe7, 0x411a1fe8, 0x55e102a9,
+	0x3d9b07eb, 0xdd6b8dee, 0x741293f3, 0xa5b10ca9, 0x5abad5fd,
+	0x22372f55,
+}
+
+// The size of the checksum.
+const Size = 4
+
+// digest represents the partial evaluation of a checksum.
+type digest struct {
+	sum               uint32
+	nRotate           uint
+	nRotateComplement uint // redundant, but pre-computed to spare an operation
+
+	// window is treated like a circular buffer, where the oldest element
+	// is indicated by d.oldest
+	window []byte
+	oldest int
+}
+
+// Reset resets the Hash to its initial state.
+func (d *digest) Reset() {
+	d.window = nil
+	d.oldest = 0
+	d.sum = 0
+}
+
+func New() rollinghash.Hash32 {
+	return &digest{sum: 0, window: nil, oldest: 0}
+}
+
+// Size returns the number of bytes Sum will return.
+func (d *digest) Size() int { return Size }
+
+// BlockSize returns the hash's underlying block size.
+// The Write method must be able to accept any amount
+// of data, but it may operate more efficiently if all
+// writes are a multiple of the block size.
+func (d *digest) BlockSize() int { return 1 }
+
+// Write (via the embedded io.Writer interface) adds more data to the
+// running hash. It never returns an error.
+func (d *digest) Write(data []byte) (int, error) {
+	// Copy the window
+	d.window = make([]byte, len(data))
+	copy(d.window, data)
+
+	for _, c := range d.window {
+		d.sum = d.sum<<1 | d.sum>>31
+		d.sum ^= bytehash[int(c)]
+	}
+	d.nRotate = uint(len(d.window)) % 32
+	d.nRotateComplement = 32 - d.nRotate
+	return len(d.window), nil
+}
+
+func (d *digest) Sum32() uint32 {
+	return d.sum
+}
+
+func (d *digest) Sum(b []byte) []byte {
+	v := d.Sum32()
+	return append(b, byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+}
+
+// Roll updates the checksum of the window from the leaving byte and the
+// entering byte.
+func (d *digest) Roll(c byte) {
+	if len(d.window) == 0 {
+		d.window = make([]byte, 1)
+		d.window[0] = c
+	}
+	// extract the entering/leaving bytes and update the circular buffer.
+	hn := bytehash[int(c)]
+	h0 := bytehash[int(d.window[d.oldest])]
+
+	d.window[d.oldest] = c
+	l := len(d.window)
+	d.oldest += 1
+	if d.oldest >= l {
+		d.oldest = 0
+	}
+
+	d.sum = (d.sum<<1 | d.sum>>31) ^ (h0<<d.nRotate | h0>>d.nRotateComplement) ^ hn
+}
diff --git a/vendor/github.com/chmduquesne/rollinghash/rabinkarp32/rabinkarp32.go b/vendor/github.com/chmduquesne/rollinghash/rabinkarp32/rabinkarp32.go
new file mode 100644
index 00000000..80d00adf
--- /dev/null
+++ b/vendor/github.com/chmduquesne/rollinghash/rabinkarp32/rabinkarp32.go
@@ -0,0 +1,89 @@
+// Package rollinghash/rabinkarp32 implements a particular case of
+// rabin-karp where the modulus is 0xffffffff (32 bits of '1')
+
+package rabinkarp32
+
+import rollinghash "github.com/chmduquesne/rollinghash"
+
+// The size of a rabinkarp32 checksum.
+const Size = 4
+
+// digest represents the partial evaluation of a checksum.
+type digest struct { + a uint32 + h uint32 + aPowerN uint32 + + // window is treated like a circular buffer, where the oldest element + // is indicated by d.oldest + window []byte + oldest int +} + +// Reset resets the Hash to its initial state. +func (d *digest) Reset() { + d.h = 0 + d.aPowerN = 1 + d.window = nil + d.oldest = 0 +} + +func NewFromInt(a uint32) rollinghash.Hash32 { + return &digest{a: a, h: 0, aPowerN: 1, window: nil, oldest: 0} +} + +func New() rollinghash.Hash32 { + return NewFromInt(65521) // largest prime fitting in 16 bits +} + +// Size returns the number of bytes Sum will return. +func (d *digest) Size() int { return Size } + +// BlockSize returns the hash's underlying block size. +// The Write method must be able to accept any amount +// of data, but it may operate more efficiently if all +// writes are a multiple of the block size. +func (d *digest) BlockSize() int { return 1 } + +// Write (via the embedded io.Writer interface) adds more data to the +// running hash. It never returns an error. +func (d *digest) Write(data []byte) (int, error) { + // Copy the window + d.window = make([]byte, len(data)) + copy(d.window, data) + for _, c := range d.window { + d.h *= d.a + d.h += uint32(c) + d.aPowerN *= d.a + } + return len(d.window), nil +} + +func (d *digest) Sum32() uint32 { + return d.h +} + +func (d *digest) Sum(b []byte) []byte { + v := d.Sum32() + return append(b, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) +} + +// Roll updates the checksum of the window from the leaving byte and the +// entering byte. +func (d *digest) Roll(c byte) { + if len(d.window) == 0 { + d.window = make([]byte, 1) + d.window[0] = c + } + // extract the entering/leaving bytes and update the circular buffer. + enter := uint32(c) + leave := uint32(d.window[d.oldest]) + d.window[d.oldest] = c + l := len(d.window) + d.oldest += 1 + if d.oldest >= l { + d.oldest = 0 + } + + d.h = d.h*d.a + enter - leave*d.aPowerN +} diff --git a/vendor/github.com/chmduquesne/rollinghash/rollinghash.go b/vendor/github.com/chmduquesne/rollinghash/rollinghash.go new file mode 100644 index 00000000..f297dcd0 --- /dev/null +++ b/vendor/github.com/chmduquesne/rollinghash/rollinghash.go @@ -0,0 +1,40 @@ +/* + +Package rollinghash implements rolling versions of some hashes + +*/ +package rollinghash + +import "hash" + +type Roller interface { + // Roll updates the hash of a rolling window from the entering byte. + // A copy of the window is internally kept from the last Write(). + // This copy is updated along with the internal state of the checksum + // in order to determine the new hash very quickly. + Roll(b byte) +} + +// rollinghash.Hash extends hash.Hash by adding the method Roll. A +// rollinghash.Hash can be updated byte by byte, by specifying which byte +// enters the window. +type Hash interface { + hash.Hash + Roller +} + +// rollinghash.Hash32 extends hash.Hash by adding the method Roll. A +// rollinghash.Hash32 can be updated byte by byte, by specifying which +// byte enters the window. +type Hash32 interface { + hash.Hash32 + Roller +} + +// rollinghash.Hash64 extends hash.Hash by adding the method Roll. A +// rollinghash.Hash64 can be updated byte by byte, by specifying which +// byte enters the window. 
+type Hash64 interface { + hash.Hash64 + Roller +} diff --git a/vendor/manifest b/vendor/manifest index 19071367..e9947b4e 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -44,6 +44,14 @@ "revision": "f9b9f8f7aa27725f5cabb699bd9099ca7ce09143", "branch": "master" }, + { + "importpath": "github.com/chmduquesne/rollinghash", + "repository": "https://github.com/chmduquesne/rollinghash", + "vcs": "git", + "revision": "88b86a92826991b14d01fb43456909fcb8a76b8b", + "branch": "master", + "notests": true + }, { "importpath": "github.com/cznic/b", "repository": "https://github.com/cznic/b",
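
The sketches below are illustrative and not part of the patch itself.

First, the puller heuristic added in copierRoutine, restated as a standalone helper. The function name and program are hypothetical; the arithmetic mirrors the diff: weak hashing is only attempted once the share of changed blocks reaches WeakHashThresholdPct, which prepare() defaults to 25, may be set to -1 to always use weak hashing, or to anything above 100 (the updated test uses 101) to never use it.

package main

import "fmt"

// useWeakHash reports whether weak hashing should be attempted for a file
// with totalBlocks blocks, of which haveBlocks are already present locally.
func useWeakHash(totalBlocks, haveBlocks, thresholdPct int) bool {
	blocksPercentChanged := 0
	if totalBlocks > 0 {
		blocksPercentChanged = (totalBlocks - haveBlocks) * 100 / totalBlocks
	}
	return blocksPercentChanged >= thresholdPct
}

func main() {
	// Default threshold 25: 20% changed skips weak hashing, 40% uses it.
	fmt.Println(useWeakHash(100, 80, 25))  // false
	fmt.Println(useWeakHash(100, 60, 25))  // true
	// -1 always enables it; 101 (as in the updated test) always disables it.
	fmt.Println(useWeakHash(100, 100, -1)) // true
	fmt.Println(useWeakHash(100, 0, 101))  // false
}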
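
Next, the single-pass hashing change in lib/scanner.Blocks: both hashes are now fed through io.MultiWriter rather than teeing the reader into the weak hash. This sketch substitutes the standard crypto/sha256 for Syncthing's lib/sha256 wrapper and uses made-up names; only the MultiWriter pattern is taken from the diff.

package main

import (
	"bytes"
	"crypto/sha256" // stand-in for syncthing's lib/sha256 wrapper
	"fmt"
	"io"

	"github.com/chmduquesne/rollinghash/adler32"
)

func main() {
	block := bytes.Repeat([]byte{'x'}, 128<<10)

	strong := sha256.New()
	weak := adler32.New()
	mhf := io.MultiWriter(strong, weak) // one read feeds both hashes

	if _, err := io.Copy(mhf, bytes.NewReader(block)); err != nil {
		panic(err)
	}
	fmt.Printf("strong: %x\n", strong.Sum(nil))
	fmt.Printf("weak:   %d\n", weak.Sum32())
}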
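
Last, how the vendored rolling Adler-32 is used by lib/weakhash.Find: prime the window once with Write, then advance one byte at a time with Roll, so each subsequent window costs a few arithmetic operations instead of a full re-hash. The adler32 package path and its New/Write/Roll/Sum32 methods come from the vendored code above; the standalone program and variable names are assumptions for demonstration only.

package main

import (
	"fmt"

	"github.com/chmduquesne/rollinghash/adler32"
)

func main() {
	data := []byte("abcdefghijklmnopqrstuvwxyz")
	const window = 4

	hf := adler32.New()
	// Prime the rolling window once, the way Find() does via io.CopyN.
	hf.Write(data[:window])
	fmt.Printf("offset 0: %d\n", hf.Sum32())

	// Slide the window one byte at a time; only the entering byte is needed,
	// the hash remembers the window and drops the oldest byte itself.
	for i := window; i < len(data); i++ {
		hf.Roll(data[i])
		fmt.Printf("offset %d: %d\n", i-window+1, hf.Sum32())
	}
}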