diff --git a/lib/scanner/blocks_test.go b/lib/scanner/blocks_test.go index 311dbdd7..99f1a716 100644 --- a/lib/scanner/blocks_test.go +++ b/lib/scanner/blocks_test.go @@ -144,14 +144,13 @@ func TestAdler32Variants(t *testing.T) { windowSize := 128 - hf2.Reset() - hf3 := rollingAdler32.New() hf3.Write(data[:windowSize]) for i := windowSize; i < len(data); i++ { if i%windowSize == 0 { // let the reference function catch up + hf2.Reset() hf2.Write(data[i-windowSize : i]) // verify that they are in sync with the rolling function diff --git a/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go b/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go index f42bdfc6..b33cbd20 100644 --- a/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go +++ b/vendor/github.com/chmduquesne/rollinghash/adler32/adler32.go @@ -18,23 +18,24 @@ const ( // It implements the adler32 algorithm https://en.wikipedia.org/wiki/Adler-32 type Adler32 struct { a, b uint32 + n uint32 // window is treated like a circular buffer, where the oldest element // is indicated by d.oldest window []byte oldest int - n uint32 vanilla hash.Hash32 } // Reset resets the digest to its initial state. func (d *Adler32) Reset() { - d.window = d.window[:1] // Reset the size but don't reallocate - d.window[0] = 0 + d.window = d.window[:0] // Reset the size but don't reallocate + d.oldest = 0 d.a = 1 d.b = 0 - d.oldest = 0 + d.n = 0 + d.vanilla.Reset() } // New returns a new Adler32 digest @@ -42,7 +43,8 @@ func New() *Adler32 { return &Adler32{ a: 1, b: 0, - window: make([]byte, 1, rollinghash.DefaultWindowCap), + n: 0, + window: make([]byte, 0, rollinghash.DefaultWindowCap), oldest: 0, vanilla: vanilla.New(), } @@ -54,30 +56,30 @@ func (d *Adler32) Size() int { return Size } // BlockSize is 1 byte func (d *Adler32) BlockSize() int { return 1 } -// Write (re)initializes the rolling window with the input byte slice and -// adds its data to the digest. -func (d *Adler32) Write(p []byte) (int, error) { - // Copy the window, avoiding allocations where possible - l := len(p) +// Write appends data to the rolling window and updates the digest. +func (d *Adler32) Write(data []byte) (int, error) { + l := len(data) if l == 0 { - l = 1 + return 0, nil } - if len(d.window) != l { - if cap(d.window) >= l { - d.window = d.window[:l] - } else { - d.window = make([]byte, len(p)) - } + // Re-arrange the window so that the leftmost element is at index 0 + n := len(d.window) + if d.oldest != 0 { + tmp := make([]byte, d.oldest) + copy(tmp, d.window[:d.oldest]) + copy(d.window, d.window[d.oldest:]) + copy(d.window[n-d.oldest:], tmp) + d.oldest = 0 } - copy(d.window, p) + d.window = append(d.window, data...) // Piggy-back on the core implementation d.vanilla.Reset() - d.vanilla.Write(p) + d.vanilla.Write(d.window) s := d.vanilla.Sum32() d.a, d.b = s&0xffff, s>>16 - d.n = uint32(len(p)) % Mod - return len(d.window), nil + d.n = uint32(len(d.window)) % Mod + return len(data), nil } // Sum32 returns the hash as a uint32 @@ -94,6 +96,12 @@ func (d *Adler32) Sum(b []byte) []byte { // Roll updates the checksum of the window from the entering byte. You // MUST initialize a window with Write() before calling this method. func (d *Adler32) Roll(b byte) { + // This check costs 10-15% performance. If we disable it, we crash + // when the window is empty. If we enable it, we are always correct + // (an empty window never changes no matter how much you roll it). + //if len(d.window) == 0 { + // return + //} // extract the entering/leaving bytes and update the circular buffer. enter := uint32(b) leave := uint32(d.window[d.oldest]) diff --git a/vendor/github.com/chmduquesne/rollinghash/bozo32/bozo32.go b/vendor/github.com/chmduquesne/rollinghash/bozo32/bozo32.go index 97f78269..48fc45ef 100644 --- a/vendor/github.com/chmduquesne/rollinghash/bozo32/bozo32.go +++ b/vendor/github.com/chmduquesne/rollinghash/bozo32/bozo32.go @@ -14,9 +14,9 @@ const Size = 4 // Bozo32 is a digest which satisfies the rollinghash.Hash32 interface. type Bozo32 struct { - a uint32 - h uint32 - aPowerN uint32 + a uint32 + aⁿ uint32 + value uint32 // window is treated like a circular buffer, where the oldest element // is indicated by d.oldest @@ -26,19 +26,19 @@ type Bozo32 struct { // Reset resets the Hash to its initial state. func (d *Bozo32) Reset() { - d.h = 0 - d.aPowerN = 1 - d.window = nil + d.value = 0 + d.aⁿ = 1 d.oldest = 0 + d.window = d.window[:0] } func NewFromInt(a uint32) *Bozo32 { return &Bozo32{ - a: a, - h: 0, - aPowerN: 1, - window: make([]byte, 1, rollinghash.DefaultWindowCap), - oldest: 0, + a: a, + value: 0, + aⁿ: 1, + window: make([]byte, 0, rollinghash.DefaultWindowCap), + oldest: 0, } } @@ -52,32 +52,37 @@ func (d *Bozo32) Size() int { return Size } // BlockSize is 1 byte func (d *Bozo32) BlockSize() int { return 1 } -// Write (re)initializes the rolling window with the input byte slice and -// adds its data to the digest. It never returns an error. +// Write appends data to the rolling window and updates the digest. It +// never returns an error. func (d *Bozo32) Write(data []byte) (int, error) { - // Copy the window l := len(data) if l == 0 { - l = 1 + return 0, nil } - if len(d.window) >= l { - d.window = d.window[:l] - } else { - d.window = make([]byte, l) + // Re-arrange the window so that the leftmost element is at index 0 + n := len(d.window) + if d.oldest != 0 { + tmp := make([]byte, d.oldest) + copy(tmp, d.window[:d.oldest]) + copy(d.window, d.window[d.oldest:]) + copy(d.window[n-d.oldest:], tmp) + d.oldest = 0 } - copy(d.window, data) + d.window = append(d.window, data...) + d.value = 0 + d.aⁿ = 1 for _, c := range d.window { - d.h *= d.a - d.h += uint32(c) - d.aPowerN *= d.a + d.value *= d.a + d.value += uint32(c) + d.aⁿ *= d.a } - return len(d.window), nil + return len(data), nil } // Sum32 returns the hash as a uint32 func (d *Bozo32) Sum32() uint32 { - return d.h + return d.value } // Sum returns the hash as byte slice @@ -89,6 +94,12 @@ func (d *Bozo32) Sum(b []byte) []byte { // Roll updates the checksum of the window from the entering byte. You // MUST initialize a window with Write() before calling this method. func (d *Bozo32) Roll(c byte) { + // This check costs 10-15% performance. If we disable it, we crash + // when the window is empty. If we enable it, we are always correct + // (an empty window never changes no matter how much you roll it). + //if len(d.window) == 0 { + // return + //} // extract the entering/leaving bytes and update the circular buffer. enter := uint32(c) leave := uint32(d.window[d.oldest]) @@ -99,5 +110,5 @@ func (d *Bozo32) Roll(c byte) { d.oldest = 0 } - d.h = d.h*d.a + enter - leave*d.aPowerN + d.value = d.value*d.a + enter - leave*d.aⁿ } diff --git a/vendor/github.com/chmduquesne/rollinghash/buzhash32/buzhash32.go b/vendor/github.com/chmduquesne/rollinghash/buzhash32/buzhash32.go index 68b48731..576d8158 100644 --- a/vendor/github.com/chmduquesne/rollinghash/buzhash32/buzhash32.go +++ b/vendor/github.com/chmduquesne/rollinghash/buzhash32/buzhash32.go @@ -65,7 +65,7 @@ func New() *Buzhash32 { func NewFromUint32Array(b [256]uint32) *Buzhash32 { return &Buzhash32{ sum: 0, - window: make([]byte, 1, rollinghash.DefaultWindowCap), + window: make([]byte, 0, rollinghash.DefaultWindowCap), oldest: 0, bytehash: b, } @@ -77,30 +77,31 @@ func (d *Buzhash32) Size() int { return Size } // BlockSize is 1 byte func (d *Buzhash32) BlockSize() int { return 1 } -// Write (re)initializes the rolling window with the input byte slice and -// adds its data to the digest. +// Write appends data to the rolling window and updates the digest. func (d *Buzhash32) Write(data []byte) (int, error) { - // Copy the window, avoiding allocations where possible l := len(data) if l == 0 { - l = 1 + return 0, nil } - if len(d.window) != l { - if cap(d.window) >= l { - d.window = d.window[:l] - } else { - d.window = make([]byte, l) - } + // Re-arrange the window so that the leftmost element is at index 0 + n := len(d.window) + if d.oldest != 0 { + tmp := make([]byte, d.oldest) + copy(tmp, d.window[:d.oldest]) + copy(d.window, d.window[d.oldest:]) + copy(d.window[n-d.oldest:], tmp) + d.oldest = 0 } - copy(d.window, data) + d.window = append(d.window, data...) + d.sum = 0 for _, c := range d.window { d.sum = d.sum<<1 | d.sum>>31 d.sum ^= d.bytehash[int(c)] } d.nRotate = uint(len(d.window)) % 32 d.nRotateComplement = 32 - d.nRotate - return len(d.window), nil + return len(data), nil } // Sum32 returns the hash as a uint32 @@ -117,6 +118,13 @@ func (d *Buzhash32) Sum(b []byte) []byte { // Roll updates the checksum of the window from the entering byte. You // MUST initialize a window with Write() before calling this method. func (d *Buzhash32) Roll(c byte) { + // This check costs 10-15% performance. If we disable it, we crash + // when the window is empty. If we enable it, we are always correct + // (an empty window never changes no matter how much you roll it). + //if len(d.window) == 0 { + // return + //} + // extract the entering/leaving bytes and update the circular buffer. hn := d.bytehash[int(c)] h0 := d.bytehash[int(d.window[d.oldest])] diff --git a/vendor/github.com/chmduquesne/rollinghash/buzhash64/buzhash64.go b/vendor/github.com/chmduquesne/rollinghash/buzhash64/buzhash64.go index 5b0e44d8..4712bc58 100644 --- a/vendor/github.com/chmduquesne/rollinghash/buzhash64/buzhash64.go +++ b/vendor/github.com/chmduquesne/rollinghash/buzhash64/buzhash64.go @@ -65,7 +65,7 @@ func New() *Buzhash64 { func NewFromUint64Array(b [256]uint64) *Buzhash64 { return &Buzhash64{ sum: 0, - window: make([]byte, 1, rollinghash.DefaultWindowCap), + window: make([]byte, 0, rollinghash.DefaultWindowCap), oldest: 0, bytehash: b, } @@ -77,30 +77,32 @@ func (d *Buzhash64) Size() int { return Size } // BlockSize is 1 byte func (d *Buzhash64) BlockSize() int { return 1 } -// Write (re)initializes the rolling window with the input byte slice and -// adds its data to the digest. +// Write appends data to the rolling window and updates the digest. It +// never returns an error. func (d *Buzhash64) Write(data []byte) (int, error) { - // Copy the window, avoiding allocations where possible l := len(data) if l == 0 { - l = 1 + return 0, nil } - if len(d.window) != l { - if cap(d.window) >= l { - d.window = d.window[:l] - } else { - d.window = make([]byte, l) - } + // Re-arrange the window so that the leftmost element is at index 0 + n := len(d.window) + if d.oldest != 0 { + tmp := make([]byte, d.oldest) + copy(tmp, d.window[:d.oldest]) + copy(d.window, d.window[d.oldest:]) + copy(d.window[n-d.oldest:], tmp) + d.oldest = 0 } - copy(d.window, data) + d.window = append(d.window, data...) + d.sum = 0 for _, c := range d.window { d.sum = d.sum<<1 | d.sum>>63 d.sum ^= d.bytehash[int(c)] } d.nRotate = uint(len(d.window)) % 64 d.nRotateComplement = 64 - d.nRotate - return len(d.window), nil + return len(data), nil } // Sum64 returns the hash as a uint64 @@ -117,6 +119,13 @@ func (d *Buzhash64) Sum(b []byte) []byte { // Roll updates the checksum of the window from the entering byte. You // MUST initialize a window with Write() before calling this method. func (d *Buzhash64) Roll(c byte) { + // This check costs 10-15% performance. If we disable it, we crash + // when the window is empty. If we enable it, we are always correct + // (an empty window never changes no matter how much you roll it). + //if len(d.window) == 0 { + // return + //} + // extract the entering/leaving bytes and update the circular buffer. hn := d.bytehash[int(c)] h0 := d.bytehash[int(d.window[d.oldest])] diff --git a/vendor/github.com/chmduquesne/rollinghash/rabinkarp64/rabinkarp64.go b/vendor/github.com/chmduquesne/rollinghash/rabinkarp64/rabinkarp64.go index 30e3197f..3a459257 100644 --- a/vendor/github.com/chmduquesne/rollinghash/rabinkarp64/rabinkarp64.go +++ b/vendor/github.com/chmduquesne/rollinghash/rabinkarp64/rabinkarp64.go @@ -70,8 +70,10 @@ func init() { cache.entries = make(map[index]*tables) } -func (d *RabinKarp64) buildTables() { +func (d *RabinKarp64) updateTables() { windowsize := len(d.window) + pol := d.pol + idx := index{d.pol, windowsize} cache.Lock() @@ -82,8 +84,15 @@ func (d *RabinKarp64) buildTables() { return } - t = &tables{} + d.tables = buildTables(pol, windowsize) + cache.Lock() + cache.entries[idx] = d.tables + cache.Unlock() + return +} +func buildTables(pol Pol, windowsize int) (t *tables) { + t = &tables{} // calculate table for sliding out bytes. The byte to slide out is used as // the index for the table, the value contains the following: // out_table[b] = Hash(b || 0 || ... || 0) @@ -99,17 +108,17 @@ func (d *RabinKarp64) buildTables() { var h Pol h <<= 8 h |= Pol(b) - h = h.Mod(d.pol) + h = h.Mod(pol) for i := 0; i < windowsize-1; i++ { h <<= 8 h |= Pol(0) - h = h.Mod(d.pol) + h = h.Mod(pol) } t.out[b] = h } // calculate table for reduction mod Polynomial - k := d.pol.Deg() + k := pol.Deg() for b := 0; b < 256; b++ { // mod_table[b] = A | B, where A = (b(x) * x^k mod pol) and B = b(x) * x^k // @@ -118,13 +127,10 @@ func (d *RabinKarp64) buildTables() { // two parts: Part A contains the result of the modulus operation, part // B is used to cancel out the 8 top bits so that one XOR operation is // enough to reduce modulo Polynomial - t.mod[b] = Pol(uint64(b)<= l { - d.window = d.window[:l] - } else { - d.window = make([]byte, l) + // Re-arrange the window so that the leftmost element is at index 0 + n := len(d.window) + if d.oldest != 0 { + tmp := make([]byte, d.oldest) + copy(tmp, d.window[:d.oldest]) + copy(d.window, d.window[d.oldest:]) + copy(d.window[n-d.oldest:], tmp) + d.oldest = 0 } - copy(d.window, data) + d.window = append(d.window, data...) + d.value = 0 for _, b := range d.window { d.value <<= 8 d.value |= Pol(b) d.value = d.value.Mod(d.pol) } - d.buildTables() + d.updateTables() - return len(d.window), nil + return len(data), nil } // Sum64 returns the hash as a uint64 @@ -207,6 +217,12 @@ func (d *RabinKarp64) Sum(b []byte) []byte { // Roll updates the checksum of the window from the entering byte. You // MUST initialize a window with Write() before calling this method. func (d *RabinKarp64) Roll(c byte) { + // This check costs 10-15% performance. If we disable it, we crash + // when the window is empty. If we enable it, we are always correct + // (an empty window never changes no matter how much you roll it). + //if len(d.window) == 0 { + // return + //} // extract the entering/leaving bytes and update the circular buffer. enter := c leave := uint64(d.window[d.oldest]) diff --git a/vendor/github.com/chmduquesne/rollinghash/roll/main.go b/vendor/github.com/chmduquesne/rollinghash/roll/main.go index 88cf447c..baa860f4 100644 --- a/vendor/github.com/chmduquesne/rollinghash/roll/main.go +++ b/vendor/github.com/chmduquesne/rollinghash/roll/main.go @@ -3,9 +3,11 @@ package main import ( "flag" "fmt" + "hash" "io" "log" "os" + "runtime/pprof" "time" "code.cloudfoundry.org/bytefmt" @@ -33,7 +35,10 @@ func genMasks() (res []uint64) { return } -func hash2uint64(s []byte) (res uint64) { +// Gets the hash sum as a uint64 +func sum64(h hash.Hash) (res uint64) { + buf := make([]byte, 0, 8) + s := h.Sum(buf) for _, b := range s { res <<= 8 res |= uint64(b) @@ -42,18 +47,27 @@ func hash2uint64(s []byte) (res uint64) { } func main() { + cpuprofile := flag.String("cpuprofile", "", "write cpu profile to file") dostats := flag.Bool("stats", false, "Do some stats about the rolling sum") size := flag.String("size", "256M", "How much data to read") flag.Parse() + if *cpuprofile != "" { + f, err := os.Create(*cpuprofile) + if err != nil { + log.Fatal(err) + } + pprof.StartCPUProfile(f) + defer pprof.StopCPUProfile() + } + fileSize, err := bytefmt.ToBytes(*size) if err != nil { log.Fatal(err) } bufsize := 16 * MiB - rbuf := make([]byte, bufsize) - hbuf := make([]byte, 0, 8) + buf := make([]byte, bufsize) t := time.Now() f, err := os.Open("/dev/urandom") @@ -66,10 +80,10 @@ func main() { } }() - io.ReadFull(f, rbuf) + io.ReadFull(f, buf) roll := rollsum.New() - roll.Write(rbuf[:64]) + roll.Write(buf[:64]) masks := genMasks() hits := make(map[uint64]uint64) @@ -97,15 +111,15 @@ func main() { fmt.Printf(status) fmt.Printf("\r") } - _, err := io.ReadFull(f, rbuf) + _, err := io.ReadFull(f, buf) if err != nil { panic(err) } k = 0 } - roll.Roll(rbuf[k]) + roll.Roll(buf[k]) if *dostats { - s := hash2uint64(roll.Sum(hbuf)) + s := sum64(roll) for _, m := range masks { if s&m == m { hits[m] += 1 diff --git a/vendor/manifest b/vendor/manifest index 737e71d1..d326237a 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -94,7 +94,7 @@ "importpath": "github.com/chmduquesne/rollinghash", "repository": "https://github.com/chmduquesne/rollinghash", "vcs": "git", - "revision": "abb8cbaf9915e48ee20cae94bcd94221b61707a2", + "revision": "a60f8e7142b536ea61bb5d84014171189eeaaa81", "branch": "master", "notests": true },