diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 5b8b13c5..b8e34d70 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -25,11 +25,6 @@ "Comment": "null-90", "Rev": "d65bffbc88a153d23a6d2a864531e6e7c2cde59b" }, - { - "ImportPath": "code.google.com/p/snappy-go/snappy", - "Comment": "null-15", - "Rev": "12e4b4183793ac4b061921e7980845e750679fd0" - }, { "ImportPath": "github.com/AudriusButkevicius/lfu-go", "Rev": "164bcecceb92fd6037f4d18a8d97b495ec6ef669" @@ -56,7 +51,11 @@ }, { "ImportPath": "github.com/syndtr/goleveldb/leveldb", - "Rev": "cd2b8f743192883ab9fbc5f070ebda1dc90f3732" + "Rev": "d8d1d2a5cc2d34c950dffa2f554525415d59f737" + }, + { + "ImportPath": "github.com/syndtr/gosnappy/snappy", + "Rev": "ce8acff4829e0c2458a67ead32390ac0a381c862" }, { "ImportPath": "github.com/vitrun/qart/coding", diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go index 0d7911ec..ccf390c9 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch.go @@ -8,65 +8,84 @@ package leveldb import ( "encoding/binary" - "errors" + "fmt" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/memdb" ) -var ( - errBatchTooShort = errors.New("leveldb: batch is too short") - errBatchBadRecord = errors.New("leveldb: bad record in batch") +type ErrBatchCorrupted struct { + Reason string +} + +func (e *ErrBatchCorrupted) Error() string { + return fmt.Sprintf("leveldb: batch corrupted: %s", e.Reason) +} + +func newErrBatchCorrupted(reason string) error { + return errors.NewErrCorrupted(nil, &ErrBatchCorrupted{reason}) +} + +const ( + batchHdrLen = 8 + 4 + batchGrowRec = 3000 ) -const kBatchHdrLen = 8 + 4 - -type batchReplay interface { - put(key, value []byte, seq uint64) - delete(key []byte, seq uint64) +type BatchReplay interface { + Put(key, value []byte) + Delete(key []byte) } // Batch is a write batch. 
type Batch struct { - buf []byte + data []byte rLen, bLen int seq uint64 sync bool } func (b *Batch) grow(n int) { - off := len(b.buf) + off := len(b.data) if off == 0 { - // include headers - off = kBatchHdrLen - n += off + off = batchHdrLen + if b.data != nil { + b.data = b.data[:off] + } } - if cap(b.buf)-off >= n { - return + if cap(b.data)-off < n { + if b.data == nil { + b.data = make([]byte, off, off+n) + } else { + odata := b.data + div := 1 + if b.rLen > batchGrowRec { + div = b.rLen / batchGrowRec + } + b.data = make([]byte, off, off+n+(off-batchHdrLen)/div) + copy(b.data, odata) + } } - buf := make([]byte, 2*cap(b.buf)+n) - copy(buf, b.buf) - b.buf = buf[:off] } -func (b *Batch) appendRec(t vType, key, value []byte) { +func (b *Batch) appendRec(kt kType, key, value []byte) { n := 1 + binary.MaxVarintLen32 + len(key) - if t == tVal { + if kt == ktVal { n += binary.MaxVarintLen32 + len(value) } b.grow(n) - off := len(b.buf) - buf := b.buf[:off+n] - buf[off] = byte(t) + off := len(b.data) + data := b.data[:off+n] + data[off] = byte(kt) off += 1 - off += binary.PutUvarint(buf[off:], uint64(len(key))) - copy(buf[off:], key) + off += binary.PutUvarint(data[off:], uint64(len(key))) + copy(data[off:], key) off += len(key) - if t == tVal { - off += binary.PutUvarint(buf[off:], uint64(len(value))) - copy(buf[off:], value) + if kt == ktVal { + off += binary.PutUvarint(data[off:], uint64(len(value))) + copy(data[off:], value) off += len(value) } - b.buf = buf[:off] + b.data = data[:off] b.rLen++ // Include 8-byte ikey header b.bLen += len(key) + len(value) + 8 @@ -75,18 +94,51 @@ func (b *Batch) appendRec(t vType, key, value []byte) { // Put appends 'put operation' of the given key/value pair to the batch. // It is safe to modify the contents of the argument after Put returns. func (b *Batch) Put(key, value []byte) { - b.appendRec(tVal, key, value) + b.appendRec(ktVal, key, value) } // Delete appends 'delete operation' of the given key to the batch. // It is safe to modify the contents of the argument after Delete returns. func (b *Batch) Delete(key []byte) { - b.appendRec(tDel, key, nil) + b.appendRec(ktDel, key, nil) +} + +// Dump dumps batch contents. The returned slice can be loaded into the +// batch using Load method. +// The returned slice is not its own copy, so the contents should not be +// modified. +func (b *Batch) Dump() []byte { + return b.encode() +} + +// Load loads given slice into the batch. Previous contents of the batch +// will be discarded. +// The given slice will not be copied and will be used as batch buffer, so +// it is not safe to modify the contents of the slice. +func (b *Batch) Load(data []byte) error { + return b.decode(0, data) +} + +// Replay replays batch contents. +func (b *Batch) Replay(r BatchReplay) error { + return b.decodeRec(func(i int, kt kType, key, value []byte) { + switch kt { + case ktVal: + r.Put(key, value) + case ktDel: + r.Delete(key) + } + }) +} + +// Len returns number of records in the batch. +func (b *Batch) Len() int { + return b.rLen } // Reset resets the batch. 
func (b *Batch) Reset() { - b.buf = nil + b.data = b.data[:0] b.seq = 0 b.rLen = 0 b.bLen = 0 @@ -97,24 +149,10 @@ func (b *Batch) init(sync bool) { b.sync = sync } -func (b *Batch) put(key, value []byte, seq uint64) { - if b.rLen == 0 { - b.seq = seq - } - b.Put(key, value) -} - -func (b *Batch) delete(key []byte, seq uint64) { - if b.rLen == 0 { - b.seq = seq - } - b.Delete(key) -} - func (b *Batch) append(p *Batch) { if p.rLen > 0 { - b.grow(len(p.buf) - kBatchHdrLen) - b.buf = append(b.buf, p.buf[kBatchHdrLen:]...) + b.grow(len(p.data) - batchHdrLen) + b.data = append(b.data, p.data[batchHdrLen:]...) b.rLen += p.rLen } if p.sync { @@ -122,95 +160,93 @@ func (b *Batch) append(p *Batch) { } } -func (b *Batch) len() int { - return b.rLen -} - +// size returns sums of key/value pair length plus 8-bytes ikey. func (b *Batch) size() int { return b.bLen } func (b *Batch) encode() []byte { b.grow(0) - binary.LittleEndian.PutUint64(b.buf, b.seq) - binary.LittleEndian.PutUint32(b.buf[8:], uint32(b.rLen)) + binary.LittleEndian.PutUint64(b.data, b.seq) + binary.LittleEndian.PutUint32(b.data[8:], uint32(b.rLen)) - return b.buf + return b.data } -func (b *Batch) decode(buf []byte) error { - if len(buf) < kBatchHdrLen { - return errBatchTooShort +func (b *Batch) decode(prevSeq uint64, data []byte) error { + if len(data) < batchHdrLen { + return newErrBatchCorrupted("too short") } - b.seq = binary.LittleEndian.Uint64(buf) - b.rLen = int(binary.LittleEndian.Uint32(buf[8:])) + b.seq = binary.LittleEndian.Uint64(data) + if b.seq < prevSeq { + return newErrBatchCorrupted("invalid sequence number") + } + b.rLen = int(binary.LittleEndian.Uint32(data[8:])) + if b.rLen < 0 { + return newErrBatchCorrupted("invalid records length") + } // No need to be precise at this point, it won't be used anyway - b.bLen = len(buf) - kBatchHdrLen - b.buf = buf + b.bLen = len(data) - batchHdrLen + b.data = data return nil } -func (b *Batch) decodeRec(f func(i int, t vType, key, value []byte)) error { - off := kBatchHdrLen +func (b *Batch) decodeRec(f func(i int, kt kType, key, value []byte)) (err error) { + off := batchHdrLen for i := 0; i < b.rLen; i++ { - if off >= len(b.buf) { - return errors.New("leveldb: invalid batch record length") + if off >= len(b.data) { + return newErrBatchCorrupted("invalid records length") } - t := vType(b.buf[off]) - if t > tVal { - return errors.New("leveldb: invalid batch record type in batch") + kt := kType(b.data[off]) + if kt > ktVal { + return newErrBatchCorrupted("bad record: invalid type") } off += 1 - x, n := binary.Uvarint(b.buf[off:]) + x, n := binary.Uvarint(b.data[off:]) off += n - if n <= 0 || off+int(x) > len(b.buf) { - return errBatchBadRecord + if n <= 0 || off+int(x) > len(b.data) { + return newErrBatchCorrupted("bad record: invalid key length") } - key := b.buf[off : off+int(x)] + key := b.data[off : off+int(x)] off += int(x) - var value []byte - if t == tVal { - x, n := binary.Uvarint(b.buf[off:]) + if kt == ktVal { + x, n := binary.Uvarint(b.data[off:]) off += n - if n <= 0 || off+int(x) > len(b.buf) { - return errBatchBadRecord + if n <= 0 || off+int(x) > len(b.data) { + return newErrBatchCorrupted("bad record: invalid value length") } - value = b.buf[off : off+int(x)] + value = b.data[off : off+int(x)] off += int(x) } - f(i, t, key, value) + f(i, kt, key, value) } return nil } -func (b *Batch) replay(to batchReplay) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - switch t { - case tVal: - to.put(key, value, b.seq+uint64(i)) - case tDel: - 
to.delete(key, b.seq+uint64(i)) - } - }) -} - func (b *Batch) memReplay(to *memdb.DB) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - ikey := newIKey(key, b.seq+uint64(i), t) + return b.decodeRec(func(i int, kt kType, key, value []byte) { + ikey := newIkey(key, b.seq+uint64(i), kt) to.Put(ikey, value) }) } +func (b *Batch) memDecodeAndReplay(prevSeq uint64, data []byte, to *memdb.DB) error { + if err := b.decode(prevSeq, data); err != nil { + return err + } + return b.memReplay(to) +} + func (b *Batch) revertMemReplay(to *memdb.DB) error { - return b.decodeRec(func(i int, t vType, key, value []byte) { - ikey := newIKey(key, b.seq+uint64(i), t) + return b.decodeRec(func(i int, kt kType, key, value []byte) { + ikey := newIkey(key, b.seq+uint64(i), kt) to.Delete(ikey) }) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go index 19b749b8..7fc842f4 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/batch_test.go @@ -15,7 +15,7 @@ import ( ) type tbRec struct { - t vType + kt kType key, value []byte } @@ -23,39 +23,39 @@ type testBatch struct { rec []*tbRec } -func (p *testBatch) put(key, value []byte, seq uint64) { - p.rec = append(p.rec, &tbRec{tVal, key, value}) +func (p *testBatch) Put(key, value []byte) { + p.rec = append(p.rec, &tbRec{ktVal, key, value}) } -func (p *testBatch) delete(key []byte, seq uint64) { - p.rec = append(p.rec, &tbRec{tDel, key, nil}) +func (p *testBatch) Delete(key []byte) { + p.rec = append(p.rec, &tbRec{ktDel, key, nil}) } func compareBatch(t *testing.T, b1, b2 *Batch) { if b1.seq != b2.seq { t.Errorf("invalid seq number want %d, got %d", b1.seq, b2.seq) } - if b1.len() != b2.len() { - t.Fatalf("invalid record length want %d, got %d", b1.len(), b2.len()) + if b1.Len() != b2.Len() { + t.Fatalf("invalid record length want %d, got %d", b1.Len(), b2.Len()) } p1, p2 := new(testBatch), new(testBatch) - err := b1.replay(p1) + err := b1.Replay(p1) if err != nil { t.Fatal("error when replaying batch 1: ", err) } - err = b2.replay(p2) + err = b2.Replay(p2) if err != nil { t.Fatal("error when replaying batch 2: ", err) } for i := range p1.rec { r1, r2 := p1.rec[i], p2.rec[i] - if r1.t != r2.t { - t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.t, r2.t) + if r1.kt != r2.kt { + t.Errorf("invalid type on record '%d' want %d, got %d", i, r1.kt, r2.kt) } if !bytes.Equal(r1.key, r2.key) { t.Errorf("invalid key on record '%d' want %s, got %s", i, string(r1.key), string(r2.key)) } - if r1.t == tVal { + if r1.kt == ktVal { if !bytes.Equal(r1.value, r2.value) { t.Errorf("invalid value on record '%d' want %s, got %s", i, string(r1.value), string(r2.value)) } @@ -75,7 +75,7 @@ func TestBatch_EncodeDecode(t *testing.T) { b1.Delete([]byte("k")) buf := b1.encode() b2 := new(Batch) - err := b2.decode(buf) + err := b2.decode(0, buf) if err != nil { t.Error("error when decoding batch: ", err) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go index 6207e681..865bc573 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/cache/cache_test.go @@ -249,7 +249,7 @@ func (x *testingCacheObject) Release() { x.releaseCalled = true 
x.cnt.releaseOne() } else { - x.t.Errorf("duplicate setfin NS#%d KEY#%s", x.ns, x.key) + x.t.Errorf("duplicate setfin NS#%d KEY#%d", x.ns, x.key) } } @@ -489,7 +489,7 @@ func TestLRUCache_Finalizer(t *testing.T) { return true } else { if p.delfinCalled != keymax { - t.Errorf("(2) #%d not all delete fin called, diff=%d", p.ns, keymax-p.delfinCalled) + t.Errorf("(2) NS#%d not all delete fin called, diff=%d", p.nsid, keymax-p.delfinCalled) } return false } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go deleted file mode 100644 index 51105889..00000000 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/config.go +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) 2012, Suryandaru Triandana -// All rights reserved. -// -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -package leveldb - -const ( - kNumLevels = 7 - - // Level-0 compaction is started when we hit this many files. - kL0_CompactionTrigger float64 = 4 - - // Soft limit on number of level-0 files. We slow down writes at this point. - kL0_SlowdownWritesTrigger = 8 - - // Maximum number of level-0 files. We stop writes at this point. - kL0_StopWritesTrigger = 12 - - // Maximum level to which a new compacted memdb is pushed if it - // does not create overlap. We try to push to level 2 to avoid the - // relatively expensive level 0=>1 compactions and to avoid some - // expensive manifest file operations. We do not push all the way to - // the largest level since that can generate a lot of wasted disk - // space if the same key space is being repeatedly overwritten. - kMaxMemCompactLevel = 2 - - // Maximum size of a table. - kMaxTableSize = 2 * 1048576 - - // Maximum bytes of overlaps in grandparent (i.e., level+2) before we - // stop building a single file in a level->level+1 compaction. - kMaxGrandParentOverlapBytes = 10 * kMaxTableSize - - // Maximum number of bytes in all compacted files. We avoid expanding - // the lower level file set of a compaction if it would make the - // total compaction cover more than this many bytes. - kExpCompactionMaxBytes = 25 * kMaxTableSize -) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go index 2f4c92e8..a653ec3b 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db.go @@ -8,7 +8,6 @@ package leveldb import ( "container/list" - "errors" "fmt" "io" "os" @@ -18,6 +17,7 @@ import ( "sync/atomic" "time" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/memdb" @@ -57,18 +57,19 @@ type DB struct { writeMergedC chan bool writeLockC chan struct{} writeAckC chan error + writeDelay time.Duration + writeDelayN int journalC chan *Batch journalAckC chan error // Compaction. - tcompCmdC chan cCmd - tcompPauseC chan chan<- struct{} - tcompTriggerC chan struct{} - mcompCmdC chan cCmd - mcompTriggerC chan struct{} - compErrC chan error - compErrSetC chan error - compStats [kNumLevels]cStats + tcompCmdC chan cCmd + tcompPauseC chan chan<- struct{} + mcompCmdC chan cCmd + compErrC chan error + compPerErrC chan error + compErrSetC chan error + compStats []cStats // Close. 
closeW sync.WaitGroup @@ -83,7 +84,7 @@ func openDB(s *session) (*DB, error) { db := &DB{ s: s, // Initial sequence - seq: s.stSeq, + seq: s.stSeqNum, // MemDB memPool: make(chan *memdb.DB, 1), // Snapshot @@ -96,13 +97,13 @@ func openDB(s *session) (*DB, error) { journalC: make(chan *Batch), journalAckC: make(chan error), // Compaction - tcompCmdC: make(chan cCmd), - tcompPauseC: make(chan chan<- struct{}), - tcompTriggerC: make(chan struct{}, 1), - mcompCmdC: make(chan cCmd), - mcompTriggerC: make(chan struct{}, 1), - compErrC: make(chan error), - compErrSetC: make(chan error), + tcompCmdC: make(chan cCmd), + tcompPauseC: make(chan chan<- struct{}), + mcompCmdC: make(chan cCmd), + compErrC: make(chan error), + compPerErrC: make(chan error), + compErrSetC: make(chan error), + compStats: make([]cStats, s.o.GetNumLevel()), // Close closeC: make(chan struct{}), } @@ -121,14 +122,14 @@ func openDB(s *session) (*DB, error) { return nil, err } - // Don't include compaction error goroutine into wait group. + // Doesn't need to be included in the wait group. go db.compactionError() + go db.mpoolDrain() db.closeW.Add(3) go db.tCompaction() go db.mCompaction() go db.jWriter() - go db.mpoolDrain() s.logf("db@open done T·%v", time.Since(start)) @@ -255,6 +256,10 @@ func RecoverFile(path string, o *opt.Options) (db *DB, err error) { } func recoverTable(s *session, o *opt.Options) error { + o = dupOptions(o) + // Mask StrictReader, lets StrictRecovery doing its job. + o.Strict &= ^opt.StrictReader + // Get all tables and sort it by file number. tableFiles_, err := s.getFiles(storage.TypeTable) if err != nil { @@ -263,10 +268,16 @@ func recoverTable(s *session, o *opt.Options) error { tableFiles := files(tableFiles_) tableFiles.sort() - var mSeq uint64 - var good, corrupted int - rec := new(sessionRecord) - bpool := util.NewBufferPool(o.GetBlockSize() + 5) + var ( + mSeq uint64 + recoveredKey, goodKey, corruptedKey, corruptedBlock, droppedTable int + + // We will drop corrupted table. + strict = o.GetStrict(opt.StrictRecovery) + + rec = &sessionRecord{numLevel: o.GetNumLevel()} + bpool = util.NewBufferPool(o.GetBlockSize() + 5) + ) buildTable := func(iter iterator.Iterator) (tmp storage.File, size int64, err error) { tmp = s.newTemp() writer, err := tmp.Create() @@ -321,25 +332,32 @@ func recoverTable(s *session, o *opt.Options) error { return err } - var tSeq uint64 - var tgood, tcorrupted, blockerr int - var imin, imax []byte - tr := table.NewReader(reader, size, nil, bpool, o) + var ( + tSeq uint64 + tgoodKey, tcorruptedKey, tcorruptedBlock int + imin, imax []byte + ) + tr, err := table.NewReader(reader, size, storage.NewFileInfo(file), nil, bpool, o) + if err != nil { + return err + } iter := tr.NewIterator(nil, nil) iter.(iterator.ErrorCallbackSetter).SetErrorCallback(func(err error) { - s.logf("table@recovery found error @%d %q", file.Num(), err) - blockerr++ + if errors.IsCorrupted(err) { + s.logf("table@recovery block corruption @%d %q", file.Num(), err) + tcorruptedBlock++ + } }) // Scan the table. 
for iter.Next() { key := iter.Key() - _, seq, _, ok := parseIkey(key) - if !ok { - tcorrupted++ + _, seq, _, kerr := parseIkey(key) + if kerr != nil { + tcorruptedKey++ continue } - tgood++ + tgoodKey++ if seq > tSeq { tSeq = seq } @@ -354,8 +372,18 @@ func recoverTable(s *session, o *opt.Options) error { } iter.Release() - if tgood > 0 { - if tcorrupted > 0 || blockerr > 0 { + goodKey += tgoodKey + corruptedKey += tcorruptedKey + corruptedBlock += tcorruptedBlock + + if strict && (tcorruptedKey > 0 || tcorruptedBlock > 0) { + droppedTable++ + s.logf("table@recovery dropped @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) + return nil + } + + if tgoodKey > 0 { + if tcorruptedKey > 0 || tcorruptedBlock > 0 { // Rebuild the table. s.logf("table@recovery rebuilding @%d", file.Num()) iter := tr.NewIterator(nil, nil) @@ -373,16 +401,15 @@ func recoverTable(s *session, o *opt.Options) error { if tSeq > mSeq { mSeq = tSeq } + recoveredKey += tgoodKey // Add table to level 0. rec.addTable(0, file.Num(), uint64(size), imin, imax) - s.logf("table@recovery recovered @%d N·%d C·%d B·%d S·%d Q·%d", file.Num(), tgood, tcorrupted, blockerr, size, tSeq) + s.logf("table@recovery recovered @%d Gk·%d Ck·%d Cb·%d S·%d Q·%d", file.Num(), tgoodKey, tcorruptedKey, tcorruptedBlock, size, tSeq) } else { - s.logf("table@recovery unrecoverable @%d C·%d B·%d S·%d", file.Num(), tcorrupted, blockerr, size) + droppedTable++ + s.logf("table@recovery unrecoverable @%d Ck·%d Cb·%d S·%d", file.Num(), tcorruptedKey, tcorruptedBlock, size) } - good += tgood - corrupted += tcorrupted - return nil } @@ -399,11 +426,11 @@ func recoverTable(s *session, o *opt.Options) error { } } - s.logf("table@recovery recovered F·%d N·%d C·%d Q·%d", len(tableFiles), good, corrupted, mSeq) + s.logf("table@recovery recovered F·%d N·%d Gk·%d Ck·%d Q·%d", len(tableFiles), recoveredKey, goodKey, corruptedKey, mSeq) } // Set sequence number. - rec.setSeq(mSeq + 1) + rec.setSeqNum(mSeq + 1) // Create new manifest. if err := s.create(); err != nil { @@ -486,26 +513,30 @@ func (db *DB) recoverJournal() error { if err == io.EOF { break } - return err + return errors.SetFile(err, file) } buf.Reset() if _, err := buf.ReadFrom(r); err != nil { if err == io.ErrUnexpectedEOF { + // This is error returned due to corruption, with strict == false. continue } else { - return err + return errors.SetFile(err, file) } } - if err := batch.decode(buf.Bytes()); err != nil { - return err - } - if err := batch.memReplay(mem); err != nil { - return err + if err := batch.memDecodeAndReplay(db.seq, buf.Bytes(), mem); err != nil { + if strict || !errors.IsCorrupted(err) { + return errors.SetFile(err, file) + } else { + db.s.logf("journal error: %v (skipped)", err) + // We won't apply sequence number as it might be corrupted. + continue + } } // Save sequence number. - db.seq = batch.seq + uint64(batch.len()) + db.seq = batch.seq + uint64(batch.Len()) // Flush it if large enough. 
if mem.Size() >= writeBuffer { @@ -566,7 +597,7 @@ func (db *DB) recoverJournal() error { } func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, err error) { - ikey := newIKey(key, seq, tSeek) + ikey := newIkey(key, seq, ktSeek) em, fm := db.getMems() for _, m := range [...]*memDB{em, fm} { @@ -577,9 +608,13 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er mk, mv, me := m.mdb.Find(ikey) if me == nil { - ukey, _, t, ok := parseIkey(mk) - if ok && db.s.icmp.uCompare(ukey, key) == 0 { - if t == tDel { + ukey, _, kt, kerr := parseIkey(mk) + if kerr != nil { + // Shouldn't have had happen. + panic(kerr) + } + if db.s.icmp.uCompare(ukey, key) == 0 { + if kt == ktDel { return nil, ErrNotFound } return append([]byte{}, mv...), nil @@ -594,7 +629,7 @@ func (db *DB) get(key []byte, seq uint64, ro *opt.ReadOptions) (value []byte, er v.release() if cSched { // Trigger table compaction. - db.compTrigger(db.tcompTriggerC) + db.compSendTrigger(db.tcompCmdC) } return } @@ -697,7 +732,7 @@ func (db *DB) GetProperty(name string) (value string, err error) { var level uint var rest string n, _ := fmt.Sscanf(p[len(numFilesPrefix):], "%d%s", &level, &rest) - if n != 1 || level >= kNumLevels { + if n != 1 || int(level) >= db.s.o.GetNumLevel() { err = errors.New("leveldb: GetProperty: invalid property: " + name) } else { value = fmt.Sprint(v.tLen(int(level))) @@ -759,8 +794,8 @@ func (db *DB) SizeOf(ranges []util.Range) (Sizes, error) { sizes := make(Sizes, 0, len(ranges)) for _, r := range ranges { - imin := newIKey(r.Start, kMaxSeq, tSeek) - imax := newIKey(r.Limit, kMaxSeq, tSeek) + imin := newIkey(r.Start, kMaxSeq, ktSeek) + imax := newIkey(r.Limit, kMaxSeq, ktSeek) start, err := v.offsetOf(imin) if err != nil { return nil, err @@ -816,6 +851,10 @@ func (db *DB) Close() error { db.journalWriter.Close() } + if db.writeDelayN > 0 { + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) + } + // Close session. db.s.close() db.logf("db@close done T·%v", time.Since(start)) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go index e0ae721e..447407ab 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_compaction.go @@ -7,11 +7,12 @@ package leveldb import ( - "errors" "sync" "time" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/memdb" + "github.com/syndtr/goleveldb/leveldb/opt" ) var ( @@ -68,7 +69,7 @@ type cMem struct { } func newCMem(s *session) *cMem { - return &cMem{s: s, rec: new(sessionRecord)} + return &cMem{s: s, rec: &sessionRecord{numLevel: s.o.GetNumLevel()}} } func (c *cMem) flush(mem *memdb.DB, level int) error { @@ -84,7 +85,9 @@ func (c *cMem) flush(mem *memdb.DB, level int) error { // Pick level. if level < 0 { - level = s.version_NB().pickLevel(t.imin.ukey(), t.imax.ukey()) + v := s.version() + level = v.pickLevel(t.imin.ukey(), t.imax.ukey()) + v.release() } c.rec.addTableFile(level, t) @@ -95,24 +98,32 @@ func (c *cMem) flush(mem *memdb.DB, level int) error { } func (c *cMem) reset() { - c.rec = new(sessionRecord) + c.rec = &sessionRecord{numLevel: c.s.o.GetNumLevel()} } func (c *cMem) commit(journal, seq uint64) error { c.rec.setJournalNum(journal) - c.rec.setSeq(seq) + c.rec.setSeqNum(seq) // Commit changes. 
return c.s.commit(c.rec) } func (db *DB) compactionError() { - var err error + var ( + err error + wlocked bool + ) noerr: + // No error. for { select { case err = <-db.compErrSetC: - if err != nil { + switch { + case err == nil: + case errors.IsCorrupted(err): + goto hasperr + default: goto haserr } case _, _ = <-db.closeC: @@ -120,17 +131,39 @@ noerr: } } haserr: + // Transient error. for { select { case db.compErrC <- err: case err = <-db.compErrSetC: - if err == nil { + switch { + case err == nil: goto noerr + case errors.IsCorrupted(err): + goto hasperr + default: } case _, _ = <-db.closeC: return } } +hasperr: + // Persistent error. + for { + select { + case db.compErrC <- err: + case db.compPerErrC <- err: + case db.writeLockC <- struct{}{}: + // Hold write lock, so that write won't pass-through. + wlocked = true + case _, _ = <-db.closeC: + if wlocked { + // We should release the lock or Close will hang. + <-db.writeLockC + } + return + } + } } type compactionTransactCounter int @@ -139,12 +172,17 @@ func (cnt *compactionTransactCounter) incr() { *cnt++ } -func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactCounter) error, rollback func() error) { +type compactionTransactInterface interface { + run(cnt *compactionTransactCounter) error + revert() error +} + +func (db *DB) compactionTransact(name string, t compactionTransactInterface) { defer func() { if x := recover(); x != nil { - if x == errCompactionTransactExiting && rollback != nil { - if err := rollback(); err != nil { - db.logf("%s rollback error %q", name, err) + if x == errCompactionTransactExiting { + if err := t.revert(); err != nil { + db.logf("%s revert error %q", name, err) } } panic(x) @@ -156,9 +194,13 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC backoffMax = 8 * time.Second backoffMul = 2 * time.Second ) - backoff := backoffMin - backoffT := time.NewTimer(backoff) - lastCnt := compactionTransactCounter(0) + var ( + backoff = backoffMin + backoffT = time.NewTimer(backoff) + lastCnt = compactionTransactCounter(0) + + disableBackoff = db.s.o.GetDisableCompactionBackoff() + ) for n := 0; ; n++ { // Check wether the DB is closed. if db.isClosed() { @@ -170,11 +212,19 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC // Execute. cnt := compactionTransactCounter(0) - err := exec(&cnt) + err := t.run(&cnt) + if err != nil { + db.logf("%s error I·%d %q", name, cnt, err) + } // Set compaction error status. select { case db.compErrSetC <- err: + case perr := <-db.compPerErrC: + if err != nil { + db.logf("%s exiting (persistent error %q)", name, perr) + db.compactionExitTransact() + } case _, _ = <-db.closeC: db.logf("%s exiting", name) db.compactionExitTransact() @@ -182,31 +232,56 @@ func (db *DB) compactionTransact(name string, exec func(cnt *compactionTransactC if err == nil { return } - db.logf("%s error I·%d %q", name, cnt, err) - - // Reset backoff duration if counter is advancing. - if cnt > lastCnt { - backoff = backoffMin - lastCnt = cnt - } - - // Backoff. - backoffT.Reset(backoff) - if backoff < backoffMax { - backoff *= backoffMul - if backoff > backoffMax { - backoff = backoffMax - } - } - select { - case <-backoffT.C: - case _, _ = <-db.closeC: - db.logf("%s exiting", name) + if errors.IsCorrupted(err) { + db.logf("%s exiting (corruption detected)", name) db.compactionExitTransact() } + + if !disableBackoff { + // Reset backoff duration if counter is advancing. 
+ if cnt > lastCnt { + backoff = backoffMin + lastCnt = cnt + } + + // Backoff. + backoffT.Reset(backoff) + if backoff < backoffMax { + backoff *= backoffMul + if backoff > backoffMax { + backoff = backoffMax + } + } + select { + case <-backoffT.C: + case _, _ = <-db.closeC: + db.logf("%s exiting", name) + db.compactionExitTransact() + } + } } } +type compactionTransactFunc struct { + runFunc func(cnt *compactionTransactCounter) error + revertFunc func() error +} + +func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error { + return t.runFunc(cnt) +} + +func (t *compactionTransactFunc) revert() error { + if t.revertFunc != nil { + return t.revertFunc() + } + return nil +} + +func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) { + db.compactionTransact(name, &compactionTransactFunc{run, revert}) +} + func (db *DB) compactionExitTransact() { panic(errCompactionTransactExiting) } @@ -232,20 +307,23 @@ func (db *DB) memCompaction() { } // Pause table compaction. - ch := make(chan struct{}) + resumeC := make(chan struct{}) select { - case db.tcompPauseC <- (chan<- struct{})(ch): + case db.tcompPauseC <- (chan<- struct{})(resumeC): + case <-db.compPerErrC: + close(resumeC) + resumeC = nil case _, _ = <-db.closeC: return } - db.compactionTransact("mem@flush", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("mem@flush", func(cnt *compactionTransactCounter) (err error) { stats.startTimer() defer stats.stopTimer() return c.flush(mem.mdb, -1) }, func() error { for _, r := range c.rec.addedTables { - db.logf("mem@flush rollback @%d", r.num) + db.logf("mem@flush revert @%d", r.num) f := db.s.getTableFile(r.num) if err := f.Remove(); err != nil { return err @@ -254,7 +332,7 @@ func (db *DB) memCompaction() { return nil }) - db.compactionTransact("mem@commit", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("mem@commit", func(cnt *compactionTransactCounter) (err error) { stats.startTimer() defer stats.stopTimer() return c.commit(db.journalFile.Num(), db.frozenSeq) @@ -271,26 +349,223 @@ func (db *DB) memCompaction() { db.dropFrozenMem() // Resume table compaction. - select { - case <-ch: - case _, _ = <-db.closeC: - return + if resumeC != nil { + select { + case <-resumeC: + close(resumeC) + case _, _ = <-db.closeC: + return + } } // Trigger table compaction. - db.compTrigger(db.mcompTriggerC) + db.compSendTrigger(db.tcompCmdC) +} + +type tableCompactionBuilder struct { + db *DB + s *session + c *compaction + rec *sessionRecord + stat0, stat1 *cStatsStaging + + snapHasLastUkey bool + snapLastUkey []byte + snapLastSeq uint64 + snapIter int + snapKerrCnt int + snapDropCnt int + + kerrCnt int + dropCnt int + + minSeq uint64 + strict bool + tableSize int + + tw *tWriter +} + +func (b *tableCompactionBuilder) appendKV(key, value []byte) error { + // Create new table if not already. + if b.tw == nil { + // Check for pause event. + if b.db != nil { + select { + case ch := <-b.db.tcompPauseC: + b.db.pauseCompaction(ch) + case _, _ = <-b.db.closeC: + b.db.compactionExitTransact() + default: + } + } + + // Create new table. + var err error + b.tw, err = b.s.tops.create() + if err != nil { + return err + } + } + + // Write key/value into table. 
+ return b.tw.append(key, value) +} + +func (b *tableCompactionBuilder) needFlush() bool { + return b.tw.tw.BytesLen() >= b.tableSize +} + +func (b *tableCompactionBuilder) flush() error { + t, err := b.tw.finish() + if err != nil { + return err + } + b.rec.addTableFile(b.c.level+1, t) + b.stat1.write += t.size + b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.level+1, t.file.Num(), b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) + b.tw = nil + return nil +} + +func (b *tableCompactionBuilder) cleanup() { + if b.tw != nil { + b.tw.drop() + b.tw = nil + } +} + +func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error { + snapResumed := b.snapIter > 0 + hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary. + lastUkey := append([]byte{}, b.snapLastUkey...) + lastSeq := b.snapLastSeq + b.kerrCnt = b.snapKerrCnt + b.dropCnt = b.snapDropCnt + // Restore compaction state. + b.c.restore() + + defer b.cleanup() + + b.stat1.startTimer() + defer b.stat1.stopTimer() + + iter := b.c.newIterator() + defer iter.Release() + for i := 0; iter.Next(); i++ { + // Incr transact counter. + cnt.incr() + + // Skip until last state. + if i < b.snapIter { + continue + } + + resumed := false + if snapResumed { + resumed = true + snapResumed = false + } + + ikey := iter.Key() + ukey, seq, kt, kerr := parseIkey(ikey) + + if kerr == nil { + shouldStop := !resumed && b.c.shouldStopBefore(ikey) + + if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 { + // First occurrence of this user key. + + // Only rotate tables if ukey doesn't hop across. + if b.tw != nil && (shouldStop || b.needFlush()) { + if err := b.flush(); err != nil { + return err + } + + // Creates snapshot of the state. + b.c.save() + b.snapHasLastUkey = hasLastUkey + b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...) + b.snapLastSeq = lastSeq + b.snapIter = i + b.snapKerrCnt = b.kerrCnt + b.snapDropCnt = b.dropCnt + } + + hasLastUkey = true + lastUkey = append(lastUkey[:0], ukey...) + lastSeq = kMaxSeq + } + + switch { + case lastSeq <= b.minSeq: + // Dropped because newer entry for same user key exist + fallthrough // (A) + case kt == ktDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey): + // For this user key: + // (1) there is no data in higher levels + // (2) data in lower levels will have larger seq numbers + // (3) data in layers that are being compacted here and have + // smaller seq numbers will be dropped in the next + // few iterations of this loop (by rule (A) above). + // Therefore this deletion marker is obsolete and can be dropped. + lastSeq = seq + b.dropCnt++ + continue + default: + lastSeq = seq + } + } else { + if b.strict { + return kerr + } + + // Don't drop corrupted keys. + hasLastUkey = false + lastUkey = lastUkey[:0] + lastSeq = kMaxSeq + b.kerrCnt++ + } + + if err := b.appendKV(ikey, iter.Value()); err != nil { + return err + } + } + + if err := iter.Error(); err != nil { + return err + } + + // Finish last table. 
+ if b.tw != nil && !b.tw.empty() { + return b.flush() + } + return nil +} + +func (b *tableCompactionBuilder) revert() error { + for _, at := range b.rec.addedTables { + b.s.logf("table@build revert @%d", at.num) + f := b.s.getTableFile(at.num) + if err := f.Remove(); err != nil { + return err + } + } + return nil } func (db *DB) tableCompaction(c *compaction, noTrivial bool) { - rec := new(sessionRecord) - rec.addCompactionPointer(c.level, c.imax) + defer c.release() + + rec := &sessionRecord{numLevel: db.s.o.GetNumLevel()} + rec.addCompPtr(c.level, c.imax) if !noTrivial && c.trivial() { t := c.tables[0][0] db.logf("table@move L%d@%d -> L%d", c.level, t.file.Num(), c.level+1) - rec.deleteTable(c.level, t.file.Num()) + rec.delTable(c.level, t.file.Num()) rec.addTableFile(c.level+1, t) - db.compactionTransact("table@move", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("table@move", func(cnt *compactionTransactCounter) (err error) { return db.s.commit(rec) }, nil) return @@ -301,184 +576,34 @@ func (db *DB) tableCompaction(c *compaction, noTrivial bool) { for _, t := range tables { stats[i].read += t.size // Insert deleted tables into record - rec.deleteTable(c.level+i, t.file.Num()) + rec.delTable(c.level+i, t.file.Num()) } } sourceSize := int(stats[0].read + stats[1].read) minSeq := db.minSeq() db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.level, len(c.tables[0]), c.level+1, len(c.tables[1]), shortenb(sourceSize), minSeq) - var snapUkey []byte - var snapHasUkey bool - var snapSeq uint64 - var snapIter int - var snapDropCnt int - var dropCnt int - db.compactionTransact("table@build", func(cnt *compactionTransactCounter) (err error) { - ukey := append([]byte{}, snapUkey...) - hasUkey := snapHasUkey - lseq := snapSeq - dropCnt = snapDropCnt - snapSched := snapIter == 0 - - var tw *tWriter - finish := func() error { - t, err := tw.finish() - if err != nil { - return err - } - rec.addTableFile(c.level+1, t) - stats[1].write += t.size - db.logf("table@build created L%d@%d N·%d S·%s %q:%q", c.level+1, t.file.Num(), tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax) - return nil - } - - defer func() { - stats[1].stopTimer() - if tw != nil { - tw.drop() - tw = nil - } - }() - - stats[1].startTimer() - iter := c.newIterator() - defer iter.Release() - for i := 0; iter.Next(); i++ { - // Incr transact counter. - cnt.incr() - - // Skip until last state. - if i < snapIter { - continue - } - - ikey := iKey(iter.Key()) - - if c.shouldStopBefore(ikey) && tw != nil { - err = finish() - if err != nil { - return - } - snapSched = true - tw = nil - } - - // Scheduled for snapshot, snapshot will used to retry compaction - // if error occured. - if snapSched { - snapUkey = append(snapUkey[:0], ukey...) - snapHasUkey = hasUkey - snapSeq = lseq - snapIter = i - snapDropCnt = dropCnt - snapSched = false - } - - if seq, vt, ok := ikey.parseNum(); !ok { - // Don't drop error keys - ukey = ukey[:0] - hasUkey = false - lseq = kMaxSeq - } else { - if !hasUkey || db.s.icmp.uCompare(ikey.ukey(), ukey) != 0 { - // First occurrence of this user key - ukey = append(ukey[:0], ikey.ukey()...) 
- hasUkey = true - lseq = kMaxSeq - } - - drop := false - if lseq <= minSeq { - // Dropped because newer entry for same user key exist - drop = true // (A) - } else if vt == tDel && seq <= minSeq && c.baseLevelForKey(ukey) { - // For this user key: - // (1) there is no data in higher levels - // (2) data in lower levels will have larger seq numbers - // (3) data in layers that are being compacted here and have - // smaller seq numbers will be dropped in the next - // few iterations of this loop (by rule (A) above). - // Therefore this deletion marker is obsolete and can be dropped. - drop = true - } - - lseq = seq - if drop { - dropCnt++ - continue - } - } - - // Create new table if not already - if tw == nil { - // Check for pause event. - select { - case ch := <-db.tcompPauseC: - db.pauseCompaction(ch) - case _, _ = <-db.closeC: - db.compactionExitTransact() - default: - } - - // Create new table. - tw, err = db.s.tops.create() - if err != nil { - return - } - } - - // Write key/value into table - err = tw.append(ikey, iter.Value()) - if err != nil { - return - } - - // Finish table if it is big enough - if tw.tw.BytesLen() >= kMaxTableSize { - err = finish() - if err != nil { - return - } - snapSched = true - tw = nil - } - } - - err = iter.Error() - if err != nil { - return - } - - // Finish last table - if tw != nil && !tw.empty() { - err = finish() - if err != nil { - return - } - tw = nil - } - return - }, func() error { - for _, r := range rec.addedTables { - db.logf("table@build rollback @%d", r.num) - f := db.s.getTableFile(r.num) - if err := f.Remove(); err != nil { - return err - } - } - return nil - }) + b := &tableCompactionBuilder{ + db: db, + s: db.s, + c: c, + rec: rec, + stat1: &stats[1], + minSeq: minSeq, + strict: db.s.o.GetStrict(opt.StrictCompaction), + tableSize: db.s.o.GetCompactionTableSize(c.level + 1), + } + db.compactionTransact("table@build", b) // Commit changes - db.compactionTransact("table@commit", func(cnt *compactionTransactCounter) (err error) { + db.compactionTransactFunc("table@commit", func(cnt *compactionTransactCounter) (err error) { stats[1].startTimer() defer stats[1].stopTimer() return db.s.commit(rec) }, nil) resultSize := int(stats[1].write) - db.logf("table@compaction committed F%s S%s D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), dropCnt, stats[1].duration) + db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration) // Save compaction stats for i := range stats { @@ -494,14 +619,14 @@ func (db *DB) tableRangeCompaction(level int, umin, umax []byte) { db.tableCompaction(c, true) } } else { - v := db.s.version_NB() - + v := db.s.version() m := 1 for i, t := range v.tables[1:] { if t.overlaps(db.s.icmp, umin, umax, false) { m = i + 1 } } + v.release() for level := 0; level < m; level++ { if c := db.s.getCompactionRange(level, umin, umax); c != nil { @@ -518,7 +643,9 @@ func (db *DB) tableAutoCompaction() { } func (db *DB) tableNeedCompaction() bool { - return db.s.version_NB().needCompaction() + v := db.s.version() + defer v.release() + return v.needCompaction() } func (db *DB) pauseCompaction(ch chan<- struct{}) { @@ -538,10 +665,12 @@ type cIdle struct { } func (r cIdle) ack(err error) { - defer func() { - recover() - }() - r.ackC <- err + if r.ackC != nil { + defer func() { + recover() + }() + r.ackC <- err + } } type cRange struct { @@ -559,6 +688,7 @@ func (r 
cRange) ack(err error) { } } +// This will trigger auto compation and/or wait for all compaction to be done. func (db *DB) compSendIdle(compC chan<- cCmd) (err error) { ch := make(chan error) defer close(ch) @@ -580,6 +710,15 @@ func (db *DB) compSendIdle(compC chan<- cCmd) (err error) { return err } +// This will trigger auto compaction but will not wait for it. +func (db *DB) compSendTrigger(compC chan<- cCmd) { + select { + case compC <- cIdle{}: + default: + } +} + +// Send range compaction request. func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err error) { ch := make(chan error) defer close(ch) @@ -601,13 +740,6 @@ func (db *DB) compSendRange(compC chan<- cCmd, level int, min, max []byte) (err return err } -func (db *DB) compTrigger(compTriggerC chan struct{}) { - select { - case compTriggerC <- struct{}{}: - default: - } -} - func (db *DB) mCompaction() { var x cCmd @@ -626,11 +758,14 @@ func (db *DB) mCompaction() { for { select { case x = <-db.mcompCmdC: - db.memCompaction() - x.ack(nil) - x = nil - case <-db.mcompTriggerC: - db.memCompaction() + switch x.(type) { + case cIdle: + db.memCompaction() + x.ack(nil) + x = nil + default: + panic("leveldb: unknown command") + } case _, _ = <-db.closeC: return } @@ -661,7 +796,6 @@ func (db *DB) tCompaction() { if db.tableNeedCompaction() { select { case x = <-db.tcompCmdC: - case <-db.tcompTriggerC: case ch := <-db.tcompPauseC: db.pauseCompaction(ch) continue @@ -677,7 +811,6 @@ func (db *DB) tCompaction() { ackQ = ackQ[:0] select { case x = <-db.tcompCmdC: - case <-db.tcompTriggerC: case ch := <-db.tcompPauseC: db.pauseCompaction(ch) continue @@ -692,6 +825,8 @@ func (db *DB) tCompaction() { case cRange: db.tableRangeCompaction(cmd.level, cmd.min, cmd.max) x.ack(nil) + default: + panic("leveldb: unknown command") } x = nil } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go index 5c36a193..4607e5da 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_iter.go @@ -48,7 +48,8 @@ func (db *DB) newRawIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It i = append(i, fmi) } i = append(i, ti...) 
- mi := iterator.NewMergedIterator(i, db.s.icmp, true) + strict := opt.GetStrict(db.s.o.Options, ro, opt.StrictReader) + mi := iterator.NewMergedIterator(i, db.s.icmp, strict) mi.SetReleaser(&versionReleaser{v: v}) return mi } @@ -58,10 +59,10 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d if slice != nil { islice = &util.Range{} if slice.Start != nil { - islice.Start = newIKey(slice.Start, kMaxSeq, tSeek) + islice.Start = newIkey(slice.Start, kMaxSeq, ktSeek) } if slice.Limit != nil { - islice.Limit = newIKey(slice.Limit, kMaxSeq, tSeek) + islice.Limit = newIkey(slice.Limit, kMaxSeq, ktSeek) } } rawIter := db.newRawIterator(islice, ro) @@ -70,7 +71,7 @@ func (db *DB) newIterator(seq uint64, slice *util.Range, ro *opt.ReadOptions) *d icmp: db.s.icmp, iter: rawIter, seq: seq, - strict: db.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator), + strict: opt.GetStrict(db.s.o.Options, ro, opt.StrictReader), key: make([]byte, 0), value: make([]byte, 0), } @@ -161,7 +162,7 @@ func (i *dbIter) Seek(key []byte) bool { return false } - ikey := newIKey(key, i.seq, tSeek) + ikey := newIkey(key, i.seq, ktSeek) if i.iter.Seek(ikey) { i.dir = dirSOI return i.next() @@ -173,15 +174,14 @@ func (i *dbIter) Seek(key []byte) bool { func (i *dbIter) next() bool { for { - ukey, seq, t, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { if seq <= i.seq { - switch t { - case tDel: + switch kt { + case ktDel: // Skip deleted key. i.key = append(i.key[:0], ukey...) i.dir = dirForward - case tVal: + case ktVal: if i.dir == dirSOI || i.icmp.uCompare(ukey, i.key) > 0 { i.key = append(i.key[:0], ukey...) i.value = append(i.value[:0], i.iter.Value()...) @@ -191,7 +191,7 @@ func (i *dbIter) next() bool { } } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) break } if !i.iter.Next() { @@ -224,20 +224,19 @@ func (i *dbIter) prev() bool { del := true if i.iter.Valid() { for { - ukey, seq, t, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, seq, kt, kerr := parseIkey(i.iter.Key()); kerr == nil { if seq <= i.seq { if !del && i.icmp.uCompare(ukey, i.key) < 0 { return true } - del = (t == tDel) + del = (kt == ktDel) if !del { i.key = append(i.key[:0], ukey...) i.value = append(i.value[:0], i.iter.Value()...) 
} } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) return false } if !i.iter.Prev() { @@ -266,13 +265,12 @@ func (i *dbIter) Prev() bool { return i.Last() case dirForward: for i.iter.Prev() { - ukey, _, _, ok := parseIkey(i.iter.Key()) - if ok { + if ukey, _, _, kerr := parseIkey(i.iter.Key()); kerr == nil { if i.icmp.uCompare(ukey, i.key) < 0 { goto cont } } else if i.strict { - i.setErr(errInvalidIkey) + i.setErr(kerr) return false } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go index aa2c8013..8a99583d 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_test.go @@ -7,6 +7,7 @@ package leveldb import ( + "bytes" "container/list" crand "crypto/rand" "encoding/binary" @@ -23,6 +24,7 @@ import ( "unsafe" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" @@ -151,7 +153,10 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) { t := h.t db := h.db - var res uint64 + var ( + maxOverlaps uint64 + maxLevel int + ) v := db.s.version() for i, tt := range v.tables[1 : len(v.tables)-1] { level := i + 1 @@ -159,15 +164,18 @@ func (h *dbHarness) maxNextLevelOverlappingBytes(want uint64) { for _, t := range tt { r := next.getOverlaps(nil, db.s.icmp, t.imin.ukey(), t.imax.ukey(), false) sum := r.size() - if sum > res { - res = sum + if sum > maxOverlaps { + maxOverlaps = sum + maxLevel = level } } } v.release() - if res > want { - t.Errorf("next level overlapping bytes is more than %d, got=%d", want, res) + if maxOverlaps > want { + t.Errorf("next level most overlapping bytes is more than %d, got=%d level=%d", want, maxOverlaps, maxLevel) + } else { + t.Logf("next level most overlapping bytes is %d, level=%d want=%d", maxOverlaps, maxLevel, want) } } @@ -240,7 +248,7 @@ func (h *dbHarness) allEntriesFor(key, want string) { db := h.db s := db.s - ikey := newIKey([]byte(key), kMaxSeq, tVal) + ikey := newIkey([]byte(key), kMaxSeq, ktVal) iter := db.newRawIterator(nil, nil) if !iter.Seek(ikey) && iter.Error() != nil { t.Error("AllEntries: error during seek, err: ", iter.Error()) @@ -249,19 +257,18 @@ func (h *dbHarness) allEntriesFor(key, want string) { res := "[ " first := true for iter.Valid() { - rkey := iKey(iter.Key()) - if _, t, ok := rkey.parseNum(); ok { - if s.icmp.uCompare(ikey.ukey(), rkey.ukey()) != 0 { + if ukey, _, kt, kerr := parseIkey(iter.Key()); kerr == nil { + if s.icmp.uCompare(ikey.ukey(), ukey) != 0 { break } if !first { res += ", " } first = false - switch t { - case tVal: + switch kt { + case ktVal: res += string(iter.Value()) - case tDel: + case ktDel: res += "DEL" } } else { @@ -326,6 +333,8 @@ func (h *dbHarness) compactMem() { t := h.t db := h.db + t.Log("starting memdb compaction") + db.writeLockC <- struct{}{} defer func() { <-db.writeLockC @@ -341,6 +350,8 @@ func (h *dbHarness) compactMem() { if h.totalTables() == 0 { t.Error("zero tables after mem compaction") } + + t.Log("memdb compaction done") } func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) { @@ -355,6 +366,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) _max = []byte(max) } + t.Logf("starting table range compaction: level=%d, min=%q, max=%q", 
level, min, max) + if err := db.compSendRange(db.tcompCmdC, level, _min, _max); err != nil { if wanterr { t.Log("CompactRangeAt: got error (expected): ", err) @@ -364,6 +377,8 @@ func (h *dbHarness) compactRangeAtErr(level int, min, max string, wanterr bool) } else if wanterr { t.Error("CompactRangeAt: expect error") } + + t.Log("table range compaction done") } func (h *dbHarness) compactRangeAt(level int, min, max string) { @@ -374,6 +389,8 @@ func (h *dbHarness) compactRange(min, max string) { t := h.t db := h.db + t.Logf("starting DB range compaction: min=%q, max=%q", min, max) + var r util.Range if min != "" { r.Start = []byte(min) @@ -384,6 +401,8 @@ func (h *dbHarness) compactRange(min, max string) { if err := db.CompactRange(r); err != nil { t.Error("CompactRange: got error: ", err) } + + t.Log("DB range compaction done") } func (h *dbHarness) sizeAssert(start, limit string, low, hi uint64) { @@ -505,10 +524,10 @@ func Test_FieldsAligned(t *testing.T) { p1 := new(DB) testAligned(t, "DB.seq", unsafe.Offsetof(p1.seq)) p2 := new(session) - testAligned(t, "session.stFileNum", unsafe.Offsetof(p2.stFileNum)) + testAligned(t, "session.stNextFileNum", unsafe.Offsetof(p2.stNextFileNum)) testAligned(t, "session.stJournalNum", unsafe.Offsetof(p2.stJournalNum)) testAligned(t, "session.stPrevJournalNum", unsafe.Offsetof(p2.stPrevJournalNum)) - testAligned(t, "session.stSeq", unsafe.Offsetof(p2.stSeq)) + testAligned(t, "session.stSeqNum", unsafe.Offsetof(p2.stSeqNum)) } func TestDb_Locking(t *testing.T) { @@ -944,7 +963,7 @@ func TestDb_RepeatedWritesToSameKey(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{WriteBuffer: 100000}) defer h.close() - maxTables := kNumLevels + kL0_StopWritesTrigger + maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger() value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) for i := 0; i < 5*maxTables; i++ { @@ -962,7 +981,7 @@ func TestDb_RepeatedWritesToSameKeyAfterReopen(t *testing.T) { h.reopenDB() - maxTables := kNumLevels + kL0_StopWritesTrigger + maxTables := h.o.GetNumLevel() + h.o.GetWriteL0PauseTrigger() value := strings.Repeat("v", 2*h.o.GetWriteBuffer()) for i := 0; i < 5*maxTables; i++ { @@ -978,7 +997,7 @@ func TestDb_SparseMerge(t *testing.T) { h := newDbHarnessWopt(t, &opt.Options{Compression: opt.NoCompression}) defer h.close() - h.putMulti(kNumLevels, "A", "Z") + h.putMulti(h.o.GetNumLevel(), "A", "Z") // Suppose there is: // small amount of data with prefix A @@ -1002,6 +1021,7 @@ func TestDb_SparseMerge(t *testing.T) { h.put("C", "vc2") h.compactMem() + h.waitCompaction() h.maxNextLevelOverlappingBytes(20 * 1048576) h.compactRangeAt(0, "", "") h.waitCompaction() @@ -1172,7 +1192,7 @@ func TestDb_HiddenValuesAreRemoved(t *testing.T) { h.put("foo", "v1") h.compactMem() - m := kMaxMemCompactLevel + m := h.o.GetMaxMemCompationLevel() v := s.version() num := v.tLen(m) v.release() @@ -1216,7 +1236,7 @@ func TestDb_DeletionMarkers2(t *testing.T) { h.put("foo", "v1") h.compactMem() - m := kMaxMemCompactLevel + m := h.o.GetMaxMemCompationLevel() v := s.version() num := v.tLen(m) v.release() @@ -1269,14 +1289,14 @@ func TestDb_CompactionTableOpenError(t *testing.T) { t.Errorf("total tables is %d, want %d", n, im) } - h.stor.SetOpenErr(storage.TypeTable) + h.stor.SetEmuErr(storage.TypeTable, tsOpOpen) go h.db.CompactRange(util.Range{}) if err := h.db.compSendIdle(h.db.tcompCmdC); err != nil { t.Log("compaction error: ", err) } h.closeDB0() h.openDB() - h.stor.SetOpenErr(0) + h.stor.SetEmuErr(0, tsOpOpen) for i := 0; i < im; i++ { for j := 0; j < 
jm; j++ { @@ -1287,7 +1307,7 @@ func TestDb_CompactionTableOpenError(t *testing.T) { func TestDb_OverlapInLevel0(t *testing.T) { trun(t, func(h *dbHarness) { - if kMaxMemCompactLevel != 2 { + if h.o.GetMaxMemCompationLevel() != 2 { t.Fatal("fix test to reflect the config") } @@ -1407,23 +1427,23 @@ func TestDb_ManifestWriteError(t *testing.T) { h.compactMem() h.getVal("foo", "bar") v := h.db.s.version() - if n := v.tLen(kMaxMemCompactLevel); n != 1 { + if n := v.tLen(h.o.GetMaxMemCompationLevel()); n != 1 { t.Errorf("invalid total tables, want=1 got=%d", n) } v.release() if i == 0 { - h.stor.SetWriteErr(storage.TypeManifest) + h.stor.SetEmuErr(storage.TypeManifest, tsOpWrite) } else { - h.stor.SetSyncErr(storage.TypeManifest) + h.stor.SetEmuErr(storage.TypeManifest, tsOpSync) } // Merging compaction (will fail) - h.compactRangeAtErr(kMaxMemCompactLevel, "", "", true) + h.compactRangeAtErr(h.o.GetMaxMemCompationLevel(), "", "", true) h.db.Close() - h.stor.SetWriteErr(0) - h.stor.SetSyncErr(0) + h.stor.SetEmuErr(0, tsOpWrite) + h.stor.SetEmuErr(0, tsOpSync) // Should not lose data h.openDB() @@ -1573,7 +1593,7 @@ func TestDb_ManualCompaction(t *testing.T) { h := newDbHarness(t) defer h.close() - if kMaxMemCompactLevel != 2 { + if h.o.GetMaxMemCompationLevel() != 2 { t.Fatal("fix test to reflect the config") } @@ -1857,7 +1877,7 @@ func TestDb_DeletionMarkersOnMemdb(t *testing.T) { } func TestDb_LeveldbIssue178(t *testing.T) { - nKeys := (kMaxTableSize / 30) * 5 + nKeys := (opt.DefaultCompactionTableSize / 30) * 5 key1 := func(i int) string { return fmt.Sprintf("my_key_%d", i) } @@ -2125,7 +2145,7 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) { } } if err := iter.Error(); err != nil { - t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, err) + t.Fatalf("READER0 #%d.%d W#%d snap.Iterator: %v", i, k, writei, err) } iter.Release() snap.Release() @@ -2164,5 +2184,385 @@ func TestDb_GoleveldbIssue72and83(t *testing.T) { }() wg.Wait() - +} + +func TestDb_TransientError(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 128 * opt.KiB, + CachedOpenFiles: 3, + DisableCompactionBackoff: true, + }) + defer h.close() + + const ( + nSnap = 20 + nKey = 10000 + ) + + var ( + snaps [nSnap]*Snapshot + b = &Batch{} + ) + for i := range snaps { + vtail := fmt.Sprintf("VAL%030d", i) + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%8d", k) + b.Put([]byte(key), []byte(key+vtail)) + } + h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt) + if err := h.db.Write(b, nil); err != nil { + t.Logf("WRITE #%d error: %v", i, err) + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt, tsOpWrite) + for { + if err := h.db.Write(b, nil); err == nil { + break + } else if errors.IsCorrupted(err) { + t.Fatalf("WRITE #%d corrupted: %v", i, err) + } + } + } + + snaps[i] = h.db.newSnapshot() + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%8d", k) + b.Delete([]byte(key)) + } + h.stor.SetEmuRandErr(storage.TypeTable, tsOpOpen, tsOpRead, tsOpReadAt) + if err := h.db.Write(b, nil); err != nil { + t.Logf("WRITE #%d error: %v", i, err) + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt) + for { + if err := h.db.Write(b, nil); err == nil { + break + } else if errors.IsCorrupted(err) { + t.Fatalf("WRITE #%d corrupted: %v", i, err) + } + } + } + } + h.stor.SetEmuRandErr(0, tsOpOpen, tsOpRead, tsOpReadAt) + + runtime.GOMAXPROCS(runtime.NumCPU()) + + rnd := rand.New(rand.NewSource(0xecafdaed)) + wg := &sync.WaitGroup{} + for i, snap := range snaps 
{ + wg.Add(2) + + go func(i int, snap *Snapshot, sk []int) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + for _, k := range sk { + key := fmt.Sprintf("KEY%8d", k) + xvalue, err := snap.Get([]byte(key), nil) + if err != nil { + t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err) + } + value := key + vtail + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue) + } + } + }(i, snap, rnd.Perm(nKey)) + + go func(i int, snap *Snapshot) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + iter := snap.NewIterator(nil, nil) + defer iter.Release() + for k := 0; k < nKey; k++ { + if !iter.Next() { + if err := iter.Error(); err != nil { + t.Fatalf("READER_ITER #%d K%d error: %v", i, k, err) + } else { + t.Fatalf("READER_ITER #%d K%d eoi", i, k) + } + } + key := fmt.Sprintf("KEY%8d", k) + xkey := iter.Key() + if !bytes.Equal([]byte(key), xkey) { + t.Fatalf("READER_ITER #%d K%d invalid key: want %q, got %q", i, k, key, xkey) + } + value := key + vtail + xvalue := iter.Value() + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_ITER #%d K%d invalid value: want %q, got %q", i, k, value, xvalue) + } + } + }(i, snap) + } + + wg.Wait() +} + +func TestDb_UkeyShouldntHopAcrossTable(t *testing.T) { + h := newDbHarnessWopt(t, &opt.Options{ + WriteBuffer: 112 * opt.KiB, + CompactionTableSize: 90 * opt.KiB, + CompactionExpandLimitFactor: 1, + }) + defer h.close() + + const ( + nSnap = 190 + nKey = 140 + ) + + var ( + snaps [nSnap]*Snapshot + b = &Batch{} + ) + for i := range snaps { + vtail := fmt.Sprintf("VAL%030d", i) + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + b.Put([]byte(key), []byte(key+vtail)) + } + if err := h.db.Write(b, nil); err != nil { + t.Fatalf("WRITE #%d error: %v", i, err) + } + + snaps[i] = h.db.newSnapshot() + b.Reset() + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + b.Delete([]byte(key)) + } + if err := h.db.Write(b, nil); err != nil { + t.Fatalf("WRITE #%d error: %v", i, err) + } + } + + h.compactMem() + + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + + h.compactRangeAt(0, "", "") + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + h.compactRangeAt(1, "", "") + h.waitCompaction() + for level, tables := range h.db.s.stVersion.tables { + for _, table := range tables { + t.Logf("L%d@%d %q:%q", level, table.file.Num(), table.imin, table.imax) + } + } + runtime.GOMAXPROCS(runtime.NumCPU()) + + wg := &sync.WaitGroup{} + for i, snap := range snaps { + wg.Add(1) + + go func(i int, snap *Snapshot) { + defer wg.Done() + + vtail := fmt.Sprintf("VAL%030d", i) + for k := 0; k < nKey; k++ { + key := fmt.Sprintf("KEY%08d", k) + xvalue, err := snap.Get([]byte(key), nil) + if err != nil { + t.Fatalf("READER_GET #%d SEQ=%d K%d error: %v", i, snap.elem.seq, k, err) + } + value := key + vtail + if !bytes.Equal([]byte(value), xvalue) { + t.Fatalf("READER_GET #%d SEQ=%d K%d invalid value: want %q, got %q", i, snap.elem.seq, k, value, xvalue) + } + } + }(i, snap) + } + + wg.Wait() +} + +func TestDb_TableCompactionBuilder(t *testing.T) { + stor := newTestStorage(t) + defer stor.Close() + + const nSeq = 99 + + o := &opt.Options{ + 
WriteBuffer: 112 * opt.KiB, + CompactionTableSize: 43 * opt.KiB, + CompactionExpandLimitFactor: 1, + CompactionGPOverlapsFactor: 1, + BlockCache: opt.NoCache, + } + s, err := newSession(stor, o) + if err != nil { + t.Fatal(err) + } + if err := s.create(); err != nil { + t.Fatal(err) + } + defer s.close() + var ( + seq uint64 + targetSize = 5 * o.CompactionTableSize + value = bytes.Repeat([]byte{'0'}, 100) + ) + for i := 0; i < 2; i++ { + tw, err := s.tops.create() + if err != nil { + t.Fatal(err) + } + for k := 0; tw.tw.BytesLen() < targetSize; k++ { + key := []byte(fmt.Sprintf("%09d", k)) + seq += nSeq - 1 + for x := uint64(0); x < nSeq; x++ { + if err := tw.append(newIkey(key, seq-x, ktVal), value); err != nil { + t.Fatal(err) + } + } + } + tf, err := tw.finish() + if err != nil { + t.Fatal(err) + } + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} + rec.addTableFile(i, tf) + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + } + + // Build grandparent. + v := s.version() + c := newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...)) + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} + b := &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize/3 + 961, + } + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Fatal(err) + } + for _, t := range c.tables[0] { + rec.delTable(c.level, t.file.Num()) + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + // Build level-1. + v = s.version() + c = newCompaction(s, v, 0, append(tFiles{}, v.tables[0]...)) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} + b = &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize, + } + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Fatal(err) + } + for _, t := range c.tables[0] { + rec.delTable(c.level, t.file.Num()) + } + // Move grandparent to level-3 + for _, t := range v.tables[2] { + rec.delTable(2, t.file.Num()) + rec.addTableFile(3, t) + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + v = s.version() + for level, want := range []bool{false, true, false, true, false} { + got := len(v.tables[level]) > 0 + if want != got { + t.Fatalf("invalid level-%d tables len: want %v, got %v", level, want, got) + } + } + for i, f := range v.tables[1][:len(v.tables[1])-1] { + nf := v.tables[1][i+1] + if bytes.Equal(f.imax.ukey(), nf.imin.ukey()) { + t.Fatalf("KEY %q hop across table %d .. %d", f.imax.ukey(), f.file.Num(), nf.file.Num()) + } + } + v.release() + + // Compaction with transient error. 
+ v = s.version() + c = newCompaction(s, v, 1, append(tFiles{}, v.tables[1]...)) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} + b = &tableCompactionBuilder{ + s: s, + c: c, + rec: rec, + stat1: new(cStatsStaging), + minSeq: 0, + strict: true, + tableSize: o.CompactionTableSize, + } + stor.SetEmuErrOnce(storage.TypeTable, tsOpSync) + stor.SetEmuRandErr(storage.TypeTable, tsOpRead, tsOpReadAt, tsOpWrite) + stor.SetEmuRandErrProb(0xf0) + for { + if err := b.run(new(compactionTransactCounter)); err != nil { + t.Logf("(expected) b.run: %v", err) + } else { + break + } + } + if err := s.commit(rec); err != nil { + t.Fatal(err) + } + c.release() + + stor.SetEmuErrOnce(0, tsOpSync) + stor.SetEmuRandErr(0, tsOpRead, tsOpReadAt, tsOpWrite) + + v = s.version() + if len(v.tables[1]) != len(v.tables[2]) { + t.Fatalf("invalid tables length, want %d, got %d", len(v.tables[1]), len(v.tables[2])) + } + for i, f0 := range v.tables[1] { + f1 := v.tables[2][i] + iter0 := s.tops.newIterator(f0, nil, nil) + iter1 := s.tops.newIterator(f1, nil, nil) + for j := 0; true; j++ { + next0 := iter0.Next() + next1 := iter1.Next() + if next0 != next1 { + t.Fatalf("#%d.%d invalid eoi: want %v, got %v", i, j, next0, next1) + } + key0 := iter0.Key() + key1 := iter1.Key() + if !bytes.Equal(key0, key1) { + t.Fatalf("#%d.%d invalid key: want %q, got %q", i, j, key0, key1) + } + if next0 == false { + break + } + } + iter0.Release() + iter1.Release() + } + v.release() } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go index 4f6b792d..a8a2bdf7 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_util.go @@ -7,8 +7,7 @@ package leveldb import ( - "errors" - + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" "github.com/syndtr/goleveldb/leveldb/storage" @@ -38,7 +37,9 @@ func (db *DB) logf(format string, v ...interface{}) { db.s.logf(format, v...) } // Check and clean files. func (db *DB) checkAndCleanFiles() error { - v := db.s.version_NB() + v := db.s.version() + defer v.release() + tablesMap := make(map[uint64]bool) for _, tables := range v.tables { for _, t := range tables { @@ -78,12 +79,14 @@ func (db *DB) checkAndCleanFiles() error { } if nTables != len(tablesMap) { + var missing []*storage.FileInfo for num, present := range tablesMap { if !present { + missing = append(missing, &storage.FileInfo{Type: storage.TypeTable, Num: num}) db.logf("db@janitor table missing @%d", num) } } - return ErrCorrupted{Type: MissingFiles, Err: errors.New("leveldb: table files missing")} + return errors.NewErrCorrupted(nil, &errors.ErrMissingFiles{Files: missing}) } db.logf("db@janitor F·%d G·%d", len(files), len(rem)) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go index 939d9c3b..e94b7b60 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/db_write.go @@ -59,7 +59,7 @@ func (db *DB) rotateMem(n int) (mem *memDB, err error) { } // Schedule memdb compaction. 
- db.compTrigger(db.mcompTriggerC) + db.compSendTrigger(db.mcompCmdC) return } @@ -77,12 +77,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) { }() nn = mem.mdb.Free() switch { - case v.tLen(0) >= kL0_SlowdownWritesTrigger && !delayed: + case v.tLen(0) >= db.s.o.GetWriteL0SlowdownTrigger() && !delayed: delayed = true time.Sleep(time.Millisecond) case nn >= n: return false - case v.tLen(0) >= kL0_StopWritesTrigger: + case v.tLen(0) >= db.s.o.GetWriteL0PauseTrigger(): delayed = true err = db.compSendIdle(db.tcompCmdC) if err != nil { @@ -109,7 +109,12 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) { for flush() { } if delayed { - db.logf("db@write delayed T·%v", time.Since(start)) + db.writeDelay += time.Since(start) + db.writeDelayN++ + } else if db.writeDelayN > 0 { + db.writeDelay = 0 + db.writeDelayN = 0 + db.logf("db@write was delayed N·%d T·%v", db.writeDelayN, db.writeDelay) } return } @@ -120,7 +125,7 @@ func (db *DB) flush(n int) (mem *memDB, nn int, err error) { // It is safe to modify the contents of the arguments after Write returns. func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) { err = db.ok() - if err != nil || b == nil || b.len() == 0 { + if err != nil || b == nil || b.Len() == 0 { return } @@ -133,6 +138,8 @@ func (db *DB) Write(b *Batch, wo *opt.WriteOptions) (err error) { return <-db.writeAckC } case db.writeLockC <- struct{}{}: + case err = <-db.compPerErrC: + return case _, _ = <-db.closeC: return ErrClosed } @@ -188,35 +195,43 @@ drain: if b.size() >= (128 << 10) { // Push the write batch to the journal writer select { + case db.journalC <- b: + // Write into memdb + if berr := b.memReplay(mem.mdb); berr != nil { + panic(berr) + } + case err = <-db.compPerErrC: + return case _, _ = <-db.closeC: err = ErrClosed return - case db.journalC <- b: - // Write into memdb - b.memReplay(mem.mdb) } // Wait for journal writer select { - case _, _ = <-db.closeC: - err = ErrClosed - return case err = <-db.journalAckC: if err != nil { // Revert memdb if error detected - b.revertMemReplay(mem.mdb) + if berr := b.revertMemReplay(mem.mdb); berr != nil { + panic(berr) + } return } + case _, _ = <-db.closeC: + err = ErrClosed + return } } else { err = db.writeJournal(b) if err != nil { return } - b.memReplay(mem.mdb) + if berr := b.memReplay(mem.mdb); berr != nil { + panic(berr) + } } // Set last seq number. - db.addSeq(uint64(b.len())) + db.addSeq(uint64(b.Len())) if b.size() >= memFree { db.rotateMem(0) @@ -268,6 +283,8 @@ func (db *DB) CompactRange(r util.Range) error { // Lock writer. 
select { case db.writeLockC <- struct{}{}: + case err := <-db.compPerErrC: + return err case _, _ = <-db.closeC: return ErrClosed } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go similarity index 50% rename from Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go rename to Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go index 8066bd9a..29d0d2f2 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/error.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors.go @@ -7,32 +7,12 @@ package leveldb import ( - "errors" - - "github.com/syndtr/goleveldb/leveldb/util" + "github.com/syndtr/goleveldb/leveldb/errors" ) var ( - ErrNotFound = util.ErrNotFound + ErrNotFound = errors.ErrNotFound ErrSnapshotReleased = errors.New("leveldb: snapshot released") ErrIterReleased = errors.New("leveldb: iterator released") ErrClosed = errors.New("leveldb: closed") ) - -type CorruptionType int - -const ( - CorruptedManifest CorruptionType = iota - MissingFiles -) - -// ErrCorrupted is the type that wraps errors that indicate corruption in -// the database. -type ErrCorrupted struct { - Type CorruptionType - Err error -} - -func (e ErrCorrupted) Error() string { - return e.Err.Error() -} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go new file mode 100644 index 00000000..84b5d6b7 --- /dev/null +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/errors/errors.go @@ -0,0 +1,76 @@ +// Copyright (c) 2014, Suryandaru Triandana +// All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Package errors provides common error types used throughout leveldb. +package errors + +import ( + "errors" + "fmt" + + "github.com/syndtr/goleveldb/leveldb/storage" + "github.com/syndtr/goleveldb/leveldb/util" +) + +var ( + ErrNotFound = New("leveldb: not found") + ErrReleased = util.ErrReleased + ErrHasReleaser = util.ErrHasReleaser +) + +// New returns an error that formats as the given text. +func New(text string) error { + return errors.New(text) +} + +// ErrCorrupted is the type that wraps errors that indicate corruption in +// the database. +type ErrCorrupted struct { + File *storage.FileInfo + Err error +} + +func (e *ErrCorrupted) Error() string { + if e.File != nil { + return fmt.Sprintf("%v [file=%v]", e.Err, e.File) + } else { + return e.Err.Error() + } +} + +// NewErrCorrupted creates new ErrCorrupted error. +func NewErrCorrupted(f storage.File, err error) error { + return &ErrCorrupted{storage.NewFileInfo(f), err} +} + +// IsCorrupted returns a boolean indicating whether the error is indicating +// a corruption. +func IsCorrupted(err error) bool { + switch err.(type) { + case *ErrCorrupted: + return true + } + return false +} + +// ErrMissingFiles is the type that indicating a corruption due to missing +// files. +type ErrMissingFiles struct { + Files []*storage.FileInfo +} + +func (e *ErrMissingFiles) Error() string { return "file missing" } + +// SetFile sets 'file info' of the given error with the given file. +// Currently only ErrCorrupted is supported, otherwise will do nothing. 
+func SetFile(err error, f storage.File) error { + switch x := err.(type) { + case *ErrCorrupted: + x.File = storage.NewFileInfo(f) + return x + } + return err +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go index 8353b357..939adbb9 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter.go @@ -7,6 +7,7 @@ package iterator import ( + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -22,9 +23,8 @@ type IteratorIndexer interface { type indexedIterator struct { util.BasicReleaser - index IteratorIndexer - strict bool - strictGet bool + index IteratorIndexer + strict bool data Iterator err error @@ -37,11 +37,6 @@ func (i *indexedIterator) setData() { i.data.Release() } i.data = i.index.Get() - if i.strictGet { - if err := i.data.Error(); err != nil { - i.err = err - } - } } func (i *indexedIterator) clearData() { @@ -61,13 +56,11 @@ func (i *indexedIterator) indexErr() { } func (i *indexedIterator) dataErr() bool { - if i.errf != nil { - if err := i.data.Error(); err != nil { + if err := i.data.Error(); err != nil { + if i.errf != nil { i.errf(err) } - } - if i.strict { - if err := i.data.Error(); err != nil { + if i.strict || !errors.IsCorrupted(err) { i.err = err return true } @@ -236,16 +229,14 @@ func (i *indexedIterator) SetErrorCallback(f func(err error)) { i.errf = f } -// NewIndexedIterator returns an indexed iterator. An index is iterator -// that returns another iterator, a data iterator. A data iterator is the +// NewIndexedIterator returns an 'indexed iterator'. An index is iterator +// that returns another iterator, a 'data iterator'. A 'data iterator' is the // iterator that contains actual key/value pairs. // -// If strict is true then error yield by data iterator will halt the indexed -// iterator, on contrary if strict is false then the indexed iterator will -// ignore those error and move on to the next index. If strictGet is true and -// index.Get() yield an 'error iterator' then the indexed iterator will be halted. -// An 'error iterator' is iterator which its Error() method always return non-nil -// even before any 'seeks method' is called. -func NewIndexedIterator(index IteratorIndexer, strict, strictGet bool) Iterator { - return &indexedIterator{index: index, strict: strict, strictGet: strictGet} +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'indexed iterator', otherwise the iterator will +// continue to the next 'data iterator'. Corruption on 'index iterator' will not be +// ignored and will halt the iterator. +func NewIndexedIterator(index IteratorIndexer, strict bool) Iterator { + return &indexedIterator{index: index, strict: strict} } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go index 6a89b383..72a79789 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/indexed_iter_test.go @@ -65,7 +65,7 @@ var _ = testutil.Defer(func() { // Test the iterator. 
t := testutil.IteratorTesting{ KeyValue: kv.Clone(), - Iter: NewIndexedIterator(NewArrayIndexer(index), true, true), + Iter: NewIndexedIterator(NewArrayIndexer(index), true), } testutil.DoIteratorTesting(&t) done <- true diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go index 8370e25e..1a7e29df 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/iterator/merged_iter.go @@ -8,6 +8,7 @@ package iterator import ( "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -42,13 +43,11 @@ func assertKey(key []byte) []byte { } func (i *mergedIterator) iterErr(iter Iterator) bool { - if i.errf != nil { - if err := iter.Error(); err != nil { + if err := iter.Error(); err != nil { + if i.errf != nil { i.errf(err) } - } - if i.strict { - if err := iter.Error(); err != nil { + if i.strict || !errors.IsCorrupted(err) { i.err = err return true } @@ -292,9 +291,9 @@ func (i *mergedIterator) SetErrorCallback(f func(err error)) { // keys: if iters[i] contains a key k then iters[j] will not contain that key k. // None of the iters may be nil. // -// If strict is true then error yield by any iterators will halt the merged -// iterator, on contrary if strict is false then the merged iterator will -// ignore those error and move on to the next iterator. +// If strict is true the any 'corruption errors' (i.e errors.IsCorrupted(err) == true) +// won't be ignored and will halt 'merged iterator', otherwise the iterator will +// continue to the next 'input iterator'. 
func NewMergedIterator(iters []Iterator, cmp comparer.Comparer, strict bool) Iterator { return &mergedIterator{ iters: iters, diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go index e9a19ebc..6519ec66 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/journal/journal.go @@ -79,10 +79,10 @@ package journal import ( "encoding/binary" - "errors" "fmt" "io" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -109,7 +109,7 @@ type ErrCorrupted struct { Reason string } -func (e ErrCorrupted) Error() string { +func (e *ErrCorrupted) Error() string { return fmt.Sprintf("leveldb/journal: block/chunk corrupted: %s (%d bytes)", e.Reason, e.Size) } @@ -162,10 +162,10 @@ var errSkip = errors.New("leveldb/journal: skipped") func (r *Reader) corrupt(n int, reason string, skip bool) error { if r.dropper != nil { - r.dropper.Drop(ErrCorrupted{n, reason}) + r.dropper.Drop(&ErrCorrupted{n, reason}) } if r.strict && !skip { - r.err = ErrCorrupted{n, reason} + r.err = errors.NewErrCorrupted(nil, &ErrCorrupted{n, reason}) return r.err } return errSkip diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go index b9acf932..98b21f44 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key.go @@ -9,15 +9,30 @@ package leveldb import ( "encoding/binary" "fmt" + + "github.com/syndtr/goleveldb/leveldb/errors" ) -type vType int +type ErrIkeyCorrupted struct { + Ikey []byte + Reason string +} -func (t vType) String() string { - switch t { - case tDel: +func (e *ErrIkeyCorrupted) Error() string { + return fmt.Sprintf("leveldb: iKey %q corrupted: %s", e.Ikey, e.Reason) +} + +func newErrIkeyCorrupted(ikey []byte, reason string) error { + return errors.NewErrCorrupted(nil, &ErrIkeyCorrupted{append([]byte{}, ikey...), reason}) +} + +type kType int + +func (kt kType) String() string { + switch kt { + case ktDel: return "d" - case tVal: + case ktVal: return "v" } return "x" @@ -26,16 +41,16 @@ func (t vType) String() string { // Value types encoded as the last component of internal keys. // Don't modify; this value are saved to disk. const ( - tDel vType = iota - tVal + ktDel kType = iota + ktVal ) -// tSeek defines the vType that should be passed when constructing an +// ktSeek defines the kType that should be passed when constructing an // internal key for seeking to a particular sequence number (since we // sort sequence numbers in decreasing order and the value type is // embedded as the low 8 bits in the sequence number in internal keys, // we need to use the highest-numbered ValueType, not the lowest). -const tSeek = tVal +const ktSeek = ktVal const ( // Maximum value possible for sequence number; the 8-bits are @@ -43,7 +58,7 @@ const ( // 64-bit integer. kMaxSeq uint64 = (uint64(1) << 56) - 1 // Maximum value possible for packed sequence number and type. - kMaxNum uint64 = (kMaxSeq << 8) | uint64(tSeek) + kMaxNum uint64 = (kMaxSeq << 8) | uint64(ktSeek) ) // Maximum number encoded in bytes. 
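The key.go hunk above only renames vType to kType and makes parseIkey return an error instead of a bool; the on-disk encoding itself is unchanged: an internal key is the user key followed by eight little-endian bytes packing (seq<<8)|type, with the sequence number capped at kMaxSeq = 2^56-1 and the key type (ktDel=0, ktVal=1) in the low byte. A standalone sketch of that layout follows, using my own encodeIkey/decodeIkey names rather than the unexported helpers from the patch; it is illustrative only and not part of the diff:

	// Sketch of the internal-key layout: ikey = ukey || LE uint64((seq<<8)|kt),
	// i.e. the trailing 8 bytes carry the sequence number in the upper 56 bits
	// and the key type in the low 8 bits.
	package main

	import (
		"encoding/binary"
		"fmt"
	)

	const maxSeq = (uint64(1) << 56) - 1

	func encodeIkey(ukey []byte, seq uint64, kt uint8) []byte {
		if seq > maxSeq || kt > 1 {
			panic("invalid sequence number or key type")
		}
		ik := make([]byte, len(ukey)+8)
		copy(ik, ukey)
		binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt))
		return ik
	}

	func decodeIkey(ik []byte) (ukey []byte, seq uint64, kt uint8) {
		num := binary.LittleEndian.Uint64(ik[len(ik)-8:])
		return ik[:len(ik)-8], num >> 8, uint8(num & 0xff)
	}

	func main() {
		ik := encodeIkey([]byte("foo"), 100, 1)
		ukey, seq, kt := decodeIkey(ik)
		fmt.Printf("%q seq=%d type=%d\n", ukey, seq, kt) // "foo" seq=100 type=1
	}

Because the type sits in the low byte of the packed number, sorting packed numbers in decreasing order places newer sequence numbers (and ktVal over ktDel) first, which is why ktSeek is defined as ktVal in the constants above.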
@@ -55,85 +70,73 @@ func init() { type iKey []byte -func newIKey(ukey []byte, seq uint64, t vType) iKey { - if seq > kMaxSeq || t > tVal { - panic("invalid seq number or value type") +func newIkey(ukey []byte, seq uint64, kt kType) iKey { + if seq > kMaxSeq { + panic("leveldb: invalid sequence number") + } else if kt > ktVal { + panic("leveldb: invalid type") } - b := make(iKey, len(ukey)+8) - copy(b, ukey) - binary.LittleEndian.PutUint64(b[len(ukey):], (seq<<8)|uint64(t)) - return b + ik := make(iKey, len(ukey)+8) + copy(ik, ukey) + binary.LittleEndian.PutUint64(ik[len(ukey):], (seq<<8)|uint64(kt)) + return ik } -func parseIkey(p []byte) (ukey []byte, seq uint64, t vType, ok bool) { - if len(p) < 8 { - return +func parseIkey(ik []byte) (ukey []byte, seq uint64, kt kType, err error) { + if len(ik) < 8 { + return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid length") } - num := binary.LittleEndian.Uint64(p[len(p)-8:]) - seq, t = uint64(num>>8), vType(num&0xff) - if t > tVal { - return + num := binary.LittleEndian.Uint64(ik[len(ik)-8:]) + seq, kt = uint64(num>>8), kType(num&0xff) + if kt > ktVal { + return nil, 0, 0, newErrIkeyCorrupted(ik, "invalid type") } - ukey = p[:len(p)-8] - ok = true + ukey = ik[:len(ik)-8] return } -func validIkey(p []byte) bool { - _, _, _, ok := parseIkey(p) - return ok +func validIkey(ik []byte) bool { + _, _, _, err := parseIkey(ik) + return err == nil } -func (p iKey) assert() { - if p == nil { - panic("nil iKey") +func (ik iKey) assert() { + if ik == nil { + panic("leveldb: nil iKey") } - if len(p) < 8 { - panic(fmt.Sprintf("invalid iKey %q, len=%d", []byte(p), len(p))) + if len(ik) < 8 { + panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid length", ik, len(ik))) } } -func (p iKey) ok() bool { - if len(p) < 8 { - return false - } - _, _, ok := p.parseNum() - return ok +func (ik iKey) ukey() []byte { + ik.assert() + return ik[:len(ik)-8] } -func (p iKey) ukey() []byte { - p.assert() - return p[:len(p)-8] +func (ik iKey) num() uint64 { + ik.assert() + return binary.LittleEndian.Uint64(ik[len(ik)-8:]) } -func (p iKey) num() uint64 { - p.assert() - return binary.LittleEndian.Uint64(p[len(p)-8:]) -} - -func (p iKey) parseNum() (seq uint64, t vType, ok bool) { - if p == nil { - panic("nil iKey") +func (ik iKey) parseNum() (seq uint64, kt kType) { + num := ik.num() + seq, kt = uint64(num>>8), kType(num&0xff) + if kt > ktVal { + panic(fmt.Sprintf("leveldb: iKey %q, len=%d: invalid type %#x", ik, len(ik), kt)) } - if len(p) < 8 { - return - } - num := p.num() - seq, t = uint64(num>>8), vType(num&0xff) - if t > tVal { - return 0, 0, false - } - ok = true return } -func (p iKey) String() string { - if len(p) == 0 { +func (ik iKey) String() string { + if ik == nil { return "" } - if seq, t, ok := p.parseNum(); ok { - return fmt.Sprintf("%s,%s%d", shorten(string(p.ukey())), t, seq) + + if ukey, seq, kt, err := parseIkey(ik); err == nil { + return fmt.Sprintf("%s,%s%d", shorten(string(ukey)), kt, seq) + } else { + return "" } - return "" } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go index e307cfc1..30eadf78 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/key_test.go @@ -15,8 +15,8 @@ import ( var defaultIComparer = &iComparer{comparer.DefaultComparer} -func ikey(key string, seq uint64, t vType) iKey { - return newIKey([]byte(key), uint64(seq), t) +func ikey(key 
string, seq uint64, kt kType) iKey { + return newIkey([]byte(key), uint64(seq), kt) } func shortSep(a, b []byte) []byte { @@ -37,27 +37,37 @@ func shortSuccessor(b []byte) []byte { return dst } -func testSingleKey(t *testing.T, key string, seq uint64, vt vType) { - ik := ikey(key, seq, vt) +func testSingleKey(t *testing.T, key string, seq uint64, kt kType) { + ik := ikey(key, seq, kt) if !bytes.Equal(ik.ukey(), []byte(key)) { t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key) } - if rseq, rt, ok := ik.parseNum(); ok { + rseq, rt := ik.parseNum() + if rseq != seq { + t.Errorf("seq number does not equal, got %v, want %v", rseq, seq) + } + if rt != kt { + t.Errorf("type does not equal, got %v, want %v", rt, kt) + } + + if rukey, rseq, rt, kerr := parseIkey(ik); kerr == nil { + if !bytes.Equal(rukey, []byte(key)) { + t.Errorf("user key does not equal, got %v, want %v", string(ik.ukey()), key) + } if rseq != seq { t.Errorf("seq number does not equal, got %v, want %v", rseq, seq) } - - if rt != vt { - t.Errorf("type does not equal, got %v, want %v", rt, vt) + if rt != kt { + t.Errorf("type does not equal, got %v, want %v", rt, kt) } } else { - t.Error("cannot parse seq and type") + t.Errorf("key error: %v", kerr) } } -func TestIKey_EncodeDecode(t *testing.T) { +func TestIkey_EncodeDecode(t *testing.T) { keys := []string{"", "k", "hello", "longggggggggggggggggggggg"} seqs := []uint64{ 1, 2, 3, @@ -67,8 +77,8 @@ func TestIKey_EncodeDecode(t *testing.T) { } for _, key := range keys { for _, seq := range seqs { - testSingleKey(t, key, seq, tVal) - testSingleKey(t, "hello", 1, tDel) + testSingleKey(t, key, seq, ktVal) + testSingleKey(t, "hello", 1, ktDel) } } } @@ -79,45 +89,45 @@ func assertBytes(t *testing.T, want, got []byte) { } } -func TestIKeyShortSeparator(t *testing.T) { +func TestIkeyShortSeparator(t *testing.T) { // When user keys are same - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 99, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 101, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 100, tVal))) - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foo", 100, tDel))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 99, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 101, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 100, ktVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foo", 100, ktDel))) // When user keys are misordered - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("bar", 99, tVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("bar", 99, ktVal))) // When user keys are different, but correctly ordered - assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), - shortSep(ikey("foo", 100, tVal), - ikey("hello", 200, tVal))) + assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek), + shortSep(ikey("foo", 100, ktVal), + ikey("hello", 200, ktVal))) // When start user key is prefix of limit user key - assertBytes(t, ikey("foo", 100, tVal), - shortSep(ikey("foo", 100, tVal), - ikey("foobar", 200, tVal))) + assertBytes(t, ikey("foo", 100, ktVal), + shortSep(ikey("foo", 100, ktVal), + ikey("foobar", 200, ktVal))) // When limit user key is 
prefix of start user key - assertBytes(t, ikey("foobar", 100, tVal), - shortSep(ikey("foobar", 100, tVal), - ikey("foo", 200, tVal))) + assertBytes(t, ikey("foobar", 100, ktVal), + shortSep(ikey("foobar", 100, ktVal), + ikey("foo", 200, ktVal))) } -func TestIKeyShortestSuccessor(t *testing.T) { - assertBytes(t, ikey("g", uint64(kMaxSeq), tSeek), - shortSuccessor(ikey("foo", 100, tVal))) - assertBytes(t, ikey("\xff\xff", 100, tVal), - shortSuccessor(ikey("\xff\xff", 100, tVal))) +func TestIkeyShortestSuccessor(t *testing.T) { + assertBytes(t, ikey("g", uint64(kMaxSeq), ktSeek), + shortSuccessor(ikey("foo", 100, ktVal))) + assertBytes(t, ikey("\xff\xff", 100, ktVal), + shortSuccessor(ikey("\xff\xff", 100, ktVal))) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go index 83ff7bc6..e5398873 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/memdb/memdb.go @@ -8,17 +8,17 @@ package memdb import ( - "errors" "math/rand" "sync" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/util" ) var ( - ErrNotFound = util.ErrNotFound + ErrNotFound = errors.ErrNotFound ErrIterReleased = errors.New("leveldb/memdb: iterator released") ) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go index 126e2a36..d928a213 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/opt/options.go @@ -11,6 +11,7 @@ import ( "github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/filter" + "math" ) const ( @@ -20,12 +21,24 @@ const ( ) const ( - DefaultBlockCacheSize = 8 * MiB - DefaultBlockRestartInterval = 16 - DefaultBlockSize = 4 * KiB - DefaultCompressionType = SnappyCompression - DefaultCachedOpenFiles = 500 - DefaultWriteBuffer = 4 * MiB + DefaultBlockCacheSize = 8 * MiB + DefaultBlockRestartInterval = 16 + DefaultBlockSize = 4 * KiB + DefaultCompactionExpandLimitFactor = 25 + DefaultCompactionGPOverlapsFactor = 10 + DefaultCompactionL0Trigger = 4 + DefaultCompactionSourceLimitFactor = 1 + DefaultCompactionTableSize = 2 * MiB + DefaultCompactionTableSizeMultiplier = 1.0 + DefaultCompactionTotalSize = 10 * MiB + DefaultCompactionTotalSizeMultiplier = 10.0 + DefaultCompressionType = SnappyCompression + DefaultCachedOpenFiles = 500 + DefaultMaxMemCompationLevel = 2 + DefaultNumLevel = 7 + DefaultWriteBuffer = 4 * MiB + DefaultWriteL0PauseTrigger = 12 + DefaultWriteL0SlowdownTrigger = 8 ) type noCache struct{} @@ -65,34 +78,47 @@ const ( nCompression ) -// Strict is the DB strict level. +// Strict is the DB 'strict level'. type Strict uint const ( // If present then a corrupted or invalid chunk or block in manifest - // journal will cause an error istead of being dropped. + // journal will cause an error instead of being dropped. + // This will prevent database with corrupted manifest to be opened. StrictManifest Strict = 1 << iota - // If present then a corrupted or invalid chunk or block in journal - // will cause an error istead of being dropped. 
- StrictJournal - // If present then journal chunk checksum will be verified. StrictJournalChecksum - // If present then an invalid key/value pair will cause an error - // instead of being skipped. - StrictIterator + // If present then a corrupted or invalid chunk or block in journal + // will cause an error instead of being dropped. + // This will prevent database with corrupted journal to be opened. + StrictJournal // If present then 'sorted table' block checksum will be verified. + // This has effect on both 'read operation' and compaction. StrictBlockChecksum + // If present then a corrupted 'sorted table' will fails compaction. + // The database will enter read-only mode. + StrictCompaction + + // If present then a corrupted 'sorted table' will halts 'read operation'. + StrictReader + + // If present then leveldb.Recover will drop corrupted 'sorted table'. + StrictRecovery + + // This only applicable for ReadOptions, if present then this ReadOptions + // 'strict level' will override global ones. + StrictOverride + // StrictAll enables all strict flags. - StrictAll = StrictManifest | StrictJournal | StrictJournalChecksum | StrictIterator | StrictBlockChecksum + StrictAll = StrictManifest | StrictJournalChecksum | StrictJournal | StrictBlockChecksum | StrictCompaction | StrictReader // DefaultStrict is the default strict flags. Specify any strict flags // will override default strict flags as whole (i.e. not OR'ed). - DefaultStrict = StrictJournalChecksum | StrictIterator | StrictBlockChecksum + DefaultStrict = StrictJournalChecksum | StrictBlockChecksum | StrictCompaction | StrictReader // NoStrict disables all strict flags. Override default strict flags. NoStrict = ^StrictAll @@ -132,6 +158,73 @@ type Options struct { // The default value is 500. CachedOpenFiles int + // CompactionExpandLimitFactor limits compaction size after expanded. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 25. + CompactionExpandLimitFactor int + + // CompactionGPOverlapsFactor limits overlaps in grandparent (Level + 2) that a + // single 'sorted table' generates. + // This will be multiplied by table size limit at grandparent level. + // + // The default value is 10. + CompactionGPOverlapsFactor int + + // CompactionL0Trigger defines number of 'sorted table' at level-0 that will + // trigger compaction. + // + // The default value is 4. + CompactionL0Trigger int + + // CompactionSourceLimitFactor limits compaction source size. This doesn't apply to + // level-0. + // This will be multiplied by table size limit at compaction target level. + // + // The default value is 1. + CompactionSourceLimitFactor int + + // CompactionTableSize limits size of 'sorted table' that compaction generates. + // The limits for each level will be calculated as: + // CompactionTableSize * (CompactionTableSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using CompactionTableSizeMultiplierPerLevel. + // + // The default value is 2MiB. + CompactionTableSize int + + // CompactionTableSizeMultiplier defines multiplier for CompactionTableSize. + // + // The default value is 1. + CompactionTableSizeMultiplier float64 + + // CompactionTableSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTableSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTableSizeMultiplierPerLevel []float64 + + // CompactionTotalSize limits total size of 'sorted table' for each level. 
+ // The limits for each level will be calculated as: + // CompactionTotalSize * (CompactionTotalSizeMultiplier ^ Level) + // The multiplier for each level can also fine-tuned using + // CompactionTotalSizeMultiplierPerLevel. + // + // The default value is 10MiB. + CompactionTotalSize int + + // CompactionTotalSizeMultiplier defines multiplier for CompactionTotalSize. + // + // The default value is 10. + CompactionTotalSizeMultiplier float64 + + // CompactionTotalSizeMultiplierPerLevel defines per-level multiplier for + // CompactionTotalSize. + // Use zero to skip a level. + // + // The default value is nil. + CompactionTotalSizeMultiplierPerLevel []float64 + // Comparer defines a total ordering over the space of []byte keys: a 'less // than' relationship. The same comparison algorithm must be used for reads // and writes over the lifetime of the DB. @@ -144,6 +237,11 @@ type Options struct { // The default value (DefaultCompression) uses snappy compression. Compression Compression + // DisableCompactionBackoff allows disable compaction retry backoff. + // + // The default value is false. + DisableCompactionBackoff bool + // ErrorIfExist defines whether an error should returned if the DB already // exist. // @@ -172,6 +270,19 @@ type Options struct { // The default value is nil. Filter filter.Filter + // MaxMemCompationLevel defines maximum level a newly compacted 'memdb' + // will be pushed into if doesn't creates overlap. This should less than + // NumLevel. Use -1 for level-0. + // + // The default is 2. + MaxMemCompationLevel int + + // NumLevel defines number of database level. The level shouldn't changed + // between opens, or the database will panic. + // + // The default is 7. + NumLevel int + // Strict defines the DB strict level. Strict Strict @@ -183,6 +294,18 @@ type Options struct { // // The default value is 4MiB. WriteBuffer int + + // WriteL0StopTrigger defines number of 'sorted table' at level-0 that will + // pause write. + // + // The default value is 12. + WriteL0PauseTrigger int + + // WriteL0SlowdownTrigger defines number of 'sorted table' at level-0 that + // will trigger write slowdown. + // + // The default value is 8. 
+ WriteL0SlowdownTrigger int } func (o *Options) GetAltFilters() []filter.Filter { @@ -222,6 +345,79 @@ func (o *Options) GetCachedOpenFiles() int { return o.CachedOpenFiles } +func (o *Options) GetCompactionExpandLimit(level int) int { + factor := DefaultCompactionExpandLimitFactor + if o != nil && o.CompactionExpandLimitFactor > 0 { + factor = o.CompactionExpandLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionGPOverlaps(level int) int { + factor := DefaultCompactionGPOverlapsFactor + if o != nil && o.CompactionGPOverlapsFactor > 0 { + factor = o.CompactionGPOverlapsFactor + } + return o.GetCompactionTableSize(level+2) * factor +} + +func (o *Options) GetCompactionL0Trigger() int { + if o == nil || o.CompactionL0Trigger == 0 { + return DefaultCompactionL0Trigger + } + return o.CompactionL0Trigger +} + +func (o *Options) GetCompactionSourceLimit(level int) int { + factor := DefaultCompactionSourceLimitFactor + if o != nil && o.CompactionSourceLimitFactor > 0 { + factor = o.CompactionSourceLimitFactor + } + return o.GetCompactionTableSize(level+1) * factor +} + +func (o *Options) GetCompactionTableSize(level int) int { + var ( + base = DefaultCompactionTableSize + mult float64 + ) + if o != nil { + if o.CompactionTableSize > 0 { + base = o.CompactionTableSize + } + if len(o.CompactionTableSizeMultiplierPerLevel) > level && o.CompactionTableSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTableSizeMultiplierPerLevel[level] + } else if o.CompactionTableSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTableSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTableSizeMultiplier, float64(level)) + } + return int(float64(base) * mult) +} + +func (o *Options) GetCompactionTotalSize(level int) int64 { + var ( + base = DefaultCompactionTotalSize + mult float64 + ) + if o != nil { + if o.CompactionTotalSize > 0 { + base = o.CompactionTotalSize + } + if len(o.CompactionTotalSizeMultiplierPerLevel) > level && o.CompactionTotalSizeMultiplierPerLevel[level] > 0 { + mult = o.CompactionTotalSizeMultiplierPerLevel[level] + } else if o.CompactionTotalSizeMultiplier > 0 { + mult = math.Pow(o.CompactionTotalSizeMultiplier, float64(level)) + } + } + if mult == 0 { + mult = math.Pow(DefaultCompactionTotalSizeMultiplier, float64(level)) + } + return int64(float64(base) * mult) +} + func (o *Options) GetComparer() comparer.Comparer { if o == nil || o.Comparer == nil { return comparer.DefaultComparer @@ -236,6 +432,13 @@ func (o *Options) GetCompression() Compression { return o.Compression } +func (o *Options) GetDisableCompactionBackoff() bool { + if o == nil { + return false + } + return o.DisableCompactionBackoff +} + func (o *Options) GetErrorIfExist() bool { if o == nil { return false @@ -257,6 +460,28 @@ func (o *Options) GetFilter() filter.Filter { return o.Filter } +func (o *Options) GetMaxMemCompationLevel() int { + level := DefaultMaxMemCompationLevel + if o != nil { + if o.MaxMemCompationLevel > 0 { + level = o.MaxMemCompationLevel + } else if o.MaxMemCompationLevel == -1 { + level = 0 + } + } + if level >= o.GetNumLevel() { + return o.GetNumLevel() - 1 + } + return level +} + +func (o *Options) GetNumLevel() int { + if o == nil || o.NumLevel <= 0 { + return DefaultNumLevel + } + return o.NumLevel +} + func (o *Options) GetStrict(strict Strict) bool { if o == nil || o.Strict == 0 { return DefaultStrict&strict != 0 @@ -271,6 +496,20 @@ func (o *Options) GetWriteBuffer() int { return o.WriteBuffer } 
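The compaction sizing getters above turn the factors documented in Options into concrete per-level limits: GetCompactionTableSize(level) is CompactionTableSize scaled by the table-size multiplier for that level (or the per-level override), and GetCompactionTotalSize(level) does the same for the level's total-size budget. A minimal usage sketch, assuming this patched github.com/syndtr/goleveldb/leveldb/opt package is importable; the values are just the documented defaults written out explicitly, and the snippet is illustrative only, not part of the patch:

	package main

	import (
		"fmt"

		"github.com/syndtr/goleveldb/leveldb/opt"
	)

	func main() {
		// Defaults made explicit: 2 MiB tables with a flat multiplier,
		// 10 MiB total budget growing 10x per level.
		o := &opt.Options{
			CompactionTableSize:           2 * opt.MiB,
			CompactionTotalSize:           10 * opt.MiB,
			CompactionTotalSizeMultiplier: 10.0,
		}
		for level := 0; level < o.GetNumLevel(); level++ {
			fmt.Printf("L%d: table <= %d bytes, total <= %d bytes\n",
				level, o.GetCompactionTableSize(level), o.GetCompactionTotalSize(level))
		}
	}

With these settings the per-table limit stays at 2 MiB on every level while the total budget grows tenfold per level, matching the usual LevelDB shape of small upper levels and progressively larger lower ones.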
+func (o *Options) GetWriteL0PauseTrigger() int { + if o == nil || o.WriteL0PauseTrigger == 0 { + return DefaultWriteL0PauseTrigger + } + return o.WriteL0PauseTrigger +} + +func (o *Options) GetWriteL0SlowdownTrigger() int { + if o == nil || o.WriteL0SlowdownTrigger == 0 { + return DefaultWriteL0SlowdownTrigger + } + return o.WriteL0SlowdownTrigger +} + // ReadOptions holds the optional parameters for 'read operation'. The // 'read operation' includes Get, Find and NewIterator. type ReadOptions struct { @@ -281,8 +520,8 @@ type ReadOptions struct { // The default value is false. DontFillCache bool - // Strict overrides global DB strict level. Only StrictIterator and - // StrictBlockChecksum that does have effects here. + // Strict will be OR'ed with global DB 'strict level' unless StrictOverride + // is present. Currently only StrictReader that has effect here. Strict Strict } @@ -324,3 +563,11 @@ func (wo *WriteOptions) GetSync() bool { } return wo.Sync } + +func GetStrict(o *Options, ro *ReadOptions, strict Strict) bool { + if ro.GetStrict(StrictOverride) { + return ro.GetStrict(strict) + } else { + return o.GetStrict(strict) || ro.GetStrict(strict) + } +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go index fc6a9696..a50257b9 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/options.go @@ -12,30 +12,86 @@ import ( "github.com/syndtr/goleveldb/leveldb/opt" ) -func (s *session) setOptions(o *opt.Options) { - s.o = &opt.Options{} +func dupOptions(o *opt.Options) *opt.Options { + newo := &opt.Options{} if o != nil { - *s.o = *o + *newo = *o } + return newo +} + +func (s *session) setOptions(o *opt.Options) { + no := dupOptions(o) // Alternative filters. if filters := o.GetAltFilters(); len(filters) > 0 { - s.o.AltFilters = make([]filter.Filter, len(filters)) + no.AltFilters = make([]filter.Filter, len(filters)) for i, filter := range filters { - s.o.AltFilters[i] = &iFilter{filter} + no.AltFilters[i] = &iFilter{filter} } } // Block cache. switch o.GetBlockCache() { case nil: - s.o.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize) + no.BlockCache = cache.NewLRUCache(opt.DefaultBlockCacheSize) case opt.NoCache: - s.o.BlockCache = nil + no.BlockCache = nil } // Comparer. s.icmp = &iComparer{o.GetComparer()} - s.o.Comparer = s.icmp + no.Comparer = s.icmp // Filter. 
if filter := o.GetFilter(); filter != nil { - s.o.Filter = &iFilter{filter} + no.Filter = &iFilter{filter} + } + + s.o = &cachedOptions{Options: no} + s.o.cache() +} + +type cachedOptions struct { + *opt.Options + + compactionExpandLimit []int + compactionGPOverlaps []int + compactionSourceLimit []int + compactionTableSize []int + compactionTotalSize []int64 +} + +func (co *cachedOptions) cache() { + numLevel := co.Options.GetNumLevel() + + co.compactionExpandLimit = make([]int, numLevel) + co.compactionGPOverlaps = make([]int, numLevel) + co.compactionSourceLimit = make([]int, numLevel) + co.compactionTableSize = make([]int, numLevel) + co.compactionTotalSize = make([]int64, numLevel) + + for level := 0; level < numLevel; level++ { + co.compactionExpandLimit[level] = co.Options.GetCompactionExpandLimit(level) + co.compactionGPOverlaps[level] = co.Options.GetCompactionGPOverlaps(level) + co.compactionSourceLimit[level] = co.Options.GetCompactionSourceLimit(level) + co.compactionTableSize[level] = co.Options.GetCompactionTableSize(level) + co.compactionTotalSize[level] = co.Options.GetCompactionTotalSize(level) } } + +func (co *cachedOptions) GetCompactionExpandLimit(level int) int { + return co.compactionExpandLimit[level] +} + +func (co *cachedOptions) GetCompactionGPOverlaps(level int) int { + return co.compactionGPOverlaps[level] +} + +func (co *cachedOptions) GetCompactionSourceLimit(level int) int { + return co.compactionSourceLimit[level] +} + +func (co *cachedOptions) GetCompactionTableSize(level int) int { + return co.compactionTableSize[level] +} + +func (co *cachedOptions) GetCompactionTotalSize(level int) int64 { + return co.compactionTotalSize[level] +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go index 7fc08e7d..5b251efb 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session.go @@ -7,12 +7,13 @@ package leveldb import ( - "errors" + "fmt" "io" "os" "sync" "sync/atomic" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/journal" "github.com/syndtr/goleveldb/leveldb/opt" @@ -20,18 +21,31 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) +type ErrManifestCorrupted struct { + Field string + Reason string +} + +func (e *ErrManifestCorrupted) Error() string { + return fmt.Sprintf("leveldb: manifest corrupted (field '%s'): %s", e.Field, e.Reason) +} + +func newErrManifestCorrupted(f storage.File, field, reason string) error { + return errors.NewErrCorrupted(f, &ErrManifestCorrupted{field, reason}) +} + // session represent a persistent database session. type session struct { // Need 64-bit alignment. 
- stFileNum uint64 // current unused file number + stNextFileNum uint64 // current unused file number stJournalNum uint64 // current journal file number; need external synchronization stPrevJournalNum uint64 // prev journal file number; no longer used; for compatibility with older version of leveldb - stSeq uint64 // last mem compacted seq; need external synchronization + stSeqNum uint64 // last mem compacted seq; need external synchronization stTempFileNum uint64 stor storage.Storage storLock util.Releaser - o *opt.Options + o *cachedOptions icmp *iComparer tops *tOps @@ -39,9 +53,9 @@ type session struct { manifestWriter storage.Writer manifestFile storage.File - stCptrs [kNumLevels]iKey // compact pointers; need external synchronization - stVersion *version // current version - vmu sync.Mutex + stCompPtrs []iKey // compaction pointers; need external synchronization + stVersion *version // current version + vmu sync.Mutex } // Creates new initialized session instance. @@ -54,13 +68,14 @@ func newSession(stor storage.Storage, o *opt.Options) (s *session, err error) { return } s = &session{ - stor: stor, - storLock: storLock, + stor: stor, + storLock: storLock, + stCompPtrs: make([]iKey, o.GetNumLevel()), } s.setOptions(o) s.tops = newTableOps(s, s.o.GetCachedOpenFiles()) - s.setVersion(&version{s: s}) - s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock D·DeletedEntry L·Level Q·SeqNum T·TimeElapsed") + s.setVersion(newVersion(s)) + s.log("log@legend F·NumFile S·FileSize N·Entry C·BadEntry B·BadBlock Ke·KeyError D·DroppedEntry L·Level Q·SeqNum T·TimeElapsed") return } @@ -100,26 +115,26 @@ func (s *session) recover() (err error) { // Don't return os.ErrNotExist if the underlying storage contains // other files that belong to LevelDB. So the DB won't get trashed. 
if files, _ := s.stor.GetFiles(storage.TypeAll); len(files) > 0 { - err = ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest file missing")} + err = &errors.ErrCorrupted{File: &storage.FileInfo{Type: storage.TypeManifest}, Err: &errors.ErrMissingFiles{}} } } }() - file, err := s.stor.GetManifest() + m, err := s.stor.GetManifest() if err != nil { return } - reader, err := file.Open() + reader, err := m.Open() if err != nil { return } defer reader.Close() strict := s.o.GetStrict(opt.StrictManifest) - jr := journal.NewReader(reader, dropper{s, file}, strict, true) + jr := journal.NewReader(reader, dropper{s, m}, strict, true) - staging := s.version_NB().newStaging() - rec := &sessionRecord{} + staging := s.stVersion.newStaging() + rec := &sessionRecord{numLevel: s.o.GetNumLevel()} for { var r io.Reader r, err = jr.Next() @@ -128,51 +143,57 @@ func (s *session) recover() (err error) { err = nil break } - return + return errors.SetFile(err, m) } err = rec.decode(r) if err == nil { // save compact pointers - for _, r := range rec.compactionPointers { - s.stCptrs[r.level] = iKey(r.ikey) + for _, r := range rec.compPtrs { + s.stCompPtrs[r.level] = iKey(r.ikey) } // commit record to version staging staging.commit(rec) - } else if strict { - return ErrCorrupted{Type: CorruptedManifest, Err: err} } else { - s.logf("manifest error: %v (skipped)", err) + err = errors.SetFile(err, m) + if strict || !errors.IsCorrupted(err) { + return + } else { + s.logf("manifest error: %v (skipped)", errors.SetFile(err, m)) + } } - rec.resetCompactionPointers() + rec.resetCompPtrs() rec.resetAddedTables() rec.resetDeletedTables() } switch { case !rec.has(recComparer): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing comparer name")} + return newErrManifestCorrupted(m, "comparer", "missing") case rec.comparer != s.icmp.uName(): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: comparer mismatch, " + "want '" + s.icmp.uName() + "', " + "got '" + rec.comparer + "'")} - case !rec.has(recNextNum): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing next file number")} + return newErrManifestCorrupted(m, "comparer", fmt.Sprintf("mismatch: want '%s', got '%s'", s.icmp.uName(), rec.comparer)) + case !rec.has(recNextFileNum): + return newErrManifestCorrupted(m, "next-file-num", "missing") case !rec.has(recJournalNum): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing journal file number")} - case !rec.has(recSeq): - return ErrCorrupted{Type: CorruptedManifest, Err: errors.New("leveldb: manifest missing seq number")} + return newErrManifestCorrupted(m, "journal-file-num", "missing") + case !rec.has(recSeqNum): + return newErrManifestCorrupted(m, "seq-num", "missing") } - s.manifestFile = file + s.manifestFile = m s.setVersion(staging.finish()) - s.setFileNum(rec.nextNum) + s.setNextFileNum(rec.nextFileNum) s.recordCommited(rec) return nil } // Commit session; need external synchronization. func (s *session) commit(r *sessionRecord) (err error) { + v := s.version() + defer v.release() + // spawn new version based on current version - nv := s.version_NB().spawn(r) + nv := v.spawn(r) if s.manifest == nil { // manifest journal writer not yet created, create one @@ -191,13 +212,13 @@ func (s *session) commit(r *sessionRecord) (err error) { // Pick a compaction based on current state; need external synchronization. 
func (s *session) pickCompaction() *compaction { - v := s.version_NB() + v := s.version() var level int var t0 tFiles if v.cScore >= 1 { level = v.cLevel - cptr := s.stCptrs[level] + cptr := s.stCompPtrs[level] tables := v.tables[level] for _, t := range tables { if cptr == nil || s.icmp.Compare(t.imax, cptr) > 0 { @@ -214,27 +235,21 @@ func (s *session) pickCompaction() *compaction { level = ts.level t0 = append(t0, ts.table) } else { + v.release() return nil } } - c := &compaction{s: s, v: v, level: level} - if level == 0 { - imin, imax := t0.getRange(s.icmp) - t0 = v.tables[0].getOverlaps(t0[:0], s.icmp, imin.ukey(), imax.ukey(), true) - } - - c.tables[0] = t0 - c.expand() - return c + return newCompaction(s, v, level, t0) } // Create compaction from given level and range; need external synchronization. func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { - v := s.version_NB() + v := s.version() t0 := v.tables[level].getOverlaps(nil, s.icmp, umin, umax, level == 0) if len(t0) == 0 { + v.release() return nil } @@ -243,7 +258,7 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { // and we must not pick one file and drop another older file if the // two files overlap. if level > 0 { - limit := uint64(kMaxTableSize) + limit := uint64(v.s.o.GetCompactionSourceLimit(level)) total := uint64(0) for i, t := range t0 { total += t.size @@ -255,9 +270,20 @@ func (s *session) getCompactionRange(level int, umin, umax []byte) *compaction { } } - c := &compaction{s: s, v: v, level: level} - c.tables[0] = t0 + return newCompaction(s, v, level, t0) +} + +func newCompaction(s *session, v *version, level int, t0 tFiles) *compaction { + c := &compaction{ + s: s, + v: v, + level: level, + tables: [2]tFiles{t0, nil}, + maxGPOverlaps: uint64(s.o.GetCompactionGPOverlaps(level)), + tPtrs: make([]int, s.o.GetNumLevel()), + } c.expand() + c.save() return c } @@ -266,25 +292,57 @@ type compaction struct { s *session v *version - level int - tables [2]tFiles + level int + tables [2]tFiles + maxGPOverlaps uint64 - gp tFiles - gpidx int - seenKey bool - overlappedBytes uint64 - imin, imax iKey + gp tFiles + gpi int + seenKey bool + gpOverlappedBytes uint64 + imin, imax iKey + tPtrs []int + released bool - tPtrs [kNumLevels]int + snapGPI int + snapSeenKey bool + snapGPOverlappedBytes uint64 + snapTPtrs []int +} + +func (c *compaction) save() { + c.snapGPI = c.gpi + c.snapSeenKey = c.seenKey + c.snapGPOverlappedBytes = c.gpOverlappedBytes + c.snapTPtrs = append(c.snapTPtrs[:0], c.tPtrs...) +} + +func (c *compaction) restore() { + c.gpi = c.snapGPI + c.seenKey = c.snapSeenKey + c.gpOverlappedBytes = c.snapGPOverlappedBytes + c.tPtrs = append(c.tPtrs[:0], c.snapTPtrs...) +} + +func (c *compaction) release() { + if !c.released { + c.released = true + c.v.release() + } } // Expand compacted tables; need external synchronization. func (c *compaction) expand() { - level := c.level - vt0, vt1 := c.v.tables[level], c.v.tables[level+1] + limit := uint64(c.s.o.GetCompactionExpandLimit(c.level)) + vt0, vt1 := c.v.tables[c.level], c.v.tables[c.level+1] t0, t1 := c.tables[0], c.tables[1] imin, imax := t0.getRange(c.s.icmp) + // We expand t0 here just incase ukey hop across tables. + t0 = vt0.getOverlaps(t0, c.s.icmp, imin.ukey(), imax.ukey(), c.level == 0) + if len(t0) != len(c.tables[0]) { + imin, imax = t0.getRange(c.s.icmp) + } t1 = vt1.getOverlaps(t1, c.s.icmp, imin.ukey(), imax.ukey(), false) // Get entire range covered by compaction. 
amin, amax := append(t0, t1...).getRange(c.s.icmp) @@ -292,13 +350,13 @@ func (c *compaction) expand() { // See if we can grow the number of inputs in "level" without // changing the number of "level+1" files we pick up. if len(t1) > 0 { - exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), level == 0) - if len(exp0) > len(t0) && t1.size()+exp0.size() < kExpCompactionMaxBytes { + exp0 := vt0.getOverlaps(nil, c.s.icmp, amin.ukey(), amax.ukey(), c.level == 0) + if len(exp0) > len(t0) && t1.size()+exp0.size() < limit { xmin, xmax := exp0.getRange(c.s.icmp) exp1 := vt1.getOverlaps(nil, c.s.icmp, xmin.ukey(), xmax.ukey(), false) if len(exp1) == len(t1) { c.s.logf("table@compaction expanding L%d+L%d (F·%d S·%s)+(F·%d S·%s) -> (F·%d S·%s)+(F·%d S·%s)", - level, level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), + c.level, c.level+1, len(t0), shortenb(int(t0.size())), len(t1), shortenb(int(t1.size())), len(exp0), shortenb(int(exp0.size())), len(exp1), shortenb(int(exp1.size()))) imin, imax = xmin, xmax t0, t1 = exp0, exp1 @@ -309,8 +367,8 @@ func (c *compaction) expand() { // Compute the set of grandparent files that overlap this compaction // (parent == level+1; grandparent == level+2) - if level+2 < kNumLevels { - c.gp = c.v.tables[level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) + if c.level+2 < c.s.o.GetNumLevel() { + c.gp = c.v.tables[c.level+2].getOverlaps(c.gp, c.s.icmp, amin.ukey(), amax.ukey(), false) } c.tables[0], c.tables[1] = t0, t1 @@ -319,7 +377,7 @@ func (c *compaction) expand() { // Check whether compaction is trivial. func (c *compaction) trivial() bool { - return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= kMaxGrandParentOverlapBytes + return len(c.tables[0]) == 1 && len(c.tables[1]) == 0 && c.gp.size() <= c.maxGPOverlaps } func (c *compaction) baseLevelForKey(ukey []byte) bool { @@ -341,20 +399,20 @@ func (c *compaction) baseLevelForKey(ukey []byte) bool { } func (c *compaction) shouldStopBefore(ikey iKey) bool { - for ; c.gpidx < len(c.gp); c.gpidx++ { - gp := c.gp[c.gpidx] + for ; c.gpi < len(c.gp); c.gpi++ { + gp := c.gp[c.gpi] if c.s.icmp.Compare(ikey, gp.imax) <= 0 { break } if c.seenKey { - c.overlappedBytes += gp.size + c.gpOverlappedBytes += gp.size } } c.seenKey = true - if c.overlappedBytes > kMaxGrandParentOverlapBytes { + if c.gpOverlappedBytes > c.maxGPOverlaps { // Too much overlap for current output; start new output. - c.overlappedBytes = 0 + c.gpOverlappedBytes = 0 return true } return false @@ -373,8 +431,12 @@ func (c *compaction) newIterator() iterator.Iterator { // Options. 
ro := &opt.ReadOptions{ DontFillCache: true, + Strict: opt.StrictOverride, + } + strict := c.s.o.GetStrict(opt.StrictCompaction) + if strict { + ro.Strict |= opt.StrictReader } - strict := c.s.o.GetStrict(opt.StrictIterator) for i, tables := range c.tables { if len(tables) == 0 { @@ -387,10 +449,10 @@ func (c *compaction) newIterator() iterator.Iterator { its = append(its, c.s.tops.newIterator(t, nil, ro)) } } else { - it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict, true) + it := iterator.NewIndexedIterator(tables.newIndexIterator(c.s.tops, c.s.icmp, nil, ro), strict) its = append(its, it) } } - return iterator.NewMergedIterator(its, c.s.icmp, true) + return iterator.NewMergedIterator(its, c.s.icmp, strict) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go index 27212958..1bdcc68f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record.go @@ -9,11 +9,11 @@ package leveldb import ( "bufio" "encoding/binary" - "errors" "io" -) + "strings" -var errCorruptManifest = errors.New("leveldb: corrupt manifest") + "github.com/syndtr/goleveldb/leveldb/errors" +) type byteReader interface { io.Reader @@ -22,13 +22,13 @@ type byteReader interface { // These numbers are written to disk and should not be changed. const ( - recComparer = 1 - recJournalNum = 2 - recNextNum = 3 - recSeq = 4 - recCompactionPointer = 5 - recDeletedTable = 6 - recNewTable = 7 + recComparer = 1 + recJournalNum = 2 + recNextFileNum = 3 + recSeqNum = 4 + recCompPtr = 5 + recDelTable = 6 + recAddTable = 7 // 8 was used for large value refs recPrevJournalNum = 9 ) @@ -38,7 +38,7 @@ type cpRecord struct { ikey iKey } -type ntRecord struct { +type atRecord struct { level int num uint64 size uint64 @@ -46,27 +46,26 @@ type ntRecord struct { imax iKey } -func (r ntRecord) makeFile(s *session) *tFile { - return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax) -} - type dtRecord struct { level int num uint64 } type sessionRecord struct { - hasRec int - comparer string - journalNum uint64 - prevJournalNum uint64 - nextNum uint64 - seq uint64 - compactionPointers []cpRecord - addedTables []ntRecord - deletedTables []dtRecord - scratch [binary.MaxVarintLen64]byte - err error + numLevel int + + hasRec int + comparer string + journalNum uint64 + prevJournalNum uint64 + nextFileNum uint64 + seqNum uint64 + compPtrs []cpRecord + addedTables []atRecord + deletedTables []dtRecord + + scratch [binary.MaxVarintLen64]byte + err error } func (p *sessionRecord) has(rec int) bool { @@ -88,29 +87,29 @@ func (p *sessionRecord) setPrevJournalNum(num uint64) { p.prevJournalNum = num } -func (p *sessionRecord) setNextNum(num uint64) { - p.hasRec |= 1 << recNextNum - p.nextNum = num +func (p *sessionRecord) setNextFileNum(num uint64) { + p.hasRec |= 1 << recNextFileNum + p.nextFileNum = num } -func (p *sessionRecord) setSeq(seq uint64) { - p.hasRec |= 1 << recSeq - p.seq = seq +func (p *sessionRecord) setSeqNum(num uint64) { + p.hasRec |= 1 << recSeqNum + p.seqNum = num } -func (p *sessionRecord) addCompactionPointer(level int, ikey iKey) { - p.hasRec |= 1 << recCompactionPointer - p.compactionPointers = append(p.compactionPointers, cpRecord{level, ikey}) +func (p *sessionRecord) addCompPtr(level int, ikey iKey) { + p.hasRec |= 1 << recCompPtr + p.compPtrs = 
append(p.compPtrs, cpRecord{level, ikey}) } -func (p *sessionRecord) resetCompactionPointers() { - p.hasRec &= ^(1 << recCompactionPointer) - p.compactionPointers = p.compactionPointers[:0] +func (p *sessionRecord) resetCompPtrs() { + p.hasRec &= ^(1 << recCompPtr) + p.compPtrs = p.compPtrs[:0] } func (p *sessionRecord) addTable(level int, num, size uint64, imin, imax iKey) { - p.hasRec |= 1 << recNewTable - p.addedTables = append(p.addedTables, ntRecord{level, num, size, imin, imax}) + p.hasRec |= 1 << recAddTable + p.addedTables = append(p.addedTables, atRecord{level, num, size, imin, imax}) } func (p *sessionRecord) addTableFile(level int, t *tFile) { @@ -118,17 +117,17 @@ func (p *sessionRecord) addTableFile(level int, t *tFile) { } func (p *sessionRecord) resetAddedTables() { - p.hasRec &= ^(1 << recNewTable) + p.hasRec &= ^(1 << recAddTable) p.addedTables = p.addedTables[:0] } -func (p *sessionRecord) deleteTable(level int, num uint64) { - p.hasRec |= 1 << recDeletedTable +func (p *sessionRecord) delTable(level int, num uint64) { + p.hasRec |= 1 << recDelTable p.deletedTables = append(p.deletedTables, dtRecord{level, num}) } func (p *sessionRecord) resetDeletedTables() { - p.hasRec &= ^(1 << recDeletedTable) + p.hasRec &= ^(1 << recDelTable) p.deletedTables = p.deletedTables[:0] } @@ -161,26 +160,26 @@ func (p *sessionRecord) encode(w io.Writer) error { p.putUvarint(w, recJournalNum) p.putUvarint(w, p.journalNum) } - if p.has(recNextNum) { - p.putUvarint(w, recNextNum) - p.putUvarint(w, p.nextNum) + if p.has(recNextFileNum) { + p.putUvarint(w, recNextFileNum) + p.putUvarint(w, p.nextFileNum) } - if p.has(recSeq) { - p.putUvarint(w, recSeq) - p.putUvarint(w, p.seq) + if p.has(recSeqNum) { + p.putUvarint(w, recSeqNum) + p.putUvarint(w, p.seqNum) } - for _, r := range p.compactionPointers { - p.putUvarint(w, recCompactionPointer) + for _, r := range p.compPtrs { + p.putUvarint(w, recCompPtr) p.putUvarint(w, uint64(r.level)) p.putBytes(w, r.ikey) } for _, r := range p.deletedTables { - p.putUvarint(w, recDeletedTable) + p.putUvarint(w, recDelTable) p.putUvarint(w, uint64(r.level)) p.putUvarint(w, r.num) } for _, r := range p.addedTables { - p.putUvarint(w, recNewTable) + p.putUvarint(w, recAddTable) p.putUvarint(w, uint64(r.level)) p.putUvarint(w, r.num) p.putUvarint(w, r.size) @@ -190,14 +189,16 @@ func (p *sessionRecord) encode(w io.Writer) error { return p.err } -func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 { +func (p *sessionRecord) readUvarintMayEOF(field string, r io.ByteReader, mayEOF bool) uint64 { if p.err != nil { return 0 } x, err := binary.ReadUvarint(r) if err != nil { - if err == io.EOF { - p.err = errCorruptManifest + if err == io.ErrUnexpectedEOF || (mayEOF == false && err == io.EOF) { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) + } else if strings.HasPrefix(err.Error(), "binary:") { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, err.Error()}) } else { p.err = err } @@ -206,35 +207,39 @@ func (p *sessionRecord) readUvarint(r io.ByteReader) uint64 { return x } -func (p *sessionRecord) readBytes(r byteReader) []byte { +func (p *sessionRecord) readUvarint(field string, r io.ByteReader) uint64 { + return p.readUvarintMayEOF(field, r, false) +} + +func (p *sessionRecord) readBytes(field string, r byteReader) []byte { if p.err != nil { return nil } - n := p.readUvarint(r) + n := p.readUvarint(field, r) if p.err != nil { return nil } x := make([]byte, n) _, p.err = io.ReadFull(r, x) if p.err != nil { - if 
p.err == io.EOF { - p.err = errCorruptManifest + if p.err == io.ErrUnexpectedEOF { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "short read"}) } return nil } return x } -func (p *sessionRecord) readLevel(r io.ByteReader) int { +func (p *sessionRecord) readLevel(field string, r io.ByteReader) int { if p.err != nil { return 0 } - x := p.readUvarint(r) + x := p.readUvarint(field, r) if p.err != nil { return 0 } - if x >= kNumLevels { - p.err = errCorruptManifest + if x >= uint64(p.numLevel) { + p.err = errors.NewErrCorrupted(nil, &ErrManifestCorrupted{field, "invalid level number"}) return 0 } return int(x) @@ -247,59 +252,59 @@ func (p *sessionRecord) decode(r io.Reader) error { } p.err = nil for p.err == nil { - rec, err := binary.ReadUvarint(br) - if err != nil { - if err == io.EOF { - err = nil + rec := p.readUvarintMayEOF("field-header", br, true) + if p.err != nil { + if p.err == io.EOF { + return nil } - return err + return p.err } switch rec { case recComparer: - x := p.readBytes(br) + x := p.readBytes("comparer", br) if p.err == nil { p.setComparer(string(x)) } case recJournalNum: - x := p.readUvarint(br) + x := p.readUvarint("journal-num", br) if p.err == nil { p.setJournalNum(x) } case recPrevJournalNum: - x := p.readUvarint(br) + x := p.readUvarint("prev-journal-num", br) if p.err == nil { p.setPrevJournalNum(x) } - case recNextNum: - x := p.readUvarint(br) + case recNextFileNum: + x := p.readUvarint("next-file-num", br) if p.err == nil { - p.setNextNum(x) + p.setNextFileNum(x) } - case recSeq: - x := p.readUvarint(br) + case recSeqNum: + x := p.readUvarint("seq-num", br) if p.err == nil { - p.setSeq(x) + p.setSeqNum(x) } - case recCompactionPointer: - level := p.readLevel(br) - ikey := p.readBytes(br) + case recCompPtr: + level := p.readLevel("comp-ptr.level", br) + ikey := p.readBytes("comp-ptr.ikey", br) if p.err == nil { - p.addCompactionPointer(level, iKey(ikey)) + p.addCompPtr(level, iKey(ikey)) } - case recNewTable: - level := p.readLevel(br) - num := p.readUvarint(br) - size := p.readUvarint(br) - imin := p.readBytes(br) - imax := p.readBytes(br) + case recAddTable: + level := p.readLevel("add-table.level", br) + num := p.readUvarint("add-table.num", br) + size := p.readUvarint("add-table.size", br) + imin := p.readBytes("add-table.imin", br) + imax := p.readBytes("add-table.imax", br) if p.err == nil { p.addTable(level, num, size, imin, imax) } - case recDeletedTable: - level := p.readLevel(br) - num := p.readUvarint(br) + case recDelTable: + level := p.readLevel("del-table.level", br) + num := p.readUvarint("del-table.num", br) if p.err == nil { - p.deleteTable(level, num) + p.delTable(level, num) } } } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go index 029fabfe..c0c035ae 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_record_test.go @@ -9,6 +9,8 @@ package leveldb import ( "bytes" "testing" + + "github.com/syndtr/goleveldb/leveldb/opt" ) func decodeEncode(v *sessionRecord) (res bool, err error) { @@ -17,7 +19,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) { if err != nil { return } - v2 := new(sessionRecord) + v2 := &sessionRecord{numLevel: opt.DefaultNumLevel} err = v.decode(b) if err != nil { return @@ -32,7 +34,7 @@ func decodeEncode(v *sessionRecord) (res bool, err error) { func 
TestSessionRecord_EncodeDecode(t *testing.T) { big := uint64(1) << 50 - v := new(sessionRecord) + v := &sessionRecord{numLevel: opt.DefaultNumLevel} i := uint64(0) test := func() { res, err := decodeEncode(v) @@ -47,16 +49,16 @@ func TestSessionRecord_EncodeDecode(t *testing.T) { for ; i < 4; i++ { test() v.addTable(3, big+300+i, big+400+i, - newIKey([]byte("foo"), big+500+1, tVal), - newIKey([]byte("zoo"), big+600+1, tDel)) - v.deleteTable(4, big+700+i) - v.addCompactionPointer(int(i), newIKey([]byte("x"), big+900+1, tVal)) + newIkey([]byte("foo"), big+500+1, ktVal), + newIkey([]byte("zoo"), big+600+1, ktDel)) + v.delTable(4, big+700+i) + v.addCompPtr(int(i), newIkey([]byte("x"), big+900+1, ktVal)) } v.setComparer("foo") v.setJournalNum(big + 100) v.setPrevJournalNum(big + 99) - v.setNextNum(big + 200) - v.setSeq(big + 1000) + v.setNextFileNum(big + 200) + v.setSeqNum(big + 1000) test() } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go index 715c9f5b..007c02cd 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/session_util.go @@ -22,7 +22,7 @@ type dropper struct { } func (d dropper) Drop(err error) { - if e, ok := err.(journal.ErrCorrupted); ok { + if e, ok := err.(*journal.ErrCorrupted); ok { d.s.logf("journal@drop %s-%d S·%s %q", d.file.Type(), d.file.Num(), shortenb(e.Size), e.Reason) } else { d.s.logf("journal@drop %s-%d %q", d.file.Type(), d.file.Num(), err) @@ -51,9 +51,14 @@ func (s *session) newTemp() storage.File { return s.stor.GetFile(num, storage.TypeTemp) } +func (s *session) tableFileFromRecord(r atRecord) *tFile { + return newTableFile(s.getTableFile(r.num), r.size, r.imin, r.imax) +} + // Session state. -// Get current version. +// Get current version. This will incr version ref, must call +// version.release (exactly once) after use. func (s *session) version() *version { s.vmu.Lock() defer s.vmu.Unlock() @@ -61,61 +66,56 @@ func (s *session) version() *version { return s.stVersion } -// Get current version; no barrier. -func (s *session) version_NB() *version { - return s.stVersion -} - // Set current version to v. func (s *session) setVersion(v *version) { s.vmu.Lock() - v.ref = 1 + v.ref = 1 // Holds by session. if old := s.stVersion; old != nil { - v.ref++ + v.ref++ // Holds by old version. old.next = v - old.release_NB() + old.releaseNB() } s.stVersion = v s.vmu.Unlock() } // Get current unused file number. -func (s *session) fileNum() uint64 { - return atomic.LoadUint64(&s.stFileNum) +func (s *session) nextFileNum() uint64 { + return atomic.LoadUint64(&s.stNextFileNum) } -// Get current unused file number to num. -func (s *session) setFileNum(num uint64) { - atomic.StoreUint64(&s.stFileNum, num) +// Set current unused file number to num. +func (s *session) setNextFileNum(num uint64) { + atomic.StoreUint64(&s.stNextFileNum, num) } // Mark file number as used. func (s *session) markFileNum(num uint64) { - num += 1 + nextFileNum := num + 1 for { - old, x := s.stFileNum, num + old, x := s.stNextFileNum, nextFileNum if old > x { x = old } - if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { + if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { break } } } // Allocate a file number. 
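markFileNum above advances the shared next-file-number monotonically with a compare-and-swap loop rather than a plain store, so concurrent markers can never move the counter backwards. A standalone sketch of the same idea:

package main

import (
	"fmt"
	"sync/atomic"
)

var stNextFileNum uint64

// markFileNum ensures the counter is at least num+1, never lowering it.
func markFileNum(num uint64) {
	next := num + 1
	for {
		old, x := atomic.LoadUint64(&stNextFileNum), next
		if old > x {
			x = old // someone already advanced further; keep their value
		}
		if atomic.CompareAndSwapUint64(&stNextFileNum, old, x) {
			return
		}
		// CAS lost a race; reload and try again.
	}
}

func main() {
	markFileNum(10)
	markFileNum(3) // no-op: the counter is already past 4
	fmt.Println(atomic.LoadUint64(&stNextFileNum)) // 11
}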
-func (s *session) allocFileNum() (num uint64) { - return atomic.AddUint64(&s.stFileNum, 1) - 1 +func (s *session) allocFileNum() uint64 { + return atomic.AddUint64(&s.stNextFileNum, 1) - 1 } // Reuse given file number. func (s *session) reuseFileNum(num uint64) { for { - old, x := s.stFileNum, num + old, x := s.stNextFileNum, num if old != x+1 { x = old } - if atomic.CompareAndSwapUint64(&s.stFileNum, old, x) { + if atomic.CompareAndSwapUint64(&s.stNextFileNum, old, x) { break } } @@ -126,20 +126,20 @@ func (s *session) reuseFileNum(num uint64) { // Fill given session record obj with current states; need external // synchronization. func (s *session) fillRecord(r *sessionRecord, snapshot bool) { - r.setNextNum(s.fileNum()) + r.setNextFileNum(s.nextFileNum()) if snapshot { if !r.has(recJournalNum) { r.setJournalNum(s.stJournalNum) } - if !r.has(recSeq) { - r.setSeq(s.stSeq) + if !r.has(recSeqNum) { + r.setSeqNum(s.stSeqNum) } - for level, ik := range s.stCptrs { + for level, ik := range s.stCompPtrs { if ik != nil { - r.addCompactionPointer(level, ik) + r.addCompPtr(level, ik) } } @@ -158,12 +158,12 @@ func (s *session) recordCommited(r *sessionRecord) { s.stPrevJournalNum = r.prevJournalNum } - if r.has(recSeq) { - s.stSeq = r.seq + if r.has(recSeqNum) { + s.stSeqNum = r.seqNum } - for _, p := range r.compactionPointers { - s.stCptrs[p.level] = iKey(p.ikey) + for _, p := range r.compPtrs { + s.stCompPtrs[p.level] = iKey(p.ikey) } } @@ -178,10 +178,11 @@ func (s *session) newManifest(rec *sessionRecord, v *version) (err error) { jw := journal.NewWriter(writer) if v == nil { - v = s.version_NB() + v = s.version() + defer v.release() } if rec == nil { - rec = new(sessionRecord) + rec = &sessionRecord{numLevel: s.o.GetNumLevel()} } s.fillRecord(rec, true) v.fillRecord(rec) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go index 5a1885e6..85dd70b0 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage/storage.go @@ -125,3 +125,33 @@ type Storage interface { // Other methods should not be called after the storage has been closed. Close() error } + +// FileInfo wraps basic file info. +type FileInfo struct { + Type FileType + Num uint64 +} + +func (fi FileInfo) String() string { + switch fi.Type { + case TypeManifest: + return fmt.Sprintf("MANIFEST-%06d", fi.Num) + case TypeJournal: + return fmt.Sprintf("%06d.log", fi.Num) + case TypeTable: + return fmt.Sprintf("%06d.ldb", fi.Num) + case TypeTemp: + return fmt.Sprintf("%06d.tmp", fi.Num) + default: + return fmt.Sprintf("%#x-%d", fi.Type, fi.Num) + } +} + +// NewFileInfo creates new FileInfo from the given File. It will returns nil +// if File is nil. 
+func NewFileInfo(f File) *FileInfo { + if f == nil { + return nil + } + return &FileInfo{f.Type(), f.Num()} +} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go index d6628b29..06dfd2ee 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/storage_test.go @@ -11,6 +11,7 @@ import ( "fmt" "io" "io/ioutil" + "math/rand" "os" "path/filepath" "sync" @@ -36,6 +37,19 @@ var ( tsNum = 0 ) +type tsOp uint + +const ( + tsOpOpen tsOp = iota + tsOpCreate + tsOpRead + tsOpReadAt + tsOpWrite + tsOpSync + + tsOpNum +) + type tsLock struct { ts *testStorage r util.Releaser @@ -54,6 +68,9 @@ type tsReader struct { func (tr tsReader) Read(b []byte) (n int, err error) { ts := tr.tf.ts ts.countRead(tr.tf.Type()) + if tr.tf.shouldErrLocked(tsOpRead) { + return 0, errors.New("leveldb.testStorage: emulated read error") + } n, err = tr.Reader.Read(b) if err != nil && err != io.EOF { ts.t.Errorf("E: read error, num=%d type=%v n=%d: %v", tr.tf.Num(), tr.tf.Type(), n, err) @@ -64,6 +81,9 @@ func (tr tsReader) Read(b []byte) (n int, err error) { func (tr tsReader) ReadAt(b []byte, off int64) (n int, err error) { ts := tr.tf.ts ts.countRead(tr.tf.Type()) + if tr.tf.shouldErrLocked(tsOpReadAt) { + return 0, errors.New("leveldb.testStorage: emulated readAt error") + } n, err = tr.Reader.ReadAt(b, off) if err != nil && err != io.EOF { ts.t.Errorf("E: readAt error, num=%d type=%v off=%d n=%d: %v", tr.tf.Num(), tr.tf.Type(), off, n, err) @@ -83,15 +103,12 @@ type tsWriter struct { } func (tw tsWriter) Write(b []byte) (n int, err error) { - ts := tw.tf.ts - ts.mu.Lock() - defer ts.mu.Unlock() - if ts.emuWriteErr&tw.tf.Type() != 0 { + if tw.tf.shouldErrLocked(tsOpWrite) { return 0, errors.New("leveldb.testStorage: emulated write error") } n, err = tw.Writer.Write(b) if err != nil { - ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err) + tw.tf.ts.t.Errorf("E: write error, num=%d type=%v n=%d: %v", tw.tf.Num(), tw.tf.Type(), n, err) } return } @@ -99,23 +116,23 @@ func (tw tsWriter) Write(b []byte) (n int, err error) { func (tw tsWriter) Sync() (err error) { ts := tw.tf.ts ts.mu.Lock() - defer ts.mu.Unlock() for ts.emuDelaySync&tw.tf.Type() != 0 { ts.cond.Wait() } - if ts.emuSyncErr&tw.tf.Type() != 0 { + ts.mu.Unlock() + if tw.tf.shouldErrLocked(tsOpSync) { return errors.New("leveldb.testStorage: emulated sync error") } err = tw.Writer.Sync() if err != nil { - ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err) + tw.tf.ts.t.Errorf("E: sync error, num=%d type=%v: %v", tw.tf.Num(), tw.tf.Type(), err) } return } func (tw tsWriter) Close() (err error) { err = tw.Writer.Close() - tw.tf.close("reader", err) + tw.tf.close("writer", err) return } @@ -128,6 +145,16 @@ func (tf tsFile) x() uint64 { return tf.Num()< 0 { // Find the earliest possible internal key for min. - i = tf.searchMax(icmp, newIKey(umin, kMaxSeq, tSeek)) + i = tf.searchMax(icmp, newIkey(umin, kMaxSeq, ktSeek)) } if i >= len(tf) { // Beginning of range is after all files, so no overlap. @@ -159,24 +172,25 @@ func (tf tFiles) overlaps(icmp *iComparer, umin, umax []byte, unsorted bool) boo } // Returns tables whose its key range overlaps with given key range. -// If overlapped is true then the search will be expanded to tables that -// overlaps with each other. 
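The new storage.FileInfo above carries only a file type and number, and String() renders the conventional LevelDB file names from them. A small standalone sketch of that naming scheme (the constants below are illustrative stand-ins for storage.FileType values):

package main

import "fmt"

type fileType int

const (
	typeManifest fileType = iota
	typeJournal
	typeTable
	typeTemp
)

// name mirrors FileInfo.String(): zero-padded six-digit numbers with
// per-type suffixes, and MANIFEST-<num> for manifest files.
func name(t fileType, num uint64) string {
	switch t {
	case typeManifest:
		return fmt.Sprintf("MANIFEST-%06d", num)
	case typeJournal:
		return fmt.Sprintf("%06d.log", num)
	case typeTable:
		return fmt.Sprintf("%06d.ldb", num)
	default:
		return fmt.Sprintf("%06d.tmp", num)
	}
}

func main() {
	fmt.Println(name(typeManifest, 5)) // MANIFEST-000005
	fmt.Println(name(typeTable, 123))  // 000123.ldb
}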
+// Range will be expanded if ukey found hop across tables. +// If overlapped is true then the search will be restarted if umax +// expanded. +// The dst content will be overwritten. func (tf tFiles) getOverlaps(dst tFiles, icmp *iComparer, umin, umax []byte, overlapped bool) tFiles { - x := len(dst) + dst = dst[:0] for i := 0; i < len(tf); { t := tf[i] if t.overlaps(icmp, umin, umax) { - if overlapped { - // For overlapped files, check if the newly added file has - // expanded the range. If so, restart search. - if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { - umin = t.imin.ukey() - dst = dst[:x] - i = 0 - continue - } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { - umax = t.imax.ukey() - dst = dst[:x] + if umin != nil && icmp.uCompare(t.imin.ukey(), umin) < 0 { + umin = t.imin.ukey() + dst = dst[:0] + i = 0 + continue + } else if umax != nil && icmp.uCompare(t.imax.ukey(), umax) > 0 { + umax = t.imax.ukey() + // Restart search if it is overlapped. + if overlapped { + dst = dst[:0] i = 0 continue } @@ -289,7 +303,7 @@ func (t *tOps) create() (*tWriter, error) { t: t, file: file, w: fw, - tw: table.NewWriter(fw, t.s.o), + tw: table.NewWriter(fw, t.s.o.Options), }, nil } @@ -337,7 +351,13 @@ func (t *tOps) open(f *tFile) (ch cache.Handle, err error) { if bc := t.s.o.GetBlockCache(); bc != nil { bcacheNS = bc.GetNamespace(num) } - return 1, table.NewReader(r, int64(f.size), bcacheNS, t.bpool, t.s.o) + var tr *table.Reader + tr, err = table.NewReader(r, int64(f.size), storage.NewFileInfo(f.file), bcacheNS, t.bpool, t.s.o.Options) + if err != nil { + r.Close() + return 0, nil + } + return 1, tr }) if ch == nil && err == nil { err = ErrClosed @@ -440,28 +460,34 @@ func (w *tWriter) empty() bool { return w.first == nil } +// Closes the storage.Writer. +func (w *tWriter) close() { + if w.w != nil { + w.w.Close() + w.w = nil + } +} + // Finalizes the table and returns table file. func (w *tWriter) finish() (f *tFile, err error) { + defer w.close() err = w.tw.Close() if err != nil { return } err = w.w.Sync() if err != nil { - w.w.Close() return } - w.w.Close() f = newTableFile(w.file, uint64(w.tw.BytesLen()), iKey(w.first), iKey(w.last)) return } // Drops the table. 
func (w *tWriter) drop() { - w.w.Close() + w.close() w.file.Remove() w.t.s.reuseFileNum(w.file.Num()) - w.w = nil w.file = nil w.tw = nil w.first = nil diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go index 5efd70b0..a7a2c9b3 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/reader.go @@ -8,29 +8,41 @@ package table import ( "encoding/binary" - "errors" "fmt" "io" "sort" "strings" "sync" - "code.google.com/p/snappy-go/snappy" + "github.com/syndtr/gosnappy/snappy" "github.com/syndtr/goleveldb/leveldb/cache" "github.com/syndtr/goleveldb/leveldb/comparer" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/filter" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/opt" + "github.com/syndtr/goleveldb/leveldb/storage" "github.com/syndtr/goleveldb/leveldb/util" ) var ( - ErrNotFound = util.ErrNotFound + ErrNotFound = errors.ErrNotFound ErrReaderReleased = errors.New("leveldb/table: reader released") ErrIterReleased = errors.New("leveldb/table: iterator released") ) +type ErrCorrupted struct { + Pos int64 + Size int64 + Kind string + Reason string +} + +func (e *ErrCorrupted) Error() string { + return fmt.Sprintf("leveldb/table: corruption on %s (pos=%d): %s", e.Kind, e.Pos, e.Reason) +} + func max(x, y int) int { if x > y { return x @@ -38,13 +50,19 @@ func max(x, y int) int { return y } +func verifyBlockChecksum(data []byte) bool { + n := len(data) - 4 + checksum0 := binary.LittleEndian.Uint32(data[n:]) + checksum1 := util.NewCRC(data[:n]).Value() + return checksum0 == checksum1 +} + type block struct { bpool *util.BufferPool + bh blockHandle data []byte restartsLen int restartsOffset int - // Whether checksum is verified and valid. 
- checksum bool } func (b *block) seek(cmp comparer.Comparer, rstart, rlimit int, key []byte) (index, offset int, err error) { @@ -77,7 +95,7 @@ func (b *block) restartOffset(index int) int { func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) { if offset >= b.restartsOffset { if offset != b.restartsOffset { - err = errors.New("leveldb/table: Reader: BlockEntry: invalid block (block entries offset not aligned)") + err = &ErrCorrupted{Reason: "entries offset not aligned"} } return } @@ -87,7 +105,7 @@ func (b *block) entry(offset int) (key, value []byte, nShared, n int, err error) m := n0 + n1 + n2 n = m + int(v1) + int(v2) if n0 <= 0 || n1 <= 0 || n2 <= 0 || offset+n > b.restartsOffset { - err = errors.New("leveldb/table: Reader: invalid block (block entries corrupted)") + err = &ErrCorrupted{Reason: "entries corrupted"} return } key = b.data[offset+m : offset+m+int(v1)] @@ -251,7 +269,7 @@ func (i *blockIter) Next() bool { for i.offset < i.offsetRealStart { key, value, nShared, n, err := i.block.entry(i.offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if n == 0 { @@ -265,13 +283,13 @@ func (i *blockIter) Next() bool { if i.offset >= i.offsetLimit { i.dir = dirEOI if i.offset != i.offsetLimit { - i.sErr(errors.New("leveldb/table: Reader: Next: invalid block (block entries offset not aligned)")) + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) } return false } key, value, nShared, n, err := i.block.entry(i.offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if n == 0 { @@ -356,7 +374,7 @@ func (i *blockIter) Prev() bool { for { key, value, nShared, n, err := i.block.entry(offset) if err != nil { - i.sErr(err) + i.sErr(i.tr.fixErrCorruptedBH(i.block.bh, err)) return false } if offset >= i.offsetRealStart { @@ -375,7 +393,7 @@ func (i *blockIter) Prev() bool { // Stop if target offset reached. if offset >= i.offset { if offset != i.offset { - i.sErr(errors.New("leveldb/table: Reader: Prev: invalid block (block entries offset not aligned)")) + i.sErr(i.tr.newErrCorruptedBH(i.block.bh, "entries offset not aligned")) return false } @@ -473,7 +491,6 @@ type indexIter struct { tr *Reader slice *util.Range // Options - checksum bool fillCache bool } @@ -484,28 +501,29 @@ func (i *indexIter) Get() iterator.Iterator { } dataBH, n := decodeBlockHandle(value) if n == 0 { - return iterator.NewEmptyIterator(errors.New("leveldb/table: Reader: invalid table (bad data block handle)")) + return iterator.NewEmptyIterator(i.tr.newErrCorruptedBH(i.tr.indexBH, "bad data block handle")) } var slice *util.Range if i.slice != nil && (i.blockIter.isFirst() || i.blockIter.isLast()) { slice = i.slice } - return i.tr.getDataIterErr(dataBH, slice, i.checksum, i.fillCache) + return i.tr.getDataIterErr(dataBH, slice, i.tr.verifyChecksum, i.fillCache) } // Reader is a table reader. 
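verifyBlockChecksum, introduced just above, checks the 4-byte checksum that trails every block against a CRC of the block contents. A simplified standalone sketch of the same shape, using plain CRC-32C here instead of goleveldb's masked CRC from util.NewCRC:

package main

import (
	"encoding/binary"
	"fmt"
	"hash/crc32"
)

var castagnoli = crc32.MakeTable(crc32.Castagnoli)

// appendChecksum appends a little-endian CRC-32C over data.
func appendChecksum(data []byte) []byte {
	var trailer [4]byte
	binary.LittleEndian.PutUint32(trailer[:], crc32.Checksum(data, castagnoli))
	return append(data, trailer[:]...)
}

// verify mirrors verifyBlockChecksum: the last 4 bytes must match a CRC
// computed over everything before them.
func verify(data []byte) bool {
	if len(data) < 4 {
		return false
	}
	n := len(data) - 4
	return binary.LittleEndian.Uint32(data[n:]) == crc32.Checksum(data[:n], castagnoli)
}

func main() {
	blk := appendChecksum([]byte("block payload"))
	fmt.Println(verify(blk)) // true
	blk[0] ^= 0xff
	fmt.Println(verify(blk)) // false
}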
type Reader struct { mu sync.RWMutex + fi *storage.FileInfo reader io.ReaderAt cache cache.Namespace err error bpool *util.BufferPool // Options - cmp comparer.Comparer - filter filter.Filter - checksum bool - strictIter bool + o *opt.Options + cmp comparer.Comparer + filter filter.Filter + verifyChecksum bool dataEnd int64 indexBH, filterBH blockHandle @@ -513,23 +531,43 @@ type Reader struct { filterBlock *filterBlock } -func verifyChecksum(data []byte) bool { - n := len(data) - 4 - checksum0 := binary.LittleEndian.Uint32(data[n:]) - checksum1 := util.NewCRC(data[:n]).Value() - return checksum0 == checksum1 +func (r *Reader) blockKind(bh blockHandle) string { + switch bh.offset { + case r.indexBH.offset: + return "index-block" + case r.filterBH.offset: + return "filter-block" + default: + return "data-block" + } } -func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) { +func (r *Reader) newErrCorrupted(pos, size int64, kind, reason string) error { + return &errors.ErrCorrupted{File: r.fi, Err: &ErrCorrupted{Pos: pos, Size: size, Kind: kind, Reason: reason}} +} + +func (r *Reader) newErrCorruptedBH(bh blockHandle, reason string) error { + return r.newErrCorrupted(int64(bh.offset), int64(bh.length), r.blockKind(bh), reason) +} + +func (r *Reader) fixErrCorruptedBH(bh blockHandle, err error) error { + if cerr, ok := err.(*ErrCorrupted); ok { + cerr.Pos = int64(bh.offset) + cerr.Size = int64(bh.length) + cerr.Kind = r.blockKind(bh) + return &errors.ErrCorrupted{File: r.fi, Err: cerr} + } + return err +} + +func (r *Reader) readRawBlock(bh blockHandle, verifyChecksum bool) ([]byte, error) { data := r.bpool.Get(int(bh.length + blockTrailerLen)) if _, err := r.reader.ReadAt(data, int64(bh.offset)); err != nil && err != io.EOF { return nil, err } - if checksum || r.checksum { - if !verifyChecksum(data) { - r.bpool.Put(data) - return nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)") - } + if verifyChecksum && !verifyBlockChecksum(data) { + r.bpool.Put(data) + return nil, r.newErrCorruptedBH(bh, "checksum mismatch") } switch data[bh.length] { case blockTypeNoCompression: @@ -537,38 +575,40 @@ func (r *Reader) readRawBlock(bh blockHandle, checksum bool) ([]byte, error) { case blockTypeSnappyCompression: decLen, err := snappy.DecodedLen(data[:bh.length]) if err != nil { - return nil, err + return nil, r.newErrCorruptedBH(bh, err.Error()) } - tmp := data - data, err = snappy.Decode(r.bpool.Get(decLen), tmp[:bh.length]) - r.bpool.Put(tmp) + decData := r.bpool.Get(decLen) + decData, err = snappy.Decode(decData, data[:bh.length]) + r.bpool.Put(data) if err != nil { - return nil, err + r.bpool.Put(decData) + return nil, r.newErrCorruptedBH(bh, err.Error()) } + data = decData default: r.bpool.Put(data) - return nil, fmt.Errorf("leveldb/table: Reader: unknown block compression type: %d", data[bh.length]) + return nil, r.newErrCorruptedBH(bh, fmt.Sprintf("unknown compression type %#x", data[bh.length])) } return data, nil } -func (r *Reader) readBlock(bh blockHandle, checksum bool) (*block, error) { - data, err := r.readRawBlock(bh, checksum) +func (r *Reader) readBlock(bh blockHandle, verifyChecksum bool) (*block, error) { + data, err := r.readRawBlock(bh, verifyChecksum) if err != nil { return nil, err } restartsLen := int(binary.LittleEndian.Uint32(data[len(data)-4:])) b := &block{ bpool: r.bpool, + bh: bh, data: data, restartsLen: restartsLen, restartsOffset: len(data) - (restartsLen+1)*4, - checksum: checksum || r.checksum, } return b, nil } -func (r 
*Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*block, util.Releaser, error) { +func (r *Reader) readBlockCached(bh blockHandle, verifyChecksum, fillCache bool) (*block, util.Releaser, error) { if r.cache != nil { var err error ch := r.cache.Get(bh.offset, func() (charge int, value interface{}) { @@ -576,7 +616,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo return 0, nil } var b *block - b, err = r.readBlock(bh, checksum) + b, err = r.readBlock(bh, verifyChecksum) if err != nil { return 0, nil } @@ -586,14 +626,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo b, ok := ch.Value().(*block) if !ok { ch.Release() - return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type") - } - if !b.checksum && (r.checksum || checksum) { - if !verifyChecksum(b.data) { - ch.Release() - return nil, nil, errors.New("leveldb/table: Reader: invalid block (checksum mismatch)") - } - b.checksum = true + return nil, nil, errors.New("leveldb/table: inconsistent block type") } return b, ch, err } else if err != nil { @@ -601,7 +634,7 @@ func (r *Reader) readBlockCached(bh blockHandle, checksum, fillCache bool) (*blo } } - b, err := r.readBlock(bh, checksum) + b, err := r.readBlock(bh, verifyChecksum) return b, b, err } @@ -612,12 +645,12 @@ func (r *Reader) readFilterBlock(bh blockHandle) (*filterBlock, error) { } n := len(data) if n < 5 { - return nil, errors.New("leveldb/table: Reader: invalid filter block (too short)") + return nil, r.newErrCorruptedBH(bh, "too short") } m := n - 5 oOffset := int(binary.LittleEndian.Uint32(data[m:])) if oOffset > m { - return nil, errors.New("leveldb/table: Reader: invalid filter block (invalid offset)") + return nil, r.newErrCorruptedBH(bh, "invalid data-offsets offset") } b := &filterBlock{ bpool: r.bpool, @@ -647,7 +680,7 @@ func (r *Reader) readFilterBlockCached(bh blockHandle, fillCache bool) (*filterB b, ok := ch.Value().(*filterBlock) if !ok { ch.Release() - return nil, nil, errors.New("leveldb/table: Reader: inconsistent block type") + return nil, nil, errors.New("leveldb/table: inconsistent block type") } return b, ch, err } else if err != nil { @@ -673,25 +706,6 @@ func (r *Reader) getFilterBlock(fillCache bool) (*filterBlock, util.Releaser, er return r.filterBlock, util.NoopReleaser{}, nil } -func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator { - b, rel, err := r.readBlockCached(dataBH, checksum, fillCache) - if err != nil { - return iterator.NewEmptyIterator(err) - } - return r.newBlockIter(b, rel, slice, false) -} - -func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, checksum, fillCache bool) iterator.Iterator { - r.mu.RLock() - defer r.mu.RUnlock() - - if r.err != nil { - return iterator.NewEmptyIterator(r.err) - } - - return r.getDataIter(dataBH, slice, checksum, fillCache) -} - func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Range, inclLimit bool) *blockIter { bi := &blockIter{ tr: r, @@ -726,12 +740,31 @@ func (r *Reader) newBlockIter(b *block, bReleaser util.Releaser, slice *util.Ran } bi.reset() if bi.offsetStart > bi.offsetLimit { - bi.sErr(errors.New("leveldb/table: Reader: invalid slice range")) + bi.sErr(errors.New("leveldb/table: invalid slice range")) } } return bi } +func (r *Reader) getDataIter(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + b, rel, err := r.readBlockCached(dataBH, 
verifyChecksum, fillCache) + if err != nil { + return iterator.NewEmptyIterator(err) + } + return r.newBlockIter(b, rel, slice, false) +} + +func (r *Reader) getDataIterErr(dataBH blockHandle, slice *util.Range, verifyChecksum, fillCache bool) iterator.Iterator { + r.mu.RLock() + defer r.mu.RUnlock() + + if r.err != nil { + return iterator.NewEmptyIterator(r.err) + } + + return r.getDataIter(dataBH, slice, verifyChecksum, fillCache) +} + // NewIterator creates an iterator from the table. // // Slice allows slicing the iterator to only contains keys in the given @@ -760,10 +793,9 @@ func (r *Reader) NewIterator(slice *util.Range, ro *opt.ReadOptions) iterator.It blockIter: r.newBlockIter(indexBlock, rel, slice, true), tr: r, slice: slice, - checksum: ro.GetStrict(opt.StrictBlockChecksum), fillCache: !ro.GetDontFillCache(), } - return iterator.NewIndexedIterator(index, r.strictIter || ro.GetStrict(opt.StrictIterator), true) + return iterator.NewIndexedIterator(index, opt.GetStrict(r.o, ro, opt.StrictReader)) } // Find finds key/value pair whose key is greater than or equal to the @@ -798,7 +830,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err } dataBH, n := decodeBlockHandle(index.Value()) if n == 0 { - err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)") + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") return } if r.filter != nil { @@ -811,7 +843,7 @@ func (r *Reader) Find(key []byte, ro *opt.ReadOptions) (rkey, value []byte, err rel.Release() } } - data := r.getDataIter(dataBH, nil, ro.GetStrict(opt.StrictBlockChecksum), !ro.GetDontFillCache()) + data := r.getDataIter(dataBH, nil, r.verifyChecksum, !ro.GetDontFillCache()) defer data.Release() if !data.Seek(key) { err = data.Error() @@ -877,7 +909,7 @@ func (r *Reader) OffsetOf(key []byte) (offset int64, err error) { if index.Seek(key) { dataBH, n := decodeBlockHandle(index.Value()) if n == 0 { - err = errors.New("leveldb/table: Reader: invalid table (bad data block handle)") + r.err = r.newErrCorruptedBH(r.indexBH, "bad data block handle") return } offset = int64(dataBH.offset) @@ -914,51 +946,56 @@ func (r *Reader) Release() { } // NewReader creates a new initialized table reader for the file. -// The cache and bpool is optional and can be nil. +// The fi, cache and bpool is optional and can be nil. // // The returned table reader instance is goroutine-safe. 
-func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) *Reader { +func NewReader(f io.ReaderAt, size int64, fi *storage.FileInfo, cache cache.Namespace, bpool *util.BufferPool, o *opt.Options) (*Reader, error) { r := &Reader{ - reader: f, - cache: cache, - bpool: bpool, - cmp: o.GetComparer(), - checksum: o.GetStrict(opt.StrictBlockChecksum), - strictIter: o.GetStrict(opt.StrictIterator), + fi: fi, + reader: f, + cache: cache, + bpool: bpool, + o: o, + cmp: o.GetComparer(), + verifyChecksum: o.GetStrict(opt.StrictBlockChecksum), } if f == nil { - r.err = errors.New("leveldb/table: Reader: nil file") - return r + return nil, errors.New("leveldb/table: nil file") } if size < footerLen { - r.err = errors.New("leveldb/table: Reader: invalid table (file size is too small)") - return r + r.err = r.newErrCorrupted(0, size, "table", "too small") + return r, nil } + footerPos := size - footerLen var footer [footerLen]byte - if _, err := r.reader.ReadAt(footer[:], size-footerLen); err != nil && err != io.EOF { - r.err = fmt.Errorf("leveldb/table: Reader: invalid table (could not read footer): %v", err) + if _, err := r.reader.ReadAt(footer[:], footerPos); err != nil && err != io.EOF { + return nil, err } if string(footer[footerLen-len(magic):footerLen]) != magic { - r.err = errors.New("leveldb/table: Reader: invalid table (bad magic number)") - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad magic number") + return r, nil } // Decode the metaindex block handle. metaBH, n := decodeBlockHandle(footer[:]) if n == 0 { - r.err = errors.New("leveldb/table: Reader: invalid table (bad metaindex block handle)") - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad metaindex block handle") + return r, nil } // Decode the index block handle. r.indexBH, n = decodeBlockHandle(footer[n:]) if n == 0 { - r.err = errors.New("leveldb/table: Reader: invalid table (bad index block handle)") - return r + r.err = r.newErrCorrupted(footerPos, footerLen, "table-footer", "bad index block handle") + return r, nil } // Read metaindex block. metaBlock, err := r.readBlock(metaBH, true) if err != nil { - r.err = err - return r + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } else { + return nil, err + } } // Set data end. r.dataEnd = int64(metaBH.offset) @@ -995,13 +1032,22 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf // Cache index and filter block locally, since we don't have global cache. if cache == nil { - r.indexBlock, r.err = r.readBlock(r.indexBH, true) - if r.err != nil { - return r + r.indexBlock, err = r.readBlock(r.indexBH, true) + if err != nil { + if errors.IsCorrupted(err) { + r.err = err + return r, nil + } else { + return nil, err + } } if r.filter != nil { r.filterBlock, err = r.readFilterBlock(r.filterBH) if err != nil { + if !errors.IsCorrupted(r.err) { + return nil, err + } + // Don't use filter then. 
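With the signature change above, table.NewReader now reports environmental failures (nil file, footer read error) as a returned error, while corruption detected during open is kept inside the returned reader and resurfaces from later reads. A hedged caller-side sketch against the vendored package as patched here (the helper name is mine):

package mypkg // illustrative package name

import (
	"io"

	"github.com/syndtr/goleveldb/leveldb/opt"
	"github.com/syndtr/goleveldb/leveldb/storage"
	"github.com/syndtr/goleveldb/leveldb/table"
)

// openTable opens a single table file; cache and buffer pool are optional
// and passed as nil, which the patched NewReader documents as allowed.
func openTable(f io.ReaderAt, size int64, fi *storage.FileInfo, o *opt.Options) (*table.Reader, error) {
	tr, err := table.NewReader(f, size, fi, nil, nil, o)
	if err != nil {
		// I/O-level failure: no usable reader was produced.
		return nil, err
	}
	// A corrupted table still yields a reader here; the corruption error
	// (wrapped with fi via errors.ErrCorrupted) comes back from Find,
	// NewIterator, etc. once the damaged parts are touched.
	return tr, nil
}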
r.filter = nil r.filterBH = blockHandle{} @@ -1009,5 +1055,5 @@ func NewReader(f io.ReaderAt, size int64, cache cache.Namespace, bpool *util.Buf } } - return r + return r, nil } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go index c0ac70d9..beacdc1f 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table.go @@ -133,9 +133,9 @@ Filter block trailer: +- 4-bytes -+ / \ - +---------------+---------------+---------------+-------------------------+------------------+ - | offset 1 | .... | offset n | filter offset (4-bytes) | base Lg (1-byte) | - +-------------- +---------------+---------------+-------------------------+------------------+ + +---------------+---------------+---------------+-------------------------------+------------------+ + | data 1 offset | .... | data n offset | data-offsets offset (4-bytes) | base Lg (1-byte) | + +-------------- +---------------+---------------+-------------------------------+------------------+ NOTE: All fixed-length integer are little-endian. diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go index 3e6e8583..6662a599 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/table_test.go @@ -59,7 +59,8 @@ var _ = testutil.Defer(func() { It("Should be able to approximate offset of a key correctly", func() { Expect(err).ShouldNot(HaveOccurred()) - tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o) + tr, err := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o) + Expect(err).ShouldNot(HaveOccurred()) CheckOffset := func(key string, expect, threshold int) { offset, err := tr.OffsetOf([]byte(key)) Expect(err).ShouldNot(HaveOccurred()) @@ -95,7 +96,7 @@ var _ = testutil.Defer(func() { tw.Close() // Opening the table. - tr := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, o) + tr, _ := NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), nil, nil, nil, o) return tableWrapper{tr} } Test := func(kv *testutil.KeyValue, body func(r *Reader)) func() { diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go index 4e19e93a..274c95fa 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/table/writer.go @@ -12,7 +12,7 @@ import ( "fmt" "io" - "code.google.com/p/snappy-go/snappy" + "github.com/syndtr/gosnappy/snappy" "github.com/syndtr/goleveldb/leveldb/comparer" "github.com/syndtr/goleveldb/leveldb/filter" diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go index 4b87b5ef..2e2d81fa 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/db.go @@ -12,6 +12,7 @@ import ( . 
"github.com/onsi/gomega" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/iterator" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -110,7 +111,7 @@ func (t *DBTesting) TestAllPresent() { func (t *DBTesting) TestDeletedKey(key []byte) { _, err := t.DB.TestGet(key) - Expect(err).Should(Equal(util.ErrNotFound), "Get on deleted key %q, %s", key, t.text()) + Expect(err).Should(Equal(errors.ErrNotFound), "Get on deleted key %q, %s", key, t.text()) } func (t *DBTesting) TestAllDeleted() { diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go index 0cead03b..3b28f8bf 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/testutil/kvtest.go @@ -13,6 +13,7 @@ import ( . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" + "github.com/syndtr/goleveldb/leveldb/errors" "github.com/syndtr/goleveldb/leveldb/util" ) @@ -59,7 +60,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB, } rkey, _, err := db.TestFind(key) Expect(err).Should(HaveOccurred(), "Find for key %q yield key %q", key, rkey) - Expect(err).Should(Equal(util.ErrNotFound)) + Expect(err).Should(Equal(errors.ErrNotFound)) } }) @@ -77,7 +78,7 @@ func KeyValueTesting(rnd *rand.Rand, kv KeyValue, p DB, setup func(KeyValue) DB, if len(key_) > 0 { _, err = db.TestGet(key_) Expect(err).Should(HaveOccurred(), "Error for key %q", key_) - Expect(err).Should(Equal(util.ErrNotFound)) + Expect(err).Should(Equal(errors.ErrNotFound)) } }) } diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go index a43d2e46..1a5bf71a 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util.go @@ -14,10 +14,10 @@ import ( ) func shorten(str string) string { - if len(str) <= 4 { + if len(str) <= 8 { return str } - return str[:1] + ".." + str[len(str)-1:] + return str[:3] + ".." + str[len(str)-3:] } var bunits = [...]string{"", "Ki", "Mi", "Gi"} diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go index d9509d2a..2b8453d7 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/buffer_pool.go @@ -8,6 +8,7 @@ package util import ( "fmt" + "sync" "sync/atomic" "time" ) @@ -19,17 +20,16 @@ type buffer struct { // BufferPool is a 'buffer pool'. 
type BufferPool struct { - pool [6]chan []byte - size [5]uint32 - sizeMiss [5]uint32 - sizeHalf [5]uint32 - baseline [4]int - baselinex0 int - baselinex1 int - baseline0 int - baseline1 int - baseline2 int - close chan struct{} + pool [6]chan []byte + size [5]uint32 + sizeMiss [5]uint32 + sizeHalf [5]uint32 + baseline [4]int + baseline0 int + + mu sync.RWMutex + closed bool + closeC chan struct{} get uint32 put uint32 @@ -58,6 +58,13 @@ func (p *BufferPool) Get(n int) []byte { return make([]byte, n) } + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return make([]byte, n) + } + atomic.AddUint32(&p.get, 1) poolNum := p.poolNum(n) @@ -153,12 +160,16 @@ func (p *BufferPool) Put(b []byte) { return } + p.mu.RLock() + defer p.mu.RUnlock() + + if p.closed { + return + } + atomic.AddUint32(&p.put, 1) pool := p.pool[p.poolNum(cap(b))] - defer func() { - recover() - }() select { case pool <- b: default: @@ -171,10 +182,12 @@ func (p *BufferPool) Close() { return } - select { - case p.close <- struct{}{}: - default: + p.mu.Lock() + if !p.closed { + p.closed = true + p.closeC <- struct{}{} } + p.mu.Unlock() } func (p *BufferPool) String() string { @@ -197,7 +210,8 @@ func (p *BufferPool) drain() { default: } } - case <-p.close: + case <-p.closeC: + close(p.closeC) for _, ch := range p.pool { close(ch) } @@ -214,7 +228,7 @@ func NewBufferPool(baseline int) *BufferPool { p := &BufferPool{ baseline0: baseline, baseline: [...]int{baseline / 4, baseline / 2, baseline * 2, baseline * 4}, - close: make(chan struct{}, 1), + closeC: make(chan struct{}, 1), } for i, cap := range []int{2, 2, 4, 4, 2, 1} { p.pool[i] = make(chan []byte, cap) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go index f690e484..f3597686 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/util/util.go @@ -12,7 +12,6 @@ import ( ) var ( - ErrNotFound = errors.New("leveldb: not found") ErrReleased = errors.New("leveldb: resource already relesed") ErrHasReleaser = errors.New("leveldb: releaser already defined") ) diff --git a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go index 81fd9ee0..e598648e 100644 --- a/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go +++ b/Godeps/_workspace/src/github.com/syndtr/goleveldb/leveldb/version.go @@ -7,7 +7,6 @@ package leveldb import ( - "errors" "sync/atomic" "unsafe" @@ -16,19 +15,6 @@ import ( "github.com/syndtr/goleveldb/leveldb/util" ) -var levelMaxSize [kNumLevels]float64 - -func init() { - // Precompute max size of each level - for level := range levelMaxSize { - res := float64(10 * 1048576) - for n := level; n > 1; n-- { - res *= 10 - } - levelMaxSize[level] = res - } -} - type tSet struct { level int table *tFile @@ -37,7 +23,7 @@ type tSet struct { type version struct { s *session - tables [kNumLevels]tFiles + tables []tFiles // Level that should be compacted next and its compaction score. // Score < 1 means compaction is not strictly needed. These fields @@ -47,11 +33,16 @@ type version struct { cSeek unsafe.Pointer - ref int + ref int + // Succeeding version. 
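The buffer-pool rework above replaces the old recover()-based Put with an explicit closed flag guarded by an RWMutex, so Get and Put after Close degrade to plain allocations instead of risking a panic on a closed channel. A small usage sketch against the util package as patched (helper names are mine):

package main

import (
	"fmt"

	"github.com/syndtr/goleveldb/leveldb/util"
)

// copyViaPool borrows a scratch buffer from the pool for the copy.
func copyViaPool(pool *util.BufferPool, src []byte) []byte {
	buf := pool.Get(len(src)) // pooled if available, freshly allocated otherwise
	n := copy(buf, src)
	out := append([]byte(nil), buf[:n]...)
	pool.Put(buf) // after Close this silently becomes a no-op
	return out
}

func main() {
	pool := util.NewBufferPool(32 * 1024)
	fmt.Println(string(copyViaPool(pool, []byte("hello"))))
	pool.Close()                   // the closed flag is set once under the mutex
	fmt.Println(len(pool.Get(16))) // still works, just bypasses the pool
}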
next *version } -func (v *version) release_NB() { +func newVersion(s *session) *version { + return &version{s: s, tables: make([]tFiles, s.o.GetNumLevel())} +} + +func (v *version) releaseNB() { v.ref-- if v.ref > 0 { return @@ -77,13 +68,13 @@ func (v *version) release_NB() { } } - v.next.release_NB() + v.next.releaseNB() v.next = nil } func (v *version) release() { v.s.vmu.Lock() - v.release_NB() + v.releaseNB() v.s.vmu.Unlock() } @@ -130,10 +121,11 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool, tset *tSet tseek bool - l0found bool - l0seq uint64 - l0vt vType - l0val []byte + // Level-0. + zfound bool + zseq uint64 + zkt kType + zval []byte ) err = ErrNotFound @@ -150,55 +142,52 @@ func (v *version) get(ikey iKey, ro *opt.ReadOptions) (value []byte, tcomp bool, } } - ikey__, val_, err_ := v.s.tops.find(t, ikey, ro) - switch err_ { + fikey, fval, ferr := v.s.tops.find(t, ikey, ro) + switch ferr { case nil: case ErrNotFound: return true default: - err = err_ + err = ferr return false } - ikey_ := iKey(ikey__) - if seq, vt, ok := ikey_.parseNum(); ok { - if v.s.icmp.uCompare(ukey, ikey_.ukey()) != 0 { - return true - } - - if level == 0 { - if seq >= l0seq { - l0found = true - l0seq = seq - l0vt = vt - l0val = val_ + if fukey, fseq, fkt, fkerr := parseIkey(fikey); fkerr == nil { + if v.s.icmp.uCompare(ukey, fukey) == 0 { + if level == 0 { + if fseq >= zseq { + zfound = true + zseq = fseq + zkt = fkt + zval = fval + } + } else { + switch fkt { + case ktVal: + value = fval + err = nil + case ktDel: + default: + panic("leveldb: invalid iKey type") + } + return false } - } else { - switch vt { - case tVal: - value = val_ - err = nil - case tDel: - default: - panic("leveldb: invalid internal key type") - } - return false } } else { - err = errors.New("leveldb: internal key corrupted") + err = fkerr return false } return true }, func(level int) bool { - if l0found { - switch l0vt { - case tVal: - value = l0val + if zfound { + switch zkt { + case ktVal: + value = zval err = nil - case tDel: + case ktDel: default: - panic("leveldb: invalid internal key type") + panic("leveldb: invalid iKey type") } return false } @@ -216,13 +205,13 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it its = append(its, it) } - strict := v.s.o.GetStrict(opt.StrictIterator) || ro.GetStrict(opt.StrictIterator) + strict := opt.GetStrict(v.s.o.Options, ro, opt.StrictReader) for _, tables := range v.tables[1:] { if len(tables) == 0 { continue } - it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict, true) + it := iterator.NewIndexedIterator(tables.newIndexIterator(v.s.tops, v.s.icmp, slice, ro), strict) its = append(its, it) } @@ -230,7 +219,7 @@ func (v *version) getIterators(slice *util.Range, ro *opt.ReadOptions) (its []it } func (v *version) newStaging() *versionStaging { - return &versionStaging{base: v} + return &versionStaging{base: v, tables: make([]tablesScratch, v.s.o.GetNumLevel())} } // Spawn a new version based on this version. 
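In get() above, level-0 matches are accumulated in the z* fields and only resolved after the level is exhausted, because level-0 tables overlap and the entry with the highest sequence number must win; on higher levels the first match is authoritative. A standalone sketch of that level-0 rule:

package main

import "fmt"

// l0hit is one matching entry found in a level-0 table.
type l0hit struct {
	seq     uint64
	deleted bool
	value   []byte
}

// resolveLevel0 keeps the hit with the highest sequence number, mirroring
// the zfound/zseq/zkt/zval bookkeeping in version.get.
func resolveLevel0(hits []l0hit) (value []byte, found bool) {
	var best *l0hit
	for i := range hits {
		if best == nil || hits[i].seq >= best.seq {
			best = &hits[i]
		}
	}
	if best == nil || best.deleted {
		return nil, false // never written, or the newest record is a deletion
	}
	return best.value, true
}

func main() {
	hits := []l0hit{
		{seq: 7, value: []byte("old")},
		{seq: 9, deleted: true}, // newest record is a tombstone
	}
	_, ok := resolveLevel0(hits)
	fmt.Println(ok) // false: the key reads as deleted
}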
@@ -285,12 +274,13 @@ func (v *version) offsetOf(ikey iKey) (n uint64, err error) { func (v *version) pickLevel(umin, umax []byte) (level int) { if !v.tables[0].overlaps(v.s.icmp, umin, umax, true) { var overlaps tFiles - for ; level < kMaxMemCompactLevel; level++ { + maxLevel := v.s.o.GetMaxMemCompationLevel() + for ; level < maxLevel; level++ { if v.tables[level+1].overlaps(v.s.icmp, umin, umax, false) { break } overlaps = v.tables[level+2].getOverlaps(overlaps, v.s.icmp, umin, umax, false) - if overlaps.size() > kMaxGrandParentOverlapBytes { + if overlaps.size() > uint64(v.s.o.GetCompactionGPOverlaps(level)) { break } } @@ -318,9 +308,9 @@ func (v *version) computeCompaction() { // file size is small (perhaps because of a small write-buffer // setting, or very high compression ratios, or lots of // overwrites/deletions). - score = float64(len(tables)) / kL0_CompactionTrigger + score = float64(len(tables)) / float64(v.s.o.GetCompactionL0Trigger()) } else { - score = float64(tables.size()) / levelMaxSize[level] + score = float64(tables.size()) / float64(v.s.o.GetCompactionTotalSize(level)) } if score > bestScore { @@ -337,12 +327,14 @@ func (v *version) needCompaction() bool { return v.cScore >= 1 || atomic.LoadPointer(&v.cSeek) != nil } +type tablesScratch struct { + added map[uint64]atRecord + deleted map[uint64]struct{} +} + type versionStaging struct { base *version - tables [kNumLevels]struct { - added map[uint64]ntRecord - deleted map[uint64]struct{} - } + tables []tablesScratch } func (p *versionStaging) commit(r *sessionRecord) { @@ -367,7 +359,7 @@ func (p *versionStaging) commit(r *sessionRecord) { tm := &(p.tables[r.level]) if tm.added == nil { - tm.added = make(map[uint64]ntRecord) + tm.added = make(map[uint64]atRecord) } tm.added[r.num] = r @@ -379,7 +371,7 @@ func (p *versionStaging) commit(r *sessionRecord) { func (p *versionStaging) finish() *version { // Build new version. - nv := &version{s: p.base.s} + nv := newVersion(p.base.s) for level, tm := range p.tables { btables := p.base.tables[level] @@ -402,7 +394,7 @@ func (p *versionStaging) finish() *version { // New tables. for _, r := range tm.added { - nt = append(nt, r.makeFile(p.base.s)) + nt = append(nt, p.base.s.tableFileFromRecord(r)) } // Sort tables. 
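computeCompaction above now pulls its thresholds from the options instead of package-level constants: level 0 is scored by file count against the L0 trigger, higher levels by total bytes against the level's size budget, and any score of 1 or more makes the level a compaction candidate. A worked standalone sketch of that scoring (the threshold values are illustrative; 4 files is the classic LevelDB L0 trigger):

package main

import "fmt"

// score mirrors the rule in computeCompaction: file count drives level 0,
// total size drives every other level.
func score(level, files int, size, l0Trigger, sizeBudget uint64) float64 {
	if level == 0 {
		return float64(files) / float64(l0Trigger)
	}
	return float64(size) / float64(sizeBudget)
}

func main() {
	fmt.Println(score(0, 6, 0, 4, 1))          // 1.5 -> compact level 0
	fmt.Println(score(1, 0, 8<<20, 4, 10<<20)) // 0.8 -> leave level 1 alone
}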
@@ -429,7 +421,7 @@ func (vr *versionReleaser) Release() { v := vr.v v.s.vmu.Lock() if !vr.once { - v.release_NB() + v.releaseNB() vr.once = true } v.s.vmu.Unlock() diff --git a/Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/decode.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go similarity index 100% rename from Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/decode.go rename to Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/decode.go diff --git a/Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/encode.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go similarity index 100% rename from Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/encode.go rename to Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/encode.go diff --git a/Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go similarity index 100% rename from Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy.go rename to Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy.go diff --git a/Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy_test.go b/Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go similarity index 100% rename from Godeps/_workspace/src/code.google.com/p/snappy-go/snappy/snappy_test.go rename to Godeps/_workspace/src/github.com/syndtr/gosnappy/snappy/snappy_test.go