all: Implement variable sized blocks (fixes #4807)

This commit is contained in:
Jakob Borg
2018-04-16 20:08:50 +02:00
committed by Audrius Butkevicius
parent 01aef75c96
commit 19c7cd99f5
27 changed files with 536 additions and 293 deletions

View File

@@ -63,7 +63,6 @@ func HashFile(ctx context.Context, fs fs.Filesystem, path string, blockSize int,
// is closed and all items handled.
type parallelHasher struct {
fs fs.Filesystem
blockSize int
workers int
outbox chan<- protocol.FileInfo
inbox <-chan protocol.FileInfo
@@ -73,10 +72,9 @@ type parallelHasher struct {
wg sync.WaitGroup
}
func newParallelHasher(ctx context.Context, fs fs.Filesystem, blockSize, workers int, outbox chan<- protocol.FileInfo, inbox <-chan protocol.FileInfo, counter Counter, done chan<- struct{}, useWeakHashes bool) {
func newParallelHasher(ctx context.Context, fs fs.Filesystem, workers int, outbox chan<- protocol.FileInfo, inbox <-chan protocol.FileInfo, counter Counter, done chan<- struct{}, useWeakHashes bool) {
ph := &parallelHasher{
fs: fs,
blockSize: blockSize,
workers: workers,
outbox: outbox,
inbox: inbox,
@@ -108,7 +106,7 @@ func (ph *parallelHasher) hashFiles(ctx context.Context) {
panic("Bug. Asked to hash a directory or a deleted file.")
}
blocks, err := HashFile(ctx, ph.fs, f.Name, ph.blockSize, ph.counter, ph.useWeakHashes)
blocks, err := HashFile(ctx, ph.fs, f.Name, f.BlockSize(), ph.counter, ph.useWeakHashes)
if err != nil {
l.Debugln("hash error:", f.Name, err)
continue

View File

@@ -125,7 +125,7 @@ func TestAdler32Variants(t *testing.T) {
}
// protocol block sized data
data := make([]byte, protocol.BlockSize)
data := make([]byte, protocol.MinBlockSize)
for i := 0; i < 5; i++ {
rand.Read(data)
if !checkFn(data) {

View File

@@ -55,6 +55,48 @@ func (i infiniteFS) Open(name string) (fs.File, error) {
return &fakeFile{name, i.filesize, 0}, nil
}
// singleFileFS is a fake filesystem for tests that exposes exactly one file,
// listed directly under the root directory ".". Only Lstat, Stat, DirNames
// and Open are overridden here; every other method is promoted from the
// embedded fs.Filesystem.
type singleFileFS struct {
fs.Filesystem
// name is the name of the only file in the filesystem.
name string
// filesize is the size reported for that file.
filesize int64
}
// Lstat returns fake file info for the root directory (".", size zero) and
// for the single file (its configured name and size). Any other name yields
// a "no such file" error.
func (s singleFileFS) Lstat(name string) (fs.FileInfo, error) {
	if name == "." {
		return fakeInfo{".", 0}, nil
	}
	if name == s.name {
		return fakeInfo{s.name, s.filesize}, nil
	}
	return nil, errors.New("no such file")
}
// Stat returns the same results as Lstat: fake info for "." and for the
// single file, and a "no such file" error for every other name.
func (s singleFileFS) Stat(name string) (fs.FileInfo, error) {
	return s.Lstat(name)
}
// DirNames lists the contents of the root directory ".", which is always
// just the single file. Listing any other name fails with "no such file".
func (s singleFileFS) DirNames(name string) ([]string, error) {
	switch name {
	case ".":
		return []string{s.name}, nil
	default:
		return nil, errors.New("no such file")
	}
}
// Open returns a fakeFile built from the single file's name and size when
// asked for that file, and a "no such file" error for any other name.
func (s singleFileFS) Open(name string) (fs.File, error) {
	switch name {
	case s.name:
		return &fakeFile{s.name, s.filesize, 0}, nil
	default:
		return nil, errors.New("no such file")
	}
}
type fakeInfo struct {
name string
size int64

View File

@@ -42,8 +42,6 @@ type Config struct {
Folder string
// Limit walking to these paths within Dir, or no limit if Sub is empty
Subs []string
// BlockSize controls the size of the block used when hashing.
BlockSize int
// If Matcher is not nil, it is used to identify files to ignore which were specified by the user.
Matcher *ignore.Matcher
// Number of hours to keep temporary files for
@@ -68,6 +66,8 @@ type Config struct {
ProgressTickIntervalS int
// Whether or not we should also compute weak hashes
UseWeakHashes bool
// Whether to use large blocks for large files or the old standard of 128KiB for everything.
UseLargeBlocks bool
}
type CurrentFiler interface {
@@ -98,7 +98,7 @@ type walker struct {
// Walk returns the list of files found in the local folder by scanning the
// file system. Files are blockwise hashed.
func (w *walker) walk(ctx context.Context) chan protocol.FileInfo {
l.Debugln("Walk", w.Subs, w.BlockSize, w.Matcher)
l.Debugln("Walk", w.Subs, w.Matcher)
toHashChan := make(chan protocol.FileInfo)
finishedChan := make(chan protocol.FileInfo)
@@ -120,7 +120,7 @@ func (w *walker) walk(ctx context.Context) chan protocol.FileInfo {
// We're not required to emit scan progress events, just kick off hashers,
// and feed inputs directly from the walker.
if w.ProgressTickIntervalS < 0 {
newParallelHasher(ctx, w.Filesystem, w.BlockSize, w.Hashers, finishedChan, toHashChan, nil, nil, w.UseWeakHashes)
newParallelHasher(ctx, w.Filesystem, w.Hashers, finishedChan, toHashChan, nil, nil, w.UseWeakHashes)
return finishedChan
}
@@ -151,7 +151,7 @@ func (w *walker) walk(ctx context.Context) chan protocol.FileInfo {
done := make(chan struct{})
progress := newByteCounter()
newParallelHasher(ctx, w.Filesystem, w.BlockSize, w.Hashers, finishedChan, realToHashChan, progress, done, w.UseWeakHashes)
newParallelHasher(ctx, w.Filesystem, w.Hashers, finishedChan, realToHashChan, progress, done, w.UseWeakHashes)
// A routine which actually emits the FolderScanProgress events
// every w.ProgressTicker ticks, until the hasher routines terminate.
@@ -161,7 +161,7 @@ func (w *walker) walk(ctx context.Context) chan protocol.FileInfo {
for {
select {
case <-done:
l.Debugln("Walk progress done", w.Folder, w.Subs, w.BlockSize, w.Matcher)
l.Debugln("Walk progress done", w.Folder, w.Subs, w.Matcher)
ticker.Stop()
return
case <-ticker.C:
@@ -285,32 +285,52 @@ func (w *walker) walkRegular(ctx context.Context, relPath string, info fs.FileIn
curMode |= 0111
}
cf, ok := w.CurrentFiler.CurrentFile(relPath)
blockSize := protocol.MinBlockSize
curFile, hasCurFile := w.CurrentFiler.CurrentFile(relPath)
if w.UseLargeBlocks {
blockSize = protocol.BlockSize(info.Size())
if hasCurFile {
// Check if we should retain current block size.
curBlockSize := curFile.BlockSize()
if blockSize > curBlockSize && blockSize/curBlockSize <= 2 {
// New block size is larger, but not more than twice larger.
// Retain.
blockSize = curBlockSize
} else if curBlockSize > blockSize && curBlockSize/blockSize <= 2 {
// Old block size is larger, but not more than twice larger.
// Retain.
blockSize = curBlockSize
}
}
}
f := protocol.FileInfo{
Name: relPath,
Type: protocol.FileInfoTypeFile,
Version: cf.Version.Update(w.ShortID),
Version: curFile.Version.Update(w.ShortID),
Permissions: curMode & uint32(maskModePerm),
NoPermissions: w.IgnorePerms,
ModifiedS: info.ModTime().Unix(),
ModifiedNs: int32(info.ModTime().Nanosecond()),
ModifiedBy: w.ShortID,
Size: info.Size(),
RawBlockSize: int32(blockSize),
}
if ok {
if cf.IsEquivalent(f, w.IgnorePerms, true) {
if hasCurFile {
if curFile.IsEquivalent(f, w.IgnorePerms, true) {
return nil
}
if cf.Invalid {
if curFile.Invalid {
// We do not want to override the global version with the file we
// currently have. Keeping only our local counter makes sure we are in
// conflict with any other existing versions, which will be resolved by
// the normal pulling mechanisms.
f.Version = f.Version.DropOthers(w.ShortID)
}
l.Debugln("rescan:", cf, info.ModTime().Unix(), info.Mode()&fs.ModePerm)
l.Debugln("rescan:", curFile, info.ModTime().Unix(), info.Mode()&fs.ModePerm)
}
l.Debugln("to hash:", relPath, f)

View File

@@ -65,7 +65,6 @@ func TestWalkSub(t *testing.T) {
fchan := Walk(context.TODO(), Config{
Filesystem: fs.NewFilesystem(fs.FilesystemTypeBasic, "testdata"),
Subs: []string{"dir2"},
BlockSize: 128 * 1024,
Matcher: ignores,
Hashers: 2,
})
@@ -98,7 +97,6 @@ func TestWalk(t *testing.T) {
fchan := Walk(context.TODO(), Config{
Filesystem: fs.NewFilesystem(fs.FilesystemTypeBasic, "testdata"),
BlockSize: 128 * 1024,
Matcher: ignores,
Hashers: 2,
})
@@ -221,11 +219,11 @@ func TestNormalization(t *testing.T) {
// make sure it all gets done. In production, things will be correct
// eventually...
_, err := walkDir(fs, "testdata/normalization")
_, err := walkDir(fs, "testdata/normalization", nil)
if err != nil {
t.Fatal(err)
}
tmp, err := walkDir(fs, "testdata/normalization")
tmp, err := walkDir(fs, "testdata/normalization", nil)
if err != nil {
t.Fatal(err)
}
@@ -272,7 +270,7 @@ func TestWalkSymlinkUnix(t *testing.T) {
for _, path := range []string{".", "link"} {
// Scan it
files, _ := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, "_symlinks"), path)
files, _ := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, "_symlinks"), path, nil)
// Verify that we got one symlink and with the correct attributes
if len(files) != 1 {
@@ -303,7 +301,7 @@ func TestWalkSymlinkWindows(t *testing.T) {
for _, path := range []string{".", "link"} {
// Scan it
files, _ := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, "_symlinks"), path)
files, _ := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, "_symlinks"), path, nil)
// Verify that we got zero symlinks
if len(files) != 0 {
@@ -332,7 +330,7 @@ func TestWalkRootSymlink(t *testing.T) {
}
// Scan it
files, err := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, link), ".")
files, err := walkDir(fs.NewFilesystem(fs.FilesystemTypeBasic, link), ".", nil)
if err != nil {
t.Fatal("Expected no error when root folder path is provided via a symlink: " + err.Error())
}
@@ -342,13 +340,83 @@ func TestWalkRootSymlink(t *testing.T) {
}
}
func walkDir(fs fs.Filesystem, dir string) ([]protocol.FileInfo, error) {
// TestBlocksizeHysteresis checks which block size a scan assigns to a 500 MiB
// file depending on the block size already recorded for that file.
func TestBlocksizeHysteresis(t *testing.T) {
	const (
		fileName = "testfile.dat"
		fileSize = 500 << 20 // 500 MiB
	)

	sf := fs.NewWalkFilesystem(&singleFileFS{
		name:     fileName,
		filesize: fileSize,
	})

	current := make(fakeCurrentFiler)

	// The scenarios run in order against the same current filer. A zero
	// prevBlockSize means that no entry is stored before the scan.
	scenarios := []struct {
		prevBlockSize int32
		wantBlockSize int
	}{
		// No previous knowledge. We should get a 512 KiB block size.
		{prevBlockSize: 0, wantBlockSize: 512 << 10},
		// Previous block size 256 KiB is within a factor of two of 512 KiB,
		// so it is retained.
		{prevBlockSize: 256 << 10, wantBlockSize: 256 << 10},
		// Previous block size 1 MiB is within a factor of two of 512 KiB, so
		// it is retained.
		{prevBlockSize: 1 << 20, wantBlockSize: 1 << 20},
		// Previous block size 128 KiB is too far off; move to 512 KiB.
		{prevBlockSize: 128 << 10, wantBlockSize: 512 << 10},
		// Previous block size 2 MiB is too far off; move to 512 KiB.
		{prevBlockSize: 2 << 20, wantBlockSize: 512 << 10},
	}

	for _, sc := range scenarios {
		if sc.prevBlockSize != 0 {
			current[fileName] = protocol.FileInfo{
				Name:         fileName,
				Size:         fileSize,
				RawBlockSize: sc.prevBlockSize,
			}
		}

		files, err := walkDir(sf, ".", current)
		if err != nil {
			t.Fatal(err)
		}
		if n := len(files); n != 1 {
			t.Fatalf("expected one file, not %d", n)
		}
		if got := files[0].BlockSize(); got != sc.wantBlockSize {
			t.Fatalf("incorrect block size %d != expected %d", got, sc.wantBlockSize)
		}
	}
}
func walkDir(fs fs.Filesystem, dir string, cfiler CurrentFiler) ([]protocol.FileInfo, error) {
fchan := Walk(context.TODO(), Config{
Filesystem: fs,
Subs: []string{dir},
BlockSize: 128 * 1024,
AutoNormalize: true,
Hashers: 2,
Filesystem: fs,
Subs: []string{dir},
AutoNormalize: true,
Hashers: 2,
UseLargeBlocks: true,
CurrentFiler: cfiler,
})
var tmp []protocol.FileInfo
@@ -410,7 +478,7 @@ func BenchmarkHashFile(b *testing.B) {
b.ResetTimer()
for i := 0; i < b.N; i++ {
if _, err := HashFile(context.TODO(), fs.NewFilesystem(fs.FilesystemTypeBasic, ""), testdataName, protocol.BlockSize, nil, true); err != nil {
if _, err := HashFile(context.TODO(), fs.NewFilesystem(fs.FilesystemTypeBasic, ""), testdataName, protocol.MinBlockSize, nil, true); err != nil {
b.Fatal(err)
}
}
@@ -451,7 +519,6 @@ func TestStopWalk(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
fchan := Walk(ctx, Config{
Filesystem: fs,
BlockSize: 128 * 1024,
Hashers: numHashers,
ProgressTickIntervalS: -1, // Don't attempt to build the full list of files before starting to scan...
})
@@ -513,7 +580,7 @@ func TestIssue4799(t *testing.T) {
}
fd.Close()
files, err := walkDir(fs, "/foo")
files, err := walkDir(fs, "/foo", nil)
if err != nil {
t.Fatal(err)
}
@@ -540,7 +607,6 @@ func TestIssue4841(t *testing.T) {
fchan := Walk(context.TODO(), Config{
Filesystem: fs,
Subs: nil,
BlockSize: 128 * 1024,
AutoNormalize: true,
Hashers: 2,
CurrentFiler: fakeCurrentFiler{