Refactor out file scanner into separate package

2014-03-08 23:02:01 +01:00
parent d6c9afd07f
commit 1448cfe66a
21 changed files with 471 additions and 338 deletions
--- a/scanner/blocks.go
+++ b/scanner/blocks.go
@@ -0,0 +1,74 @@
+package scanner
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"io"
+)
+
+// Block describes one chunk of data hashed by Blocks.
+//
+// Offset is the position of the chunk's first byte in the data, Size is the
+// number of bytes in the chunk and Hash is the SHA-256 digest of those bytes.
+type Block struct {
+	Offset int64
+	Size   uint32
+	Hash   []byte
+}
+
+// Blocks reads r to the end and returns the blockwise hash of its contents:
+// one Block per consecutive chunk of at most blocksize bytes, in order, each
+// carrying its offset, its length and the SHA-256 digest of the chunk.
+//
+// Empty input yields a single zero-length block holding the SHA-256 digest of
+// no data, so the result always contains at least one block. Note that a
+// blocksize of zero or less reads nothing and therefore also produces that
+// single empty block. A read error is returned as is, with a nil block list.
+func Blocks(r io.Reader, blocksize int) ([]Block, error) {
+	var (
+		blocks []Block
+		offset int64
+	)
+
+	hf := sha256.New()
+	for {
+		hf.Reset()
+
+		// CopyN reports io.EOF when the reader runs dry before a full
+		// block was read; that is the normal end of input, not a failure.
+		n, err := io.CopyN(hf, r, int64(blocksize))
+		if err != nil && err != io.EOF {
+			return nil, err
+		}
+		if n == 0 {
+			break
+		}
+
+		blocks = append(blocks, Block{
+			Offset: offset,
+			Size:   uint32(n),
+			Hash:   hf.Sum(nil),
+		})
+		offset += n
+	}
+
+	if len(blocks) > 0 {
+		return blocks, nil
+	}
+
+	// Empty file: represent it by one empty block with the digest of "".
+	empty := sha256.Sum256(nil)
+	return []Block{{Offset: 0, Size: 0, Hash: empty[:]}}, nil
+}
+
+// BlockDiff compares two block lists position by position and reports which
+// blocks of tgt are already present in src (have) and which must be fetched
+// to transform src into tgt (need). Both lists must have been created with
+// the same block size.
+//
+// An empty tgt yields two nil lists; an empty src yields tgt itself as need.
+func BlockDiff(src, tgt []Block) (have, need []Block) {
+	switch {
+	case len(tgt) == 0:
+		// Nothing is wanted, so nothing is shared and nothing is missing.
+		return nil, nil
+	case len(src) == 0:
+		// Nothing to reuse: copy the entire file.
+		return nil, tgt
+	}
+
+	for idx, blk := range tgt {
+		if idx < len(src) && bytes.Equal(blk.Hash, src[idx].Hash) {
+			have = append(have, blk)
+			continue
+		}
+		// Block differs or lies beyond the end of src.
+		need = append(need, blk)
+	}
+
+	return have, need
+}
--- a/scanner/blocks_test.go
+++ b/scanner/blocks_test.go
@@ -0,0 +1,116 @@
+package scanner
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+)
+
+// blocksTestData lists inputs for TestBlocks: the data to hash, the block
+// size to hash it with, and the expected hex-encoded hash of every resulting
+// block in order.
+var blocksTestData = []struct {
+	data      []byte
+	blocksize int
+	hash      []string
+}{
+	{[]byte(""), 1024, []string{
+		"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"}},
+	{[]byte("contents"), 1024, []string{
+		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
+	{[]byte("contents"), 9, []string{
+		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
+	{[]byte("contents"), 8, []string{
+		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
+	{[]byte("contents"), 7, []string{
+		"ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73",
+		"043a718774c572bd8a25adbeb1bfcd5c0256ae11cecf9f9c3f925d0e52beaf89"},
+	},
+	{[]byte("contents"), 3, []string{
+		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
+		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
+		"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4"},
+	},
+	{[]byte("conconts"), 3, []string{
+		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
+		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
+		"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4"},
+	},
+	{[]byte("contenten"), 3, []string{
+		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
+		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
+		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3"},
+	},
+}
+
+// TestBlocks hashes every entry of blocksTestData and verifies the number of
+// blocks as well as the offset, size and hash of each block. For empty input
+// only the block count is checked, since the offset loop never runs.
+func TestBlocks(t *testing.T) {
+	for _, tc := range blocksTestData {
+		blocks, err := Blocks(bytes.NewBuffer(tc.data), tc.blocksize)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		if got, want := len(blocks), len(tc.hash); got != want {
+			t.Fatalf("Incorrect number of blocks %d != %d", got, want)
+		}
+
+		total := int64(len(tc.data))
+		step := int64(tc.blocksize)
+		for i, off := 0, int64(0); off < total; i, off = i+1, off+step {
+			blk := blocks[i]
+
+			if blk.Offset != off {
+				t.Errorf("Incorrect offset for block %d: %d != %d", i, blk.Offset, off)
+			}
+
+			// The final block holds whatever is left of the data.
+			wantSize := tc.blocksize
+			if remaining := len(tc.data) - int(off); remaining < wantSize {
+				wantSize = remaining
+			}
+			if int(blk.Size) != wantSize {
+				t.Errorf("Incorrect length for block %d: %d != %d", i, blk.Size, wantSize)
+			}
+
+			if got := fmt.Sprintf("%x", blk.Hash); got != tc.hash[i] {
+				t.Errorf("Incorrect block hash %q != %q", got, tc.hash[i])
+			}
+		}
+	}
+}
+
+// diffTestData lists inputs for TestDiff: a is the source content, b the
+// target content, s the block size both are hashed with, and d the blocks
+// BlockDiff is expected to report as needed. Only the offset and size of the
+// expected blocks are compared, so their hashes are left nil.
+var diffTestData = []struct {
+	a string
+	b string
+	s int
+	d []Block
+}{
+	{"contents", "contents", 1024, []Block{}},
+	{"", "", 1024, []Block{}},
+	{"contents", "contents", 3, []Block{}},
+	{"contents", "cantents", 3, []Block{{0, 3, nil}}},
+	{"contents", "contants", 3, []Block{{3, 3, nil}}},
+	{"contents", "cantants", 3, []Block{{0, 3, nil}, {3, 3, nil}}},
+	{"contents", "", 3, []Block{{0, 0, nil}}},
+	{"", "contents", 3, []Block{{0, 3, nil}, {3, 3, nil}, {6, 2, nil}}},
+	{"con", "contents", 3, []Block{{3, 3, nil}, {6, 2, nil}}},
+	{"contents", "con", 3, nil},
+	{"contents", "cont", 3, []Block{{3, 1, nil}}},
+	{"cont", "contents", 3, []Block{{3, 3, nil}, {6, 2, nil}}},
+}
+
+// TestDiff hashes both strings of every diffTestData entry and verifies that
+// BlockDiff reports exactly the expected needed blocks, compared by offset
+// and size.
+func TestDiff(t *testing.T) {
+	for i, test := range diffTestData {
+		// A hashing failure would otherwise surface as a confusing diff
+		// mismatch, so check it explicitly.
+		a, err := Blocks(bytes.NewBufferString(test.a), test.s)
+		if err != nil {
+			t.Fatalf("Hashing source for diff %d: %v", i, err)
+		}
+		b, err := Blocks(bytes.NewBufferString(test.b), test.s)
+		if err != nil {
+			t.Fatalf("Hashing target for diff %d: %v", i, err)
+		}
+
+		_, d := BlockDiff(a, b)
+		if len(d) != len(test.d) {
+			t.Fatalf("Incorrect length for diff %d; %d != %d", i, len(d), len(test.d))
+		}
+
+		for j := range test.d {
+			if d[j].Offset != test.d[j].Offset {
+				t.Errorf("Incorrect offset for diff %d block %d; %d != %d", i, j, d[j].Offset, test.d[j].Offset)
+			}
+			if d[j].Size != test.d[j].Size {
+				t.Errorf("Incorrect length for diff %d block %d; %d != %d", i, j, d[j].Size, test.d[j].Size)
+			}
+		}
+	}
+}
--- a/scanner/debug.go
+++ b/scanner/debug.go
@@ -0,0 +1,12 @@
+package scanner
+
+import (
+	"log"
+	"os"
+	"strings"
+)
+
+// Debug tracing for the scanner package.
+//
+// dlog writes to standard error with the prefix "scanner: ", a microsecond
+// timestamp and the short file name of the call site. debug is true when the
+// STTRACE environment variable contains the string "scanner"; it is evaluated
+// once, when the package is initialized.
+var (
+	dlog  = log.New(os.Stderr, "scanner: ", log.Lmicroseconds|log.Lshortfile)
+	debug = strings.Contains(os.Getenv("STTRACE"), "scanner")
+)
--- a/scanner/file.go
+++ b/scanner/file.go
@@ -0,0 +1,25 @@
+package scanner
+
+import "fmt"
+
+// File is the scanner's record of a single file.
+//
+// As filled in by Walker: Name is the path relative to the walked directory,
+// Flags holds the file mode bits (or protocol.FlagInvalid while changes are
+// suppressed), Modified is the modification time in Unix seconds, Version is
+// bumped when the flags change while the modification time stays the same,
+// Size is the file length in bytes and Blocks is the blockwise hash of the
+// contents.
+type File struct {
+	Name     string
+	Flags    uint32
+	Modified int64
+	Version  uint32
+	Size     int64
+	Blocks   []Block
+}
+
+func (f File) String() string {
+	return fmt.Sprintf("File{Name:%q, Flags:0x%x, Modified:%d, Version:%d, Size:%d, NumBlocks:%d}",
+		f.Name, f.Flags, f.Modified, f.Version, f.Size, len(f.Blocks))
+}
+
+// Equals reports whether f and o carry the same modification time and
+// version. No other field (name, flags, size, blocks) takes part in the
+// comparison.
+func (f File) Equals(o File) bool {
+	if f.Modified != o.Modified {
+		return false
+	}
+	return f.Version == o.Version
+}
+
+// NewerThan reports whether f is more recent than o: a later modification
+// time wins, and on equal modification times the higher version wins.
+func (f File) NewerThan(o File) bool {
+	if f.Modified != o.Modified {
+		return f.Modified > o.Modified
+	}
+	return f.Version > o.Version
+}
--- a/scanner/testdata/.stignore
+++ b/scanner/testdata/.stignore
@@ -0,0 +1,2 @@
+.*
+quux
--- a/scanner/testdata/bar
+++ b/scanner/testdata/bar
@@ -0,0 +1 @@
+foobarbaz
--- a/scanner/testdata/baz/quux
+++ b/scanner/testdata/baz/quux
@@ -0,0 +1 @@
+baazquux
--- a/scanner/testdata/empty
+++ b/scanner/testdata/empty
--- a/scanner/testdata/foo
+++ b/scanner/testdata/foo
@@ -0,0 +1 @@
+foobar
--- a/scanner/walk.go
+++ b/scanner/walk.go
@@ -0,0 +1,259 @@
+package scanner
+
+import (
+	"bytes"
+	"io/ioutil"
+	"log"
+	"os"
+	"path"
+	"path/filepath"
+	"strings"
+	"time"
+
+	"github.com/calmh/syncthing/protocol"
+)
+
+// Walker scans a directory tree and produces the list of files found in it,
+// remembering the result of the previous scan so unchanged files need not be
+// hashed again.
+type Walker struct {
+	// Dir is the base directory for the walk
+	Dir string
+	// If FollowSymlinks is true, symbolic links directly under Dir will be followed.
+	// Symbolic links at deeper levels are never followed regardless of this flag.
+	FollowSymlinks bool
+	// BlockSize controls the size of the block used when hashing.
+	BlockSize int
+	// If IgnoreFile is not empty, it is the name used for the file that holds ignore patterns.
+	IgnoreFile string
+	// If TempNamer is not nil, it is used to ignore temporary files when walking.
+	TempNamer TempNamer
+	// If Suppressor is not nil, it is queried for suppression of modified files.
+	Suppressor Suppressor
+
+	previous   map[string]File // file name -> last seen file state
+	suppressed map[string]bool // file name -> suppression status
+}
+
+// TempNamer maps between file names and the names of their temporary files.
+type TempNamer interface {
+	// TempName returns a temporary name for the file referred to by path.
+	TempName(path string) string
+	// IsTemporary returns true if path refers to the name of a temporary file.
+	IsTemporary(path string) bool
+}
+
+// Suppressor decides whether changes to a file should be held back.
+type Suppressor interface {
+	// Suppress returns true if the update to the named file should be ignored.
+	Suppress(name string, fi os.FileInfo) bool
+}
+
+// Walk returns the list of files found in the local repository by scanning the
+// file system, together with the ignore patterns that were loaded, keyed by
+// the directory (relative to Dir) of the ignore file they came from. Files
+// are blockwise hashed.
+//
+// When FollowSymlinks is set, symbolic links directly under Dir are walked as
+// well. If Dir cannot be opened or listed for that purpose, the results
+// gathered so far are returned.
+func (w *Walker) Walk() (files []File, ignore map[string][]string) {
+	w.lazyInit()
+
+	if debug {
+		dlog.Println("Walk", w.Dir, w.FollowSymlinks, w.BlockSize, w.IgnoreFile)
+	}
+	t0 := time.Now()
+
+	ignore = make(map[string][]string)
+	hashFiles := w.walkAndHashFiles(&files, ignore)
+
+	// The walk callbacks built here always return nil, so the error result
+	// of filepath.Walk carries no information and is not inspected.
+	filepath.Walk(w.Dir, w.loadIgnoreFiles(w.Dir, ignore))
+	filepath.Walk(w.Dir, hashFiles)
+
+	if w.FollowSymlinks {
+		d, err := os.Open(w.Dir)
+		if err != nil {
+			return files, ignore
+		}
+		fis, err := d.Readdir(-1)
+		d.Close()
+		if err != nil {
+			return files, ignore
+		}
+
+		for _, info := range fis {
+			if info.Mode()&os.ModeSymlink == 0 {
+				continue
+			}
+
+			// The trailing separator is kept from the original code;
+			// presumably it makes the walk resolve the link instead of
+			// stopping at the link itself.
+			dir := filepath.Join(w.Dir, info.Name()) + string(filepath.Separator)
+
+			// Ignore patterns must be keyed relative to w.Dir, exactly
+			// like the file names they are matched against. Keying them
+			// relative to the link would let the link's top level
+			// ignore file overwrite the patterns of Dir itself.
+			filepath.Walk(dir, w.loadIgnoreFiles(w.Dir, ignore))
+			filepath.Walk(dir, hashFiles)
+		}
+	}
+
+	if debug {
+		d := time.Since(t0).Seconds()
+		dlog.Printf("Walk in %.02f ms, %.0f files/s", d*1000, float64(len(files))/d)
+	}
+	return files, ignore
+}
+
+// CleanTempFiles removes all files under Dir that match the temporary
+// filename pattern, as decided by TempNamer. It does nothing when no
+// TempNamer is configured, since no file can then be identified as temporary.
+func (w *Walker) CleanTempFiles() {
+	if w.TempNamer == nil {
+		return
+	}
+	filepath.Walk(w.Dir, w.cleanTempFile)
+}
+
+// lazyInit allocates the walker's internal maps on first use, so that a
+// Walker created as a plain struct literal is ready for Walk. Both maps are
+// created together; the presence of previous marks the walker as initialized.
+func (w *Walker) lazyInit() {
+	if w.previous != nil {
+		return
+	}
+	w.previous = make(map[string]File)
+	w.suppressed = make(map[string]bool)
+}
+
+// loadIgnoreFiles returns a walk function that collects ignore patterns.
+//
+// Every file named w.IgnoreFile is read and its non-empty lines are stored in
+// ign under the directory containing the file, expressed relative to dir and
+// without surrounding slashes (the empty string for dir itself). The returned
+// function never reports an error, so the walk always runs to completion.
+func (w *Walker) loadIgnoreFiles(dir string, ign map[string][]string) filepath.WalkFunc {
+	return func(p string, info os.FileInfo, err error) error {
+		if err != nil {
+			return nil
+		}
+
+		// rel is only used to derive the map key. The file itself must be
+		// read through p: a path relative to dir would be resolved against
+		// the current working directory and miss the file.
+		rel, err := filepath.Rel(dir, p)
+		if err != nil {
+			return nil
+		}
+
+		pn, sn := path.Split(rel)
+		if sn != w.IgnoreFile {
+			return nil
+		}
+
+		// Best effort: an unreadable ignore file yields no patterns but is
+		// still recorded, so the read error is deliberately discarded.
+		bs, _ := ioutil.ReadFile(p)
+
+		var patterns []string
+		for _, line := range bytes.Split(bs, []byte("\n")) {
+			if len(line) > 0 {
+				patterns = append(patterns, string(line))
+			}
+		}
+		ign[strings.Trim(pn, "/")] = patterns
+
+		return nil
+	}
+}
+
+// walkAndHashFiles returns a walk function that appends one File to res for
+// every regular file that is not temporary, not an ignore file and not
+// matched by the ignore patterns in ign.
+//
+// A file whose modification time equals the one recorded by the previous
+// walk is not hashed again; its recorded state is reused, with the version
+// bumped if only the mode bits changed. A file held back by the Suppressor is
+// reported with protocol.FlagInvalid and no blocks. Everything else is
+// blockwise hashed and recorded for the next walk. The returned function
+// never reports an error; files that cannot be opened or read are skipped.
+func (w *Walker) walkAndHashFiles(res *[]File, ign map[string][]string) filepath.WalkFunc {
+	return func(p string, info os.FileInfo, err error) error {
+		if err != nil {
+			if debug {
+				dlog.Println("error:", p, info, err)
+			}
+			return nil
+		}
+
+		// rn is the name relative to the repository root and is used for
+		// all bookkeeping. p remains the path handed out by the walk and is
+		// the one that must be used to access the file: rn would be
+		// resolved against the current working directory instead of Dir.
+		rn, err := filepath.Rel(w.Dir, p)
+		if err != nil {
+			return nil
+		}
+
+		if w.TempNamer != nil && w.TempNamer.IsTemporary(rn) {
+			if debug {
+				dlog.Println("temporary:", rn)
+			}
+			return nil
+		}
+
+		if _, sn := path.Split(rn); sn == w.IgnoreFile {
+			if debug {
+				dlog.Println("ignorefile:", rn)
+			}
+			return nil
+		}
+
+		if w.ignoreFile(ign, rn) {
+			if debug {
+				dlog.Println("ignored:", rn)
+			}
+			return nil
+		}
+
+		if info.Mode()&os.ModeType != 0 {
+			// Directories, symlinks, devices and the like are not hashed.
+			return nil
+		}
+
+		modified := info.ModTime().Unix()
+
+		// Only trust the recorded state if there is one. Without the seen
+		// check a new file with a modification time of zero would match the
+		// zero value and be reported as an empty, nameless File.
+		pf, seen := w.previous[rn]
+		if seen && pf.Modified == modified {
+			if nf := uint32(info.Mode()); nf != pf.Flags {
+				if debug {
+					dlog.Println("new flags:", rn)
+				}
+				pf.Flags = nf
+				pf.Version++
+				w.previous[rn] = pf
+			} else if debug {
+				dlog.Println("unchanged:", rn)
+			}
+			*res = append(*res, pf)
+			return nil
+		}
+
+		if w.Suppressor != nil && w.Suppressor.Suppress(rn, info) {
+			if debug {
+				dlog.Println("suppressed:", rn)
+			}
+			if !w.suppressed[rn] {
+				w.suppressed[rn] = true
+				log.Printf("INFO: Changes to %q are being temporarily suppressed because it changes too frequently.", rn)
+			}
+			// NOTE(review): for a file never seen before, pf is the zero
+			// File, so the entry reported here has no name. Kept as in the
+			// original; confirm whether that case can occur.
+			f := pf
+			f.Flags = protocol.FlagInvalid
+			f.Blocks = nil
+			*res = append(*res, f)
+			return nil
+		}
+		if w.suppressed[rn] {
+			log.Printf("INFO: Changes to %q are no longer suppressed.", rn)
+			delete(w.suppressed, rn)
+		}
+
+		fd, err := os.Open(p)
+		if err != nil {
+			return nil
+		}
+		defer fd.Close()
+
+		t0 := time.Now()
+		blocks, err := Blocks(fd, w.BlockSize)
+		if err != nil {
+			if debug {
+				dlog.Println("hash error:", rn, err)
+			}
+			return nil
+		}
+		if debug {
+			t1 := time.Now()
+			dlog.Println("hashed:", rn, ";", len(blocks), "blocks;", info.Size(), "bytes;", int(float64(info.Size())/1024/t1.Sub(t0).Seconds()), "KB/s")
+		}
+
+		f := File{
+			Name:     rn,
+			Size:     info.Size(),
+			Flags:    uint32(info.Mode()),
+			Modified: modified,
+			Blocks:   blocks,
+		}
+		w.previous[rn] = f
+		*res = append(*res, f)
+
+		return nil
+	}
+}
+
+// cleanTempFile is the walk function behind CleanTempFiles: it removes path
+// if it is a regular file that the TempNamer identifies as temporary.
+//
+// Cleanup is best effort. An entry that cannot be examined is skipped rather
+// than aborting the whole walk, and a failed removal is not reported. Without
+// a TempNamer nothing is considered temporary and nothing is removed.
+func (w *Walker) cleanTempFile(path string, info os.FileInfo, err error) error {
+	if err != nil || w.TempNamer == nil {
+		return nil
+	}
+	if info.Mode()&os.ModeType == 0 && w.TempNamer.IsTemporary(path) {
+		// Best effort: a file that cannot be removed now is left behind.
+		os.Remove(path)
+	}
+	return nil
+}
+
+// ignoreFile reports whether file is matched by any applicable ignore
+// pattern. patterns maps a directory to the patterns declared there; a set
+// of patterns applies to file when its directory key is empty, equals the
+// directory part of file, or is a parent of it. Patterns are matched against
+// the last path element only, and malformed patterns never match.
+func (w *Walker) ignoreFile(patterns map[string][]string, file string) bool {
+	dir, name := path.Split(file)
+
+	for prefix, pats := range patterns {
+		applies := prefix == "" || prefix == dir || strings.HasPrefix(dir, prefix+"/")
+		if !applies {
+			continue
+		}
+
+		for _, pat := range pats {
+			if ok, _ := path.Match(pat, name); ok {
+				return true
+			}
+		}
+	}
+
+	return false
+}
--- a/scanner/walk_test.go
+++ b/scanner/walk_test.go
@@ -0,0 +1,88 @@
+package scanner
+
+import (
+	"fmt"
+	"reflect"
+	"testing"
+	"time"
+)
+
+// testdata lists the files TestWalk expects to find when walking the
+// testdata directory, in walk order: the file name and the hex-encoded hash
+// of the file's first block. size appears to be the file length in bytes,
+// including the trailing newline.
+var testdata = []struct {
+	name string
+	size int
+	hash string
+}{
+	{"bar", 10, "2f72cc11a6fcd0271ecef8c61056ee1eb1243be3805bf9a9df98f92f7636b05c"},
+	{"empty", 0, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"},
+	{"foo", 7, "aec070645fe53ee3b3763059376134f058cc337247c978add178b6ccdfb0019f"},
+}
+
+// correctIgnores is the ignore map TestWalk expects from walking the testdata
+// directory: the patterns of testdata/.stignore, stored under the empty key
+// that denotes the walked directory itself.
+var correctIgnores = map[string][]string{
+	"": {".*", "quux"},
+}
+
+// TestWalk walks the testdata directory and verifies the name, size, first
+// block hash and modification time of every file found, as well as the
+// ignore patterns that were loaded.
+func TestWalk(t *testing.T) {
+	w := Walker{
+		Dir:        "testdata",
+		BlockSize:  128 * 1024,
+		IgnoreFile: ".stignore",
+	}
+	files, ignores := w.Walk()
+
+	if l1, l2 := len(files), len(testdata); l1 != l2 {
+		t.Fatalf("Incorrect number of walked files %d != %d", l1, l2)
+	}
+
+	// A modification time is realistic if it lies between a date well
+	// before the files were created and the present moment. A fixed upper
+	// bound would make the test fail on any checkout made after it.
+	t0 := time.Date(2010, 1, 1, 0, 0, 0, 0, time.UTC).Unix()
+	t1 := time.Now().Unix()
+
+	for i := range testdata {
+		if n1, n2 := testdata[i].name, files[i].Name; n1 != n2 {
+			t.Errorf("Incorrect file name %q != %q for case #%d", n1, n2, i)
+		}
+
+		if s1, s2 := files[i].Size, int64(testdata[i].size); s1 != s2 {
+			t.Errorf("Incorrect size %d != %d for case #%d", s1, s2, i)
+		}
+
+		if len(files[i].Blocks) == 0 {
+			t.Errorf("No blocks for case #%d", i)
+		} else if h1, h2 := fmt.Sprintf("%x", files[i].Blocks[0].Hash), testdata[i].hash; h1 != h2 {
+			t.Errorf("Incorrect hash %q != %q for case #%d", h1, h2, i)
+		}
+
+		if mt := files[i].Modified; mt < t0 || mt > t1 {
+			t.Errorf("Unrealistic modtime %d for test %d", mt, i)
+		}
+	}
+
+	if !reflect.DeepEqual(ignores, correctIgnores) {
+		t.Errorf("Incorrect ignores\n  %v\n  %v", correctIgnores, ignores)
+	}
+}
+
+// TestIgnore checks ignoreFile against a fixed set of patterns declared at
+// three directory levels, covering patterns inherited from parent
+// directories, wildcard patterns and names that merely resemble a pattern or
+// a directory prefix.
+func TestIgnore(t *testing.T) {
+	patterns := map[string][]string{
+		"":        {"t2"},
+		"foo":     {"bar", "z*"},
+		"foo/baz": {"quux", ".*"},
+	}
+
+	// f is the file name to test, r whether it is expected to be ignored.
+	cases := []struct {
+		f string
+		r bool
+	}{
+		{"foo/bar", true},
+		{"foo/quux", false},
+		{"foo/zuux", true},
+		{"foo/qzuux", false},
+		{"foo/baz/t1", false},
+		{"foo/baz/t2", true},
+		{"foo/baz/bar", true},
+		{"foo/baz/quuxa", false},
+		{"foo/baz/aquux", false},
+		{"foo/baz/.quux", true},
+		{"foo/baz/zquux", true},
+		{"foo/baz/quux", true},
+		{"foo/bazz/quux", false},
+	}
+
+	var w Walker
+	for i := range cases {
+		want := cases[i].r
+		got := w.ignoreFile(patterns, cases[i].f)
+		if got != want {
+			t.Errorf("Incorrect ignoreFile() #%d; E: %v, A: %v", i, want, got)
+		}
+	}
+}