Refactor out file scanner into separate package

This commit is contained in:
Jakob Borg
2014-03-08 23:02:01 +01:00
parent d6c9afd07f
commit 1448cfe66a
21 changed files with 471 additions and 338 deletions

74
scanner/blocks.go Normal file
View File

@@ -0,0 +1,74 @@
package scanner
import (
"bytes"
"crypto/sha256"
"io"
)
// Block describes one chunk of a hashed stream, as produced by Blocks.
type Block struct {
	// Offset is the position of the block's first byte within the stream.
	Offset int64
	// Size is the number of bytes covered by the block.
	Size uint32
	// Hash is the SHA-256 digest of the block's contents.
	Hash []byte
}
// Blocks splits the data read from r into consecutive chunks of at most
// blocksize bytes and returns one Block per chunk, each carrying its offset,
// its length and the SHA-256 hash of its contents. A reader that yields no
// data results in a single zero-length block holding the hash of the empty
// input. If copying from r fails, the error is returned together with a nil
// block list.
func Blocks(r io.Reader, blocksize int) ([]Block, error) {
	var (
		result []Block
		pos    int64
	)

	for {
		digest := sha256.New()
		chunk := io.LimitReader(r, int64(blocksize))

		copied, err := io.Copy(digest, chunk)
		if err != nil {
			return nil, err
		}
		if copied == 0 {
			// Nothing more to read.
			break
		}

		result = append(result, Block{
			Offset: pos,
			Size:   uint32(copied),
			Hash:   digest.Sum(nil),
		})
		pos += copied
	}

	if len(result) > 0 {
		return result, nil
	}

	// Empty file: represent it by one empty block whose hash is the SHA-256
	// digest of zero bytes.
	emptyHash := []uint8{0xe3, 0xb0, 0xc4, 0x42, 0x98, 0xfc, 0x1c, 0x14, 0x9a, 0xfb, 0xf4, 0xc8, 0x99, 0x6f, 0xb9, 0x24, 0x27, 0xae, 0x41, 0xe4, 0x64, 0x9b, 0x93, 0x4c, 0xa4, 0x95, 0x99, 0x1b, 0x78, 0x52, 0xb8, 0x55}
	return []Block{{Offset: 0, Size: 0, Hash: emptyHash}}, nil
}
// BlockDiff compares two block lists position by position. It reports the
// blocks of tgt whose hash equals that of the block at the same index in src
// (have) and the blocks of tgt that must be obtained to transform src into
// tgt (need). Both block lists must have been created with the same block
// size.
func BlockDiff(src, tgt []Block) (have, need []Block) {
	switch {
	case len(tgt) == 0:
		// Nothing is wanted, so nothing is shared and nothing is missing.
		return nil, nil
	case len(src) == 0:
		// No source blocks at all: copy the entire file.
		return nil, tgt
	}

	for idx, blk := range tgt {
		if idx < len(src) && bytes.Equal(blk.Hash, src[idx].Hash) {
			have = append(have, blk)
			continue
		}
		// Either src has no block at this position or the hashes differ.
		need = append(need, blk)
	}

	return have, need
}

116
scanner/blocks_test.go Normal file
View File

@@ -0,0 +1,116 @@
package scanner
import (
"bytes"
"fmt"
"testing"
)
// blocksTestData is the table driving TestBlocks. Each entry gives the input
// data, the block size to hash it with, and the expected hex encoded hash of
// every resulting block, in order.
var blocksTestData = []struct {
	data []byte
	blocksize int
	hash []string
}{
	// Empty input still yields one block.
	{[]byte(""), 1024, []string{
		"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"}},
	// Block size larger than, one above, and equal to the data length: a
	// single block in each case.
	{[]byte("contents"), 1024, []string{
		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
	{[]byte("contents"), 9, []string{
		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
	{[]byte("contents"), 8, []string{
		"d1b2a59fbea7e20077af9f91b27e95e865061b270be03ff539ab3b73587882e8"}},
	// Block size smaller than the data length: several blocks, the last one
	// possibly short.
	{[]byte("contents"), 7, []string{
		"ed7002b439e9ac845f22357d822bac1444730fbdb6016d3ec9432297b9ec9f73",
		"043a718774c572bd8a25adbeb1bfcd5c0256ae11cecf9f9c3f925d0e52beaf89"},
	},
	{[]byte("contents"), 3, []string{
		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
		"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4"},
	},
	// Repeated block contents produce repeated hashes.
	{[]byte("conconts"), 3, []string{
		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
		"44ad63f60af0f6db6fdde6d5186ef78176367df261fa06be3079b6c80c8adba4"},
	},
	{[]byte("contenten"), 3, []string{
		"1143da2bc54c495c4be31d3868785d39ffdfd56df5668f0645d8f14d47647952",
		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3",
		"e4432baa90819aaef51d2a7f8e148bf7e679610f3173752fabb4dcb2d0f418d3"},
	},
}
// TestBlocks hashes every entry of blocksTestData and verifies the number of
// returned blocks as well as the offset, size and hash of each block.
func TestBlocks(t *testing.T) {
	for _, test := range blocksTestData {
		buf := bytes.NewBuffer(test.data)
		blocks, err := Blocks(buf, test.blocksize)
		if err != nil {
			t.Fatal(err)
		}

		if l := len(blocks); l != len(test.hash) {
			t.Fatalf("Incorrect number of blocks %d != %d", l, len(test.hash))
		}

		// Iterate over the returned blocks rather than over data offsets so
		// that the single block produced for empty input is verified as well.
		for i := range blocks {
			off := int64(i) * int64(test.blocksize)
			if blocks[i].Offset != off {
				t.Errorf("Incorrect offset for block %d: %d != %d", i, blocks[i].Offset, off)
			}

			// The last block may be shorter than the block size.
			bs := test.blocksize
			if rem := len(test.data) - int(off); bs > rem {
				bs = rem
			}
			if int(blocks[i].Size) != bs {
				t.Errorf("Incorrect length for block %d: %d != %d", i, blocks[i].Size, bs)
			}

			if h := fmt.Sprintf("%x", blocks[i].Hash); h != test.hash[i] {
				t.Errorf("Incorrect block hash %q != %q", h, test.hash[i])
			}
		}
	}
}
// diffTestData is the table driving TestDiff. Contents a and b are hashed
// with block size s; d lists the blocks expected in the "need" result of
// BlockDiff(a, b). Only Offset and Size of the expected blocks are compared,
// which is why their hashes are left nil.
var diffTestData = []struct {
	a string
	b string
	s int
	d []Block
}{
	{"contents", "contents", 1024, []Block{}},
	{"", "", 1024, []Block{}},
	{"contents", "contents", 3, []Block{}},
	{"contents", "cantents", 3, []Block{{0, 3, nil}}},
	{"contents", "contants", 3, []Block{{3, 3, nil}}},
	{"contents", "cantants", 3, []Block{{0, 3, nil}, {3, 3, nil}}},
	// An empty target is a single empty block, which differs from the source.
	{"contents", "", 3, []Block{{0, 0, nil}}},
	{"", "contents", 3, []Block{{0, 3, nil}, {3, 3, nil}, {6, 2, nil}}},
	{"con", "contents", 3, []Block{{3, 3, nil}, {6, 2, nil}}},
	{"contents", "con", 3, nil},
	{"contents", "cont", 3, []Block{{3, 1, nil}}},
	{"cont", "contents", 3, []Block{{3, 3, nil}, {6, 2, nil}}},
}
// TestDiff hashes both contents of every diffTestData entry and checks that
// the blocks BlockDiff reports as needed have the expected offsets and sizes.
func TestDiff(t *testing.T) {
	for i, test := range diffTestData {
		src, _ := Blocks(bytes.NewBufferString(test.a), test.s)
		tgt, _ := Blocks(bytes.NewBufferString(test.b), test.s)
		_, needed := BlockDiff(src, tgt)

		if len(needed) != len(test.d) {
			t.Fatalf("Incorrect length for diff %d; %d != %d", i, len(needed), len(test.d))
		}

		for j, want := range test.d {
			got := needed[j]
			if got.Offset != want.Offset {
				t.Errorf("Incorrect offset for diff %d block %d; %d != %d", i, j, got.Offset, want.Offset)
			}
			if got.Size != want.Size {
				t.Errorf("Incorrect length for diff %d block %d; %d != %d", i, j, got.Size, want.Size)
			}
		}
	}
}

12
scanner/debug.go Normal file
View File

@@ -0,0 +1,12 @@
package scanner
import (
"log"
"os"
"strings"
)
var (
	// dlog is the package's debug logger. It writes to standard error with a
	// "scanner: " prefix, microsecond timestamps and the caller's short file
	// name and line.
	dlog = log.New(os.Stderr, "scanner: ", log.Lmicroseconds|log.Lshortfile)
	// debug is true when the STTRACE environment variable contains the
	// substring "scanner"; it gates the package's debug output.
	debug = strings.Contains(os.Getenv("STTRACE"), "scanner")
)

25
scanner/file.go Normal file
View File

@@ -0,0 +1,25 @@
package scanner
import "fmt"
// File is the scanner's record of a single file. The field notes below
// describe how the Walker in this package fills it in.
type File struct {
	// Name is the file's path relative to the walked directory.
	Name string
	// Flags holds the file mode bits, or protocol.FlagInvalid while changes
	// to the file are being suppressed.
	Flags uint32
	// Modified is the modification time in seconds since the Unix epoch.
	Modified int64
	// Version is incremented when the flags change while the modification
	// time stays the same.
	Version uint32
	// Size is the file size in bytes.
	Size int64
	// Blocks is the blockwise hash of the file contents.
	Blocks []Block
}
// String returns a one-line, human readable summary of the file. The block
// list is summarized by its length rather than printed in full.
func (f File) String() string {
	const layout = "File{Name:%q, Flags:0x%x, Modified:%d, Version:%d, Size:%d, NumBlocks:%d}"
	numBlocks := len(f.Blocks)
	return fmt.Sprintf(layout, f.Name, f.Flags, f.Modified, f.Version, f.Size, numBlocks)
}
// Equals reports whether f and o have the same modification time and
// version. No other field takes part in the comparison.
func (f File) Equals(o File) bool {
	if f.Modified != o.Modified {
		return false
	}
	return f.Version == o.Version
}
// NewerThan reports whether f is more recent than o. The modification time
// decides; when both are equal, the higher version wins.
func (f File) NewerThan(o File) bool {
	if f.Modified != o.Modified {
		return f.Modified > o.Modified
	}
	return f.Version > o.Version
}

2
scanner/testdata/.stignore vendored Normal file
View File

@@ -0,0 +1,2 @@
.*
quux

1
scanner/testdata/bar vendored Normal file
View File

@@ -0,0 +1 @@
foobarbaz

1
scanner/testdata/baz/quux vendored Normal file
View File

@@ -0,0 +1 @@
baazquux

0
scanner/testdata/empty vendored Normal file
View File

1
scanner/testdata/foo vendored Normal file
View File

@@ -0,0 +1 @@
foobar

259
scanner/walk.go Normal file
View File

@@ -0,0 +1,259 @@
package scanner
import (
"bytes"
"io/ioutil"
"log"
"os"
"path"
"path/filepath"
"strings"
"time"
"github.com/calmh/syncthing/protocol"
)
// Walker scans a directory tree and produces a File record, including block
// hashes, for every regular file that is not ignored. It remembers the state
// seen during earlier walks so that unchanged files are not hashed again.
type Walker struct {
	// Dir is the base directory for the walk.
	Dir string
	// If FollowSymlinks is true, symbolic links directly under Dir will be followed.
	// Symbolic links at deeper levels are never followed regardless of this flag.
	FollowSymlinks bool
	// BlockSize controls the size of the block used when hashing.
	BlockSize int
	// If IgnoreFile is not empty, it is the name used for the file that holds ignore patterns.
	IgnoreFile string
	// If TempNamer is not nil, it is used to ignore temporary files when walking.
	TempNamer TempNamer
	// If Suppressor is not nil, it is queried for suppression of modified files.
	Suppressor Suppressor
	previous map[string]File // file name -> last seen file state
	suppressed map[string]bool // file name -> suppression status
}
// TempNamer maps between regular and temporary file names. The Walker uses
// IsTemporary to skip temporary files and to find them when cleaning up.
type TempNamer interface {
	// TempName returns a temporary name for the file referred to by path.
	TempName(path string) string
	// IsTemporary returns true if path refers to the name of a temporary file.
	IsTemporary(path string) bool
}
// Suppressor decides whether a detected change should be held back. The
// Walker consults it for files whose modification time has changed.
type Suppressor interface {
	// Suppress returns true if the update to the named file should be ignored.
	Suppress(name string, fi os.FileInfo) bool
}
// Walk returns the list of files found in the local repository by scanning the
// file system. Files are blockwise hashed.
//
// The second result maps a directory (relative, "" for the walk root) to the
// patterns read from the ignore file found in it. The tree is traversed
// twice: first to collect all ignore patterns, then to hash the files that
// are not ignored. When FollowSymlinks is set, the same two passes are run
// for every symbolic link located directly in Dir.
//
// Walk is best effort: failure to list Dir for symlinks ends the scan early
// and returns what has been collected so far.
func (w *Walker) Walk() (files []File, ignore map[string][]string) {
	w.lazyInit()

	if debug {
		dlog.Println("Walk", w.Dir, w.FollowSymlinks, w.BlockSize, w.IgnoreFile)
	}
	t0 := time.Now()

	ignore = make(map[string][]string)
	hashFiles := w.walkAndHashFiles(&files, ignore)

	// The walk functions never return an error, so the results of
	// filepath.Walk carry no information here.
	filepath.Walk(w.Dir, w.loadIgnoreFiles(w.Dir, ignore))
	filepath.Walk(w.Dir, hashFiles)

	if w.FollowSymlinks {
		d, err := os.Open(w.Dir)
		if err != nil {
			if debug {
				dlog.Println("error:", w.Dir, err)
			}
			return files, ignore
		}
		defer d.Close()

		fis, err := d.Readdir(-1)
		if err != nil {
			if debug {
				dlog.Println("error:", w.Dir, err)
			}
			return files, ignore
		}

		for _, info := range fis {
			if info.Mode()&os.ModeSymlink != 0 {
				// Build the path with filepath so the OS specific separator
				// is used. The trailing slash is presumably there so that
				// the walk resolves the link instead of stopping at it.
				dir := filepath.Join(w.Dir, info.Name()) + "/"
				filepath.Walk(dir, w.loadIgnoreFiles(dir, ignore))
				filepath.Walk(dir, hashFiles)
			}
		}
	}

	if debug {
		t1 := time.Now()
		secs := t1.Sub(t0).Seconds()
		dlog.Printf("Walk in %.02f ms, %.0f files/s", secs*1000, float64(len(files))/secs)
	}

	return files, ignore
}
// CleanTempFiles walks Dir and removes all files that match the temporary
// filename pattern. The outcome of the walk is deliberately discarded;
// cleaning is best effort.
func (w *Walker) CleanTempFiles() {
	_ = filepath.Walk(w.Dir, w.cleanTempFile)
}
// lazyInit allocates the walker's internal state maps on first use. The
// previous map doubles as the marker for "already initialized".
func (w *Walker) lazyInit() {
	if w.previous != nil {
		return
	}
	w.previous = map[string]File{}
	w.suppressed = map[string]bool{}
}
// loadIgnoreFiles returns a filepath.WalkFunc that collects ignore patterns.
// For every visited entry named w.IgnoreFile, the non-empty lines of that
// file are stored in ign, keyed by the entry's directory relative to dir
// (without leading or trailing slashes, "" for dir itself).
//
// The returned function never reports an error to filepath.Walk, so an
// unreadable entry does not abort the walk.
func (w *Walker) loadIgnoreFiles(dir string, ign map[string][]string) filepath.WalkFunc {
	return func(p string, info os.FileInfo, err error) error {
		if err != nil {
			return nil
		}

		// rn is only used to derive the map key. The file itself must be
		// read through p, since rn is relative to dir and not to the current
		// working directory.
		rn, err := filepath.Rel(dir, p)
		if err != nil {
			return nil
		}

		pn, sn := path.Split(rn)
		if sn != w.IgnoreFile {
			return nil
		}

		bs, err := ioutil.ReadFile(p)
		if err != nil && debug {
			// Best effort: an unreadable ignore file is recorded without
			// any patterns.
			dlog.Println("ignore file error:", p, err)
		}

		var patterns []string
		for _, line := range bytes.Split(bs, []byte("\n")) {
			if len(line) > 0 {
				patterns = append(patterns, string(line))
			}
		}
		ign[strings.Trim(pn, "/")] = patterns

		return nil
	}
}
// walkAndHashFiles returns a filepath.WalkFunc that appends a File to res for
// every regular file it visits that is neither temporary, an ignore file, nor
// matched by the patterns in ign.
//
// File names are recorded relative to w.Dir. A file whose modification time
// equals the state remembered from an earlier walk is not hashed again; only
// a change in its mode bits is recorded, incrementing Version. A changed file
// that the Suppressor rejects is reported with protocol.FlagInvalid and no
// blocks. Everything else is hashed with w.BlockSize and remembered for the
// next walk.
//
// The returned function never reports an error to filepath.Walk, so a single
// unreadable entry is skipped instead of aborting the scan.
func (w *Walker) walkAndHashFiles(res *[]File, ign map[string][]string) filepath.WalkFunc {
	return func(p string, info os.FileInfo, err error) error {
		if err != nil {
			if debug {
				dlog.Println("error:", p, info, err)
			}
			return nil
		}

		// rn is the name relative to the walk root and is used for all
		// bookkeeping. p is kept untouched for file system access, because
		// rn would be resolved against the current working directory.
		rn, err := filepath.Rel(w.Dir, p)
		if err != nil {
			return nil
		}

		if w.TempNamer != nil && w.TempNamer.IsTemporary(rn) {
			if debug {
				dlog.Println("temporary:", rn)
			}
			return nil
		}

		if _, sn := path.Split(rn); sn == w.IgnoreFile {
			if debug {
				dlog.Println("ignorefile:", rn)
			}
			return nil
		}

		if w.ignoreFile(ign, rn) {
			if debug {
				dlog.Println("ignored:", rn)
			}
			return nil
		}

		if info.Mode()&os.ModeType != 0 {
			// Not a regular file (directory, symlink, device, ...).
			return nil
		}

		modified := info.ModTime().Unix()

		// Require an actual earlier record; otherwise a new file with a zero
		// modification time would match the zero value and be reported as an
		// unchanged, nameless file.
		pf, seen := w.previous[rn]
		if seen && pf.Modified == modified {
			if nf := uint32(info.Mode()); nf != pf.Flags {
				if debug {
					dlog.Println("new flags:", rn)
				}
				pf.Flags = nf
				pf.Version++
				w.previous[rn] = pf
			} else if debug {
				dlog.Println("unchanged:", rn)
			}
			*res = append(*res, pf)
			return nil
		}

		if w.Suppressor != nil && w.Suppressor.Suppress(rn, info) {
			if debug {
				dlog.Println("suppressed:", rn)
			}
			if !w.suppressed[rn] {
				w.suppressed[rn] = true
				log.Printf("INFO: Changes to %q are being temporarily suppressed because it changes too frequently.", rn)
			}
			// Report the last known state, marked invalid and without blocks.
			f := pf
			f.Flags = protocol.FlagInvalid
			f.Blocks = nil
			*res = append(*res, f)
			return nil
		}
		if w.suppressed[rn] {
			log.Printf("INFO: Changes to %q are no longer suppressed.", rn)
			delete(w.suppressed, rn)
		}

		fd, err := os.Open(p)
		if err != nil {
			if debug {
				dlog.Println("open error:", p, err)
			}
			return nil
		}
		defer fd.Close()

		t0 := time.Now()
		blocks, err := Blocks(fd, w.BlockSize)
		if err != nil {
			if debug {
				dlog.Println("hash error:", rn, err)
			}
			return nil
		}
		if debug {
			t1 := time.Now()
			dlog.Println("hashed:", rn, ";", len(blocks), "blocks;", info.Size(), "bytes;", int(float64(info.Size())/1024/t1.Sub(t0).Seconds()), "KB/s")
		}

		f := File{
			Name:     rn,
			Size:     info.Size(),
			Flags:    uint32(info.Mode()),
			Modified: modified,
			Blocks:   blocks,
		}
		w.previous[rn] = f
		*res = append(*res, f)

		return nil
	}
}
// cleanTempFile is the filepath.WalkFunc behind CleanTempFiles. It removes
// path when it is a regular file that the TempNamer identifies as temporary.
// An error passed in by filepath.Walk is returned unchanged, which stops the
// walk. Without a TempNamer nothing can be identified as temporary, so the
// entry is left alone.
func (w *Walker) cleanTempFile(path string, info os.FileInfo, err error) error {
	if err != nil {
		return err
	}
	if w.TempNamer == nil {
		// Calling a method on the nil interface would panic.
		return nil
	}
	if info.Mode()&os.ModeType == 0 && w.TempNamer.IsTemporary(path) {
		// Removal is best effort; a failure is only visible in debug output.
		if rerr := os.Remove(path); rerr != nil && debug {
			dlog.Println("remove error:", path, rerr)
		}
	}
	return nil
}
// ignoreFile reports whether file matches any applicable ignore pattern.
// patterns maps a directory to the patterns declared there. A pattern set
// applies when its directory is the root (""), or when it is the directory
// containing file or one of that directory's ancestors. Patterns are matched
// against the last path element only. Malformed patterns never match.
func (w *Walker) ignoreFile(patterns map[string][]string, file string) bool {
	dir, base := path.Split(file)

	for prefix, pats := range patterns {
		applies := len(prefix) == 0 || prefix == dir || strings.HasPrefix(dir, prefix+"/")
		if !applies {
			continue
		}
		for _, pat := range pats {
			if ok, _ := path.Match(pat, base); ok {
				return true
			}
		}
	}

	return false
}

88
scanner/walk_test.go Normal file
View File

@@ -0,0 +1,88 @@
package scanner
import (
"fmt"
"reflect"
"testing"
"time"
)
// testdata lists the files TestWalk expects to find in the testdata
// directory, in walk order: the file name, its size in bytes and the hex
// encoded hash of its first block.
var testdata = []struct {
	name string
	size int
	hash string
}{
	{"bar", 10, "2f72cc11a6fcd0271ecef8c61056ee1eb1243be3805bf9a9df98f92f7636b05c"},
	{"empty", 0, "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"},
	{"foo", 7, "aec070645fe53ee3b3763059376134f058cc337247c978add178b6ccdfb0019f"},
}
// correctIgnores is the ignore map TestWalk expects from walking the
// testdata directory: the patterns of the ignore file at its root, keyed by
// the empty directory name.
var correctIgnores = map[string][]string{
	"": {".*", "quux"},
}
// TestWalk scans the testdata directory and verifies that exactly the files
// listed in testdata are found, with the expected name, size, first block
// hash and a plausible modification time, and that the ignore patterns
// reported by the walk equal correctIgnores.
func TestWalk(t *testing.T) {
	w := Walker{
		Dir:        "testdata",
		BlockSize:  128 * 1024,
		IgnoreFile: ".stignore",
	}
	files, ignores := w.Walk()

	if l1, l2 := len(files), len(testdata); l1 != l2 {
		t.Fatalf("Incorrect number of walked files %d != %d", l1, l2)
	}

	// The modification time of the test files is whenever they were checked
	// out, so the upper bound has to follow the clock; a fixed date would
	// make the test fail once that date has passed. A day of slack allows
	// for clock skew.
	t0 := time.Date(2010, 1, 1, 0, 0, 0, 0, time.UTC).Unix()
	t1 := time.Now().Add(24 * time.Hour).Unix()

	for i := range testdata {
		if n1, n2 := testdata[i].name, files[i].Name; n1 != n2 {
			t.Errorf("Incorrect file name %q != %q for case #%d", n1, n2, i)
		}

		if s1, s2 := files[i].Size, int64(testdata[i].size); s1 != s2 {
			t.Errorf("Incorrect size %d != %d for case #%d", s1, s2, i)
		}

		if h1, h2 := fmt.Sprintf("%x", files[i].Blocks[0].Hash), testdata[i].hash; h1 != h2 {
			t.Errorf("Incorrect hash %q != %q for case #%d", h1, h2, i)
		}

		if mt := files[i].Modified; mt < t0 || mt > t1 {
			t.Errorf("Unrealistic modtime %d for test %d", mt, i)
		}
	}

	if !reflect.DeepEqual(ignores, correctIgnores) {
		t.Errorf("Incorrect ignores\n  %v\n  %v", correctIgnores, ignores)
	}
}
// TestIgnore checks ignoreFile against a fixed set of per-directory patterns:
// patterns apply to their own directory and everything below it, and only
// the last path element is matched.
func TestIgnore(t *testing.T) {
	patterns := map[string][]string{
		"":        {"t2"},
		"foo":     {"bar", "z*"},
		"foo/baz": {"quux", ".*"},
	}

	cases := []struct {
		f string
		r bool
	}{
		{"foo/bar", true},
		{"foo/quux", false},
		{"foo/zuux", true},
		{"foo/qzuux", false},
		{"foo/baz/t1", false},
		{"foo/baz/t2", true},
		{"foo/baz/bar", true},
		{"foo/baz/quuxa", false},
		{"foo/baz/aquux", false},
		{"foo/baz/.quux", true},
		{"foo/baz/zquux", true},
		{"foo/baz/quux", true},
		{"foo/bazz/quux", false},
	}

	var w Walker
	for i, tc := range cases {
		got := w.ignoreFile(patterns, tc.f)
		if got != tc.r {
			t.Errorf("Incorrect ignoreFile() #%d; E: %v, A: %v", i, tc.r, got)
		}
	}
}