diff --git a/internal/config/config.go b/internal/config/config.go
index a1f19816..ba9aa3ca 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -50,6 +50,7 @@ type FolderConfiguration struct {
 	ReadOnly        bool                    `xml:"ro,attr" json:"readOnly"`
 	RescanIntervalS int                     `xml:"rescanIntervalS,attr" json:"rescanIntervalS" default:"60"`
 	IgnorePerms     bool                    `xml:"ignorePerms,attr" json:"ignorePerms"`
+	AutoNormalize   bool                    `xml:"autoNormalize,attr" json:"autoNormalize" default:"true"`
 	Versioning      VersioningConfiguration `xml:"versioning" json:"versioning"`
 	LenientMtimes   bool                    `xml:"lenientMtimes" json:"lenientMTimes"`
 	Copiers         int                     `xml:"copiers" json:"copiers" default:"1"` // This defines how many files are handled concurrently.
diff --git a/internal/model/model.go b/internal/model/model.go
index 0cdf7fd1..4de15fcf 100644
--- a/internal/model/model.go
+++ b/internal/model/model.go
@@ -1139,15 +1139,16 @@ func (m *Model) ScanFolderSub(folder, sub string) error {
 	}
 
 	w := &scanner.Walker{
-		Dir:          folderCfg.Path,
-		Sub:          sub,
-		Matcher:      ignores,
-		BlockSize:    protocol.BlockSize,
-		TempNamer:    defTempNamer,
-		TempLifetime: time.Duration(m.cfg.Options().KeepTemporariesH) * time.Hour,
-		CurrentFiler: cFiler{m, folder},
-		IgnorePerms:  folderCfg.IgnorePerms,
-		Hashers:      folderCfg.Hashers,
+		Dir:           folderCfg.Path,
+		Sub:           sub,
+		Matcher:       ignores,
+		BlockSize:     protocol.BlockSize,
+		TempNamer:     defTempNamer,
+		TempLifetime:  time.Duration(m.cfg.Options().KeepTemporariesH) * time.Hour,
+		CurrentFiler:  cFiler{m, folder},
+		IgnorePerms:   folderCfg.IgnorePerms,
+		AutoNormalize: folderCfg.AutoNormalize,
+		Hashers:       folderCfg.Hashers,
 	}
 
 	runner.setState(FolderScanning)
diff --git a/internal/scanner/walk.go b/internal/scanner/walk.go
index b7da9e00..522b24d0 100644
--- a/internal/scanner/walk.go
+++ b/internal/scanner/walk.go
@@ -13,6 +13,7 @@ import (
 	"runtime"
 	"strings"
 	"time"
+	"unicode/utf8"
 
 	"github.com/syncthing/protocol"
 	"github.com/syncthing/syncthing/internal/ignore"
@@ -55,6 +56,9 @@ type Walker struct {
 	// detected. Scanned files will get zero permission bits and the
 	// NoPermissionBits flag set.
 	IgnorePerms bool
+	// When AutoNormalize is set, file names that are in UTF8 but incorrect
+	// normalization form will be corrected.
+	AutoNormalize bool
 	// Number of routines to use for hashing
 	Hashers int
 }
@@ -104,11 +108,20 @@ func (w *Walker) Walk() (chan protocol.FileInfo, error) {
 func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFunc {
 	now := time.Now()
 	return func(p string, info os.FileInfo, err error) error {
+		// Return value used when we are returning early and don't want to
+		// process the item. For directories, this means do-not-descend.
+		// info can be nil when err is set (e.g. the Lstat itself failed),
+		// so guard the IsDir call.
+		var skip error // nil
+		if info != nil && info.IsDir() {
+			skip = filepath.SkipDir
+		}
+
 		if err != nil {
 			if debug {
 				l.Debugln("error:", p, info, err)
 			}
-			return nil
+			return skip
 		}
 
 		rn, err := filepath.Rel(w.Dir, p)
@@ -116,7 +129,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 			if debug {
 				l.Debugln("rel error:", p, err)
 			}
-			return nil
+			return skip
 		}
 
 		if rn == "." {
@@ -143,33 +156,65 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 			if debug {
 				l.Debugln("ignored:", rn)
 			}
-			if info.IsDir() {
-				return filepath.SkipDir
-			}
-			return nil
+			return skip
 		}
 
-		if (runtime.GOOS == "linux" || runtime.GOOS == "windows") && !norm.NFC.IsNormalString(rn) {
-			l.Warnf("File %q contains non-NFC UTF-8 sequences and cannot be synced. Consider renaming.", rn)
-			return nil
+		if !utf8.ValidString(rn) {
+			l.Warnf("File name %q is not in UTF8 encoding; skipping.", rn)
+			return skip
+		}
+
+		var normalizedRn string
+		if runtime.GOOS == "darwin" {
+			// Mac OS X file names should always be NFD normalized.
+			normalizedRn = norm.NFD.String(rn)
+		} else {
+			// Every other OS in the known universe uses NFC or just plain
+			// doesn't bother to define an encoding. In our case *we* do care,
+			// so we enforce NFC regardless.
+			normalizedRn = norm.NFC.String(rn)
+		}
+
+		if rn != normalizedRn {
+			// The file name was not normalized.
+
+			if !w.AutoNormalize {
+				// We're not authorized to do anything about it, so complain and skip.
+
+				l.Warnf("File name %q is not in the correct UTF8 normalization form; skipping.", rn)
+				return skip
+			}
+
+			// We will attempt to normalize it.
+			normalizedPath := filepath.Join(w.Dir, normalizedRn)
+			if _, err := os.Lstat(normalizedPath); os.IsNotExist(err) {
+				// Nothing exists with the normalized filename. Good.
+				if err = os.Rename(p, normalizedPath); err != nil {
+					l.Infof(`Error normalizing UTF8 encoding of file "%s": %v`, rn, err)
+					return skip
+				}
+				l.Infof(`Normalized UTF8 encoding of file name "%s".`, rn)
+			} else {
+				// There is something already in the way at the normalized
+				// file name.
+				l.Infof(`File "%s" has UTF8 encoding conflict with another file; ignoring.`, rn)
+				return skip
+			}
+
+			// The entry now lives at the normalized path; the old path no
+			// longer exists, so use the new names for everything below.
+			rn = normalizedRn
+			p = normalizedPath
 		}
 
 		// Index wise symlinks are always files, regardless of what the target
 		// is, because symlinks carry their target path as their content.
 		if info.Mode()&os.ModeSymlink == os.ModeSymlink {
-			var rval error
 			// If the target is a directory, do NOT descend down there. This
 			// will cause files to get tracked, and removing the symlink will
-			// as a result remove files in their real location. But do not
-			// SkipDir if the target is not a directory, as it will stop
-			// scanning the current directory.
-			if info.IsDir() {
-				rval = filepath.SkipDir
-			}
-
-			// If we don't support symlinks, skip.
+			// as a result remove files in their real location.
 			if !symlinks.Supported {
-				return rval
+				return skip
 			}
 
 			// We always rehash symlinks as they have no modtime or
@@ -183,7 +228,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				if debug {
 					l.Debugln("readlink error:", p, err)
 				}
-				return rval
+				return skip
 			}
 
 			blocks, err := Blocks(strings.NewReader(target), w.BlockSize, 0)
@@ -191,7 +236,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				if debug {
 					l.Debugln("hash link error:", p, err)
 				}
-				return rval
+				return skip
 			}
 
 			if w.CurrentFiler != nil {
@@ -204,7 +249,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 				//  - the block list (i.e. hash of target) was the same
 				cf, ok := w.CurrentFiler.CurrentFile(rn)
 				if ok && !cf.IsDeleted() && cf.IsSymlink() && !cf.IsInvalid() && SymlinkTypeEqual(flags, cf.Flags) && BlocksEqual(cf.Blocks, blocks) {
-					return rval
+					return skip
 				}
 			}
 
@@ -222,7 +267,7 @@ func (w *Walker) walkAndHashFiles(fchan chan protocol.FileInfo) filepath.WalkFun
 
 			fchan <- f
 
-			return rval
+			return skip
 		}
 
 		if info.Mode().IsDir() {
diff --git a/internal/scanner/walk_test.go b/internal/scanner/walk_test.go
index 821c0b9e..e6fe1cae 100644
--- a/internal/scanner/walk_test.go
+++ b/internal/scanner/walk_test.go
@@ -9,14 +9,17 @@ package scanner
 import (
 	"bytes"
 	"fmt"
+	"os"
 	"path/filepath"
 	"reflect"
+	"runtime"
 	rdebug "runtime/debug"
 	"sort"
 	"testing"
 
 	"github.com/syncthing/protocol"
 	"github.com/syncthing/syncthing/internal/ignore"
+	"golang.org/x/text/unicode/norm"
 )
 
 type testfile struct {
@@ -181,6 +184,106 @@ func TestVerify(t *testing.T) {
 	}
 }
 
+func TestNormalization(t *testing.T) {
+	if runtime.GOOS == "darwin" {
+		t.Skip("Normalization test not possible on darwin")
+		return
+	}
+
+	os.RemoveAll("testdata/normalization")
+	defer os.RemoveAll("testdata/normalization")
+
+	tests := []string{
+		"0-A",            // ASCII A -- accepted
+		"1-\xC3\x84",     // NFC 'Ä' -- conflicts with the entry below, accepted
+		"1-\x41\xCC\x88", // NFD 'Ä' -- conflicts with the entry above, ignored
+		"2-\xC3\x85",     // NFC 'Å' -- accepted
+		"3-\x41\xCC\x83", // NFD 'Ã' -- converted to NFC
+		"4-\xE2\x98\x95", // U+2615 HOT BEVERAGE (☕) -- accepted
+		"5-\xCD\xE2",     // EUC-CN "wài" (外) -- ignored (not UTF8)
+	}
+	numInvalid := 2
+	numValid := len(tests) - numInvalid
+
+	for _, s1 := range tests {
+		// Create a directory for each of the interesting strings above
+		if err := os.MkdirAll(filepath.Join("testdata/normalization", s1), 0755); err != nil {
+			t.Fatal(err)
+		}
+
+		for _, s2 := range tests {
+			// Within each dir, create a file with each of the interesting
+			// file names. Ensure that the file doesn't exist when it's
+			// created. This detects and fails if there's file name
+			// normalization stuff at the filesystem level.
+			// The file is opened write-only so that the write below succeeds.
+			fd, err := os.OpenFile(filepath.Join("testdata/normalization", s1, s2), os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0644)
+			if err != nil {
+				t.Fatal(err)
+			}
+			_, err = fd.WriteString("test")
+			fd.Close()
+			if err != nil {
+				t.Fatal(err)
+			}
+		}
+	}
+
+	// We can normalize a directory name, but we can't descend into it in the
+	// same pass due to how filepath.Walk works. So we run the scan twice to
+	// make sure it all gets done. In production, things will be correct
+	// eventually...
+
+	_, err := walkDir("testdata/normalization")
+	if err != nil {
+		t.Fatal(err)
+	}
+	tmp, err := walkDir("testdata/normalization")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	files := fileList(tmp).testfiles()
+
+	// We should have one file per combination, plus the directories
+	// themselves
+
+	expectedNum := numValid*numValid + numValid
+	if len(files) != expectedNum {
+		t.Errorf("Expected %d files, got %d", expectedNum, len(files))
+	}
+
+	// The file names should all be in NFC form.
+
+	for _, f := range files {
+		t.Logf("%q (% x) %v", f.name, f.name, norm.NFC.IsNormalString(f.name))
+		if !norm.NFC.IsNormalString(f.name) {
+			t.Errorf("File name %q is not NFC normalized", f.name)
+		}
+	}
+}
+
+func walkDir(dir string) ([]protocol.FileInfo, error) {
+	w := Walker{
+		Dir:           dir,
+		BlockSize:     128 * 1024,
+		AutoNormalize: true,
+	}
+
+	fchan, err := w.Walk()
+	if err != nil {
+		return nil, err
+	}
+
+	var tmp []protocol.FileInfo
+	for f := range fchan {
+		tmp = append(tmp, f)
+	}
+	sort.Sort(fileList(tmp))
+
+	return tmp, nil
+}
+
 type fileList []protocol.FileInfo
 
 func (l fileList) Len() int {