lib/ur: Implement crash (panic) reporting (fixes #959) (#5702)

* lib/ur: Implement crash (panic) reporting (fixes #959)

This implements a simple crash reporting method. It piggybacks on the
panic log files created by the monitor process, picking these up and
uploading them from the usage reporting routine.

A new config value points to the crash receiver base URL, which defaults
to "https://crash.syncthing.net/newcrash" (following the pattern of
"https://data.syncthing.net/newdata" for usage reports, but allowing us
to separate the service as required).
This commit is contained in:
Jakob Borg
2019-06-11 08:19:11 +02:00
committed by GitHub
parent 93e57bd357
commit 42ce6be9b9
19 changed files with 1925 additions and 19 deletions

View File

@@ -0,0 +1,152 @@
// Copyright (C) 2019 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package main
import (
"bytes"
"context"
"fmt"
"io/ioutil"
"net/http"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/syncthing/syncthing/lib/sha256"
)
const (
headRequestTimeout = 10 * time.Second
putRequestTimeout = time.Minute
)
// uploadPanicLogs attempts to upload all the panic logs in the named
// directory to the crash reporting server as urlBase. Uploads are attempted
// with the newest log first.
//
// This can can block for a long time. The context can set a final deadline
// for this.
func uploadPanicLogs(ctx context.Context, urlBase, dir string) {
files, err := filepath.Glob(filepath.Join(dir, "panic-*.log"))
if err != nil {
l.Warnln("Failed to list panic logs:", err)
return
}
sort.Sort(sort.Reverse(sort.StringSlice(files)))
for _, file := range files {
if strings.Contains(file, ".reported.") {
// We've already sent this file. It'll be cleaned out at some
// point.
continue
}
if err := uploadPanicLog(ctx, urlBase, file); err != nil {
l.Warnln("Reporting crash:", err)
} else {
// Rename the log so we don't have to try to report it again. This
// succeeds, or it does not. There is no point complaining about it.
_ = os.Rename(file, strings.Replace(file, ".log", ".reported.log", 1))
}
}
}
// uploadPanicLog attempts to upload the named panic log to the crash
// reporting server at urlBase. The panic ID is constructed as the sha256 of
// the log contents. A HEAD request is made to see if the log has already
// been reported. If not, a PUT is made with the log contents.
func uploadPanicLog(ctx context.Context, urlBase, file string) error {
data, err := ioutil.ReadFile(file)
if err != nil {
return err
}
// Remove log lines, for privacy.
data = filterLogLines(data)
hash := fmt.Sprintf("%x", sha256.Sum256(data))
l.Infof("Reporting crash found in %s (report ID %s) ...\n", filepath.Base(file), hash[:8])
url := fmt.Sprintf("%s/%s", urlBase, hash)
headReq, err := http.NewRequest(http.MethodHead, url, nil)
if err != nil {
return err
}
// Set a reasonable timeout on the HEAD request
headCtx, headCancel := context.WithTimeout(ctx, headRequestTimeout)
defer headCancel()
headReq = headReq.WithContext(headCtx)
resp, err := http.DefaultClient.Do(headReq)
if err != nil {
return err
}
resp.Body.Close()
if resp.StatusCode == http.StatusOK {
// It's known, we're done
return nil
}
putReq, err := http.NewRequest(http.MethodPut, url, bytes.NewReader(data))
if err != nil {
return err
}
// Set a reasonable timeout on the PUT request
putCtx, putCancel := context.WithTimeout(ctx, putRequestTimeout)
defer putCancel()
putReq = putReq.WithContext(putCtx)
resp, err = http.DefaultClient.Do(putReq)
if err != nil {
return err
}
resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("upload: %s", resp.Status)
}
return nil
}
// filterLogLines returns the data without any log lines between the first
// line and the panic trace. This is done in-place: the original data slice
// is destroyed.
func filterLogLines(data []byte) []byte {
filtered := data[:0]
matched := false
for _, line := range bytes.Split(data, []byte("\n")) {
switch {
case !matched && bytes.HasPrefix(line, []byte("Panic ")):
// This begins the panic trace, set the matched flag and append.
matched = true
fallthrough
case len(filtered) == 0 || matched:
// This is the first line or inside the panic trace.
if len(filtered) > 0 {
// We add the newline before rather than after because
// bytes.Split sees the \n as *separator* and not line
// ender, so ir will generate a last empty line that we
// don't really want. (We want to keep blank lines in the
// middle of the trace though.)
filtered = append(filtered, '\n')
}
// Remove the device ID prefix. The "plus two" stuff is because
// the line will look like "[foo] whatever" and the end variable
// will end up pointing at the ] and we want to step over that
// and the following space.
if end := bytes.Index(line, []byte("]")); end > 1 && end < len(line)-2 && bytes.HasPrefix(line, []byte("[")) {
line = line[end+2:]
}
filtered = append(filtered, line...)
}
}
return filtered
}

View File

@@ -0,0 +1,37 @@
// Copyright (C) 2019 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package main
import (
"bytes"
"testing"
)
func TestFilterLogLines(t *testing.T) {
in := []byte(`[ABCD123] syncthing version whatever
here is more log data
and more
...
and some more
yet more
Panic detected at like right now
here is panic data
and yet more panic stuff
`)
filtered := []byte(`syncthing version whatever
Panic detected at like right now
here is panic data
and yet more panic stuff
`)
result := filterLogLines(in)
if !bytes.Equal(result, filtered) {
t.Logf("%q\n", result)
t.Error("it should have been filtered")
}
}

View File

@@ -8,6 +8,7 @@ package main
import (
"bufio"
"context"
"io"
"os"
"os/exec"
@@ -33,6 +34,8 @@ const (
loopThreshold = 60 * time.Second
logFileAutoCloseDelay = 5 * time.Second
logFileMaxOpenTime = time.Minute
panicUploadMaxWait = 30 * time.Second
panicUploadNoticeWait = 10 * time.Second
)
func monitorMain(runtimeOptions RuntimeOptions) {
@@ -72,6 +75,8 @@ func monitorMain(runtimeOptions RuntimeOptions) {
childEnv := childEnv()
first := true
for {
maybeReportPanics()
if t := time.Since(restarts[0]); t < loopThreshold {
l.Warnf("%d restarts in %v; not retrying further", countRestarts, t)
os.Exit(exitError)
@@ -173,6 +178,13 @@ func copyStderr(stderr io.Reader, dst io.Writer) {
br := bufio.NewReader(stderr)
var panicFd *os.File
defer func() {
if panicFd != nil {
_ = panicFd.Close()
maybeReportPanics()
}
}()
for {
line, err := br.ReadString('\n')
if err != nil {
@@ -430,3 +442,39 @@ func childEnv() []string {
env = append(env, "STMONITORED=yes")
return env
}
// maybeReportPanics tries to figure out if crash reporting is on or off,
// and reports any panics it can find if it's enabled. We spend at most
// panicUploadMaxWait uploading panics...
func maybeReportPanics() {
// Try to get a config to see if/where panics should be reported.
cfg, err := loadOrDefaultConfig()
if err != nil {
l.Warnln("Couldn't load config; not reporting crash")
return
}
// Bail if we're not supposed to report panics.
opts := cfg.Options()
if !opts.CREnabled {
return
}
// Set up a timeout on the whole operation.
ctx, cancel := context.WithTimeout(context.Background(), panicUploadMaxWait)
defer cancel()
// Print a notice if the upload takes a long time.
go func() {
select {
case <-ctx.Done():
return
case <-time.After(panicUploadNoticeWait):
l.Warnln("Uploading crash reports is taking a while, please wait...")
}
}()
// Report the panics.
dir := locations.GetBaseDir(locations.ConfigBaseDir)
uploadPanicLogs(ctx, opts.CRURL, dir)
}