cmd/stdiscosrv: New discovery server (fixes #4618)

This is a new revision of the discovery server. Relevant changes and
non-changes:

- Protocol towards clients is unchanged.

- Recommended large scale design is still to be deployed nehind nginx (I
  tested, and it's still a lot faster at terminating TLS).

- Database backend is leveldb again, only. It scales enough, is easy to
  setup, and we don't need any backend to take care of.

- Server supports replication. This is a simple TCP channel - protect it
  with a firewall when deploying over the internet. (We deploy this within
  the same datacenter, and with firewall.) Any incoming client announces
  are sent over the replication channel(s) to other peer discosrvs.
  Incoming replication changes are applied to the database as if they came
  from clients, but without the TLS/certificate overhead.

- Metrics are exposed using the prometheus library, when enabled.

- The database values and replication protocol is protobuf, because JSON
  was quite CPU intensive when I tried that and benchmarked it.

- The "Retry-After" value for failed lookups gets slowly increased from
  a default of 120 seconds, by 5 seconds for each failed lookup,
  independently by each discosrv. This lowers the query load over time for
  clients that are never seen. The Retry-After maxes out at 3600 after a
  couple of weeks of this increase. The number of failed lookups is
  stored in the database, now and then (avoiding making each lookup a
  database put).

All in all this means clients can be pointed towards a cluster using
just multiple A / AAAA records to gain both load sharing and redundancy
(if one is down, clients will talk to the remaining ones).

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4648
This commit is contained in:
Jakob Borg
2018-01-14 08:52:31 +00:00
parent 341b9691a7
commit 916ec63af6
864 changed files with 216825 additions and 64540 deletions

View File

@@ -1,141 +1,108 @@
// Copyright (C) 2014-2015 Jakob Borg and Contributors (see the CONTRIBUTORS file).
// Copyright (C) 2018 The Syncthing Authors.
//
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this file,
// You can obtain one at https://mozilla.org/MPL/2.0/.
package main
import (
"bytes"
"database/sql"
"fmt"
"io/ioutil"
"log"
"os"
"sync/atomic"
"time"
"github.com/prometheus/client_golang/prometheus"
)
type stats struct {
// Incremented atomically
announces int64
queries int64
answers int64
errors int64
}
func (s *stats) Announce() {
atomic.AddInt64(&s.announces, 1)
}
func (s *stats) Query() {
atomic.AddInt64(&s.queries, 1)
}
func (s *stats) Answer() {
atomic.AddInt64(&s.answers, 1)
}
func (s *stats) Error() {
atomic.AddInt64(&s.errors, 1)
}
// Reset returns a copy of the current stats and resets the counters to
// zero.
func (s *stats) Reset() stats {
// Create a copy of the stats using atomic reads
copy := stats{
announces: atomic.LoadInt64(&s.announces),
queries: atomic.LoadInt64(&s.queries),
answers: atomic.LoadInt64(&s.answers),
errors: atomic.LoadInt64(&s.errors),
}
// Reset the stats by subtracting the values that we copied
atomic.AddInt64(&s.announces, -copy.announces)
atomic.AddInt64(&s.queries, -copy.queries)
atomic.AddInt64(&s.answers, -copy.answers)
atomic.AddInt64(&s.errors, -copy.errors)
return copy
}
type statssrv struct {
intv time.Duration
file string
db *sql.DB
}
func (s *statssrv) Serve() {
lastReset := time.Now()
for {
time.Sleep(next(s.intv))
stats := globalStats.Reset()
d := time.Since(lastReset).Seconds()
lastReset = time.Now()
log.Printf("Stats: %.02f announces/s, %.02f queries/s, %.02f answers/s, %.02f errors/s",
float64(stats.announces)/d, float64(stats.queries)/d, float64(stats.answers)/d, float64(stats.errors)/d)
if s.file != "" {
s.writeToFile(stats, d)
}
}
}
func (s *statssrv) Stop() {
panic("stop unimplemented")
}
func (s *statssrv) writeToFile(stats stats, secs float64) {
newLine := []byte("\n")
var addrs int
row := s.db.QueryRow("SELECT COUNT(*) FROM Addresses")
if err := row.Scan(&addrs); err != nil {
log.Println("stats query:", err)
return
}
fd, err := os.OpenFile(s.file, os.O_RDWR|os.O_CREATE, 0666)
if err != nil {
log.Println("stats file:", err)
return
}
defer func() {
err = fd.Close()
if err != nil {
log.Println("stats file:", err)
}
}()
bs, err := ioutil.ReadAll(fd)
if err != nil {
log.Println("stats file:", err)
return
}
lines := bytes.Split(bytes.TrimSpace(bs), newLine)
if len(lines) > 12 {
lines = lines[len(lines)-12:]
}
latest := fmt.Sprintf("%v: %6d addresses, %8.02f announces/s, %8.02f queries/s, %8.02f answers/s, %8.02f errors/s\n",
time.Now().UTC().Format(time.RFC3339), addrs,
float64(stats.announces)/secs, float64(stats.queries)/secs, float64(stats.answers)/secs, float64(stats.errors)/secs)
lines = append(lines, []byte(latest))
_, err = fd.Seek(0, 0)
if err != nil {
log.Println("stats file:", err)
return
}
err = fd.Truncate(0)
if err != nil {
log.Println("stats file:", err)
return
}
_, err = fd.Write(bytes.Join(lines, newLine))
if err != nil {
log.Println("stats file:", err)
return
}
var (
apiRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "api_requests_total",
Help: "Number of API requests.",
}, []string{"type", "result"})
apiRequestsSeconds = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "api_requests_seconds",
Help: "Latency of API requests.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}, []string{"type"})
lookupRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "lookup_requests_total",
Help: "Number of lookup requests.",
}, []string{"result"})
announceRequestsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "announcement_requests_total",
Help: "Number of announcement requests.",
}, []string{"result"})
replicationSendsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "replication_sends_total",
Help: "Number of replication sends.",
}, []string{"result"})
replicationRecvsTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "replication_recvs_total",
Help: "Number of replication receives.",
}, []string{"result"})
databaseKeys = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "database_keys",
Help: "Number of database keys at last count.",
}, []string{"category"})
databaseStatisticsSeconds = prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "database_statistics_seconds",
Help: "Time spent running the statistics routine.",
})
databaseOperations = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "database_operations_total",
Help: "Number of database operations.",
}, []string{"operation", "result"})
databaseOperationSeconds = prometheus.NewSummaryVec(
prometheus.SummaryOpts{
Namespace: "syncthing",
Subsystem: "discovery",
Name: "database_operation_seconds",
Help: "Latency of database operations.",
Objectives: map[float64]float64{0.5: 0.05, 0.9: 0.01, 0.99: 0.001},
}, []string{"operation"})
)
const (
dbOpGet = "get"
dbOpPut = "put"
dbOpMerge = "merge"
dbResSuccess = "success"
dbResNotFound = "not_found"
dbResError = "error"
dbResUnmarshalError = "unmarsh_err"
)
func init() {
prometheus.MustRegister(apiRequestsTotal, apiRequestsSeconds,
lookupRequestsTotal, announceRequestsTotal,
replicationSendsTotal, replicationRecvsTotal,
databaseKeys, databaseStatisticsSeconds,
databaseOperations, databaseOperationSeconds)
}