cryptocore: prefetch nonces in 512-byte blocks

On my machine, reading 512-byte blocks from /dev/urandom (the same holds when going through the getentropy syscall) gives much higher throughput than reading 16-byte blocks:

    Blocksize    Throughput
           16    28.18 MB/s
          512    83.75 MB/s

For a single-threaded streaming write, this drops the CPU usage of nonceGenerator.Get to almost 1/3:

             flat   flat%    sum%    cum   cum%
    Before      0      0%  95.08%  0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
    After   0.01s  0.092%  92.34%  0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may hurt massively-parallel writes.
2017-06-09 21:52:26 +02:00 · 2017-06-09 21:52:26 +02:00 · 80516ed335
parent da1bd74246
commit 80516ed335
3 changed files with 91 additions and 2 deletions
--- a/internal/cryptocore/nonce.go
+++ b/internal/cryptocore/nonce.go
@@ -28,6 +28,5 @@ type nonceGenerator struct {

 // Get a random "nonceLen"-byte nonce
 func (n *nonceGenerator) Get() []byte {
-	nonce := RandBytes(n.nonceLen)
-	return nonce
+	return randPrefetcher.read(n.nonceLen)
 }
--- a/internal/cryptocore/randprefetch.go
+++ b/internal/cryptocore/randprefetch.go
@@ -0,0 +1,50 @@
+package cryptocore
+
+import (
+	"bytes"
+	"log"
+	"sync"
+)
+
+/*
+Number of bytes to prefetch.
+
+512 looks like a good compromise between throughput and latency:
+Benchmark16-2      	 3000000	       567 ns/op	  28.18 MB/s
+Benchmark64-2      	 5000000	       293 ns/op	  54.51 MB/s
+Benchmark128-2     	10000000	       220 ns/op	  72.48 MB/s
+Benchmark256-2     	10000000	       210 ns/op	  76.17 MB/s
+Benchmark512-2     	10000000	       191 ns/op	  83.75 MB/s
+Benchmark1024-2    	10000000	       171 ns/op	  93.48 MB/s
+Benchmark2048-2    	10000000	       165 ns/op	  96.45 MB/s
+Benchmark4096-2    	10000000	       165 ns/op	  96.58 MB/s
+Benchmark40960-2   	10000000	       147 ns/op	 108.82 MB/s
+*/
+const prefetchN = 512
+
+type randPrefetcherT struct {
+	sync.Mutex
+	buf bytes.Buffer
+}
+
+func (r *randPrefetcherT) read(want int) (out []byte) {
+	out = make([]byte, want)
+	r.Lock()
+	// Note: don't use defer, it slows us down!
+	have, err := r.buf.Read(out)
+	if have == want && err == nil {
+		r.Unlock()
+		return out
+	}
+	// Buffer was empty -> re-fill
+	r.buf.Reset()
+	r.buf.Write(RandBytes(prefetchN))
+	have, err = r.buf.Read(out)
+	if have != want || err != nil {
+		log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
+	}
+	r.Unlock()
+	return out
+}
+
+var randPrefetcher randPrefetcherT
--- a/internal/cryptocore/randprefetch_test.go
+++ b/internal/cryptocore/randprefetch_test.go
@@ -0,0 +1,40 @@
+package cryptocore
+
+import (
+	"bytes"
+	"compress/flate"
+	"runtime"
+	"sync"
+	"testing"
+)
+
+// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
+// that the result is incompressible
+func TestRandPrefetch(t *testing.T) {
+	runtime.GOMAXPROCS(10)
+	p := 100
+	l := 200
+	vec := make([][]byte, p)
+	var wg sync.WaitGroup
+	for i := 0; i < p; i++ {
+		wg.Add(1)
+		go func(i int) {
+			var tmp []byte
+			for x := 0; x < l; x++ {
+				tmp = append(tmp, randPrefetcher.read(l)...)
+			}
+			vec[i] = tmp
+			wg.Done()
+		}(i)
+	}
+	wg.Wait()
+	var b bytes.Buffer
+	fw, _ := flate.NewWriter(&b, flate.BestCompression)
+	for _, v := range vec {
+		fw.Write(v)
+	}
+	fw.Close()
+	if b.Len() < p*l*l {
+		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
+	}
+}