cryptocore: prefetch nonces in 512-byte blocks

On my machine, reading from /dev/urandom in 512-byte blocks
(the same holds when using the getentropy syscall) gives a much
higher throughput than reading in 16-byte blocks:

Blocksize    Throughput
 16          28.18 MB/s
512          83.75 MB/s

For a single-threaded streaming write, this drops the CPU usage of
nonceGenerator.Get to almost 1/3:

        flat  flat%   sum%        cum   cum%
Before     0     0% 95.08%      0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
After  0.01s 0.092% 92.34%      0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may
hurt massively-parallel writes.
This commit is contained in:
Jakob Unterwurzacher 2017-06-09 21:52:26 +02:00
parent da1bd74246
commit 80516ed335
3 changed files with 91 additions and 2 deletions

View File

@ -28,6 +28,5 @@ type nonceGenerator struct {
// Get returns a random nonce that is n.nonceLen bytes long.
//
// NOTE(review): this is a diff hunk rendered without +/- markers. Going by
// the hunk header (6 old lines, 5 new lines), the two RandBytes lines look
// like the removed version and the randPrefetcher.read line like its
// replacement - confirm against the commit.
func (n *nonceGenerator) Get() []byte {
nonce := RandBytes(n.nonceLen)
return nonce
return randPrefetcher.read(n.nonceLen)
}

View File

@ -0,0 +1,50 @@
package cryptocore
import (
"bytes"
"log"
"sync"
)
/*
prefetchN is the number of random bytes requested from RandBytes each time
the prefetch buffer is refilled.

512 looks like a good compromise between throughput and latency. Benchmark
results, where the number in the benchmark name is the block size in bytes:

Benchmark16-2 3000000 567 ns/op 28.18 MB/s
Benchmark64-2 5000000 293 ns/op 54.51 MB/s
Benchmark128-2 10000000 220 ns/op 72.48 MB/s
Benchmark256-2 10000000 210 ns/op 76.17 MB/s
Benchmark512-2 10000000 191 ns/op 83.75 MB/s
Benchmark1024-2 10000000 171 ns/op 93.48 MB/s
Benchmark2048-2 10000000 165 ns/op 96.45 MB/s
Benchmark4096-2 10000000 165 ns/op 96.58 MB/s
Benchmark40960-2 10000000 147 ns/op 108.82 MB/s
*/
const prefetchN = 512
type randPrefetcherT struct {
sync.Mutex
buf bytes.Buffer
}
func (r *randPrefetcherT) read(want int) (out []byte) {
out = make([]byte, want)
r.Lock()
// Note: don't use defer, it slows us down!
have, err := r.buf.Read(out)
if have == want && err == nil {
r.Unlock()
return out
}
// Buffer was empty -> re-fill
r.buf.Reset()
r.buf.Write(RandBytes(prefetchN))
have, err = r.buf.Read(out)
if have != want || err != nil {
log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
}
r.Unlock()
return out
}
var randPrefetcher randPrefetcherT

View File

@ -0,0 +1,40 @@
package cryptocore
import (
"bytes"
"compress/flate"
"runtime"
"sync"
"testing"
)
// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
// that the result is incompressible.
//
// Every goroutine performs l reads of l bytes each and stores the
// concatenation in its own slot of vec. All slots are then fed through a
// DEFLATE writer at the highest compression level; the test fails if the
// compressed output is smaller than the input.
func TestRandPrefetch(t *testing.T) {
	// Raise GOMAXPROCS for this test only. The inner call runs now and returns
	// the previous setting, which the deferred outer call restores on exit.
	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(10))
	p := 100
	l := 200
	vec := make([][]byte, p)
	var wg sync.WaitGroup
	for i := 0; i < p; i++ {
		wg.Add(1)
		go func(i int) {
			// Deferred so the WaitGroup is released even if read panics.
			defer wg.Done()
			// Final size is known up front: l reads of l bytes.
			tmp := make([]byte, 0, l*l)
			for x := 0; x < l; x++ {
				tmp = append(tmp, randPrefetcher.read(l)...)
			}
			// Each goroutine writes only its own index, so no locking is needed.
			vec[i] = tmp
		}(i)
	}
	wg.Wait()
	var b bytes.Buffer
	fw, err := flate.NewWriter(&b, flate.BestCompression)
	if err != nil {
		t.Fatal(err)
	}
	for _, v := range vec {
		if _, err := fw.Write(v); err != nil {
			t.Fatal(err)
		}
	}
	// Close flushes the remaining compressed data into b.
	if err := fw.Close(); err != nil {
		t.Fatal(err)
	}
	if b.Len() < p*l*l {
		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
	}
}