cryptocore: prefetch nonces in 512-byte blocks
On my machine, reading 512-byte blocks from /dev/urandom (same via the getentropy syscall) is a lot faster in terms of throughput:

    Blocksize    Throughput
           16    28.18 MB/s
          512    83.75 MB/s

For a single-threaded streaming write, this drops the CPU usage of nonceGenerator.Get to almost 1/3:

              flat    flat%    sum%     cum    cum%
    Before       0       0%  95.08%   0.35s   2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
    After    0.01s   0.092%  92.34%   0.13s   1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may hurt massively-parallel writes.
This commit is contained in:
parent
da1bd74246
commit
80516ed335
@ -28,6 +28,5 @@ type nonceGenerator struct {
|
||||
|
||||
// Get a random "nonceLen"-byte nonce
|
||||
func (n *nonceGenerator) Get() []byte {
|
||||
nonce := RandBytes(n.nonceLen)
|
||||
return nonce
|
||||
return randPrefetcher.read(n.nonceLen)
|
||||
}
|
||||
|
50
internal/cryptocore/randprefetch.go
Normal file
50
internal/cryptocore/randprefetch.go
Normal file
@ -0,0 +1,50 @@
|
||||
package cryptocore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"log"
|
||||
"sync"
|
||||
)
|
||||
|
||||
/*
Number of bytes to prefetch.

512 looks like a good compromise between throughput and latency:

Benchmark16-2        3000000    567 ns/op     28.18 MB/s
Benchmark64-2        5000000    293 ns/op     54.51 MB/s
Benchmark128-2      10000000    220 ns/op     72.48 MB/s
Benchmark256-2      10000000    210 ns/op     76.17 MB/s
Benchmark512-2      10000000    191 ns/op     83.75 MB/s
Benchmark1024-2     10000000    171 ns/op     93.48 MB/s
Benchmark2048-2     10000000    165 ns/op     96.45 MB/s
Benchmark4096-2     10000000    165 ns/op     96.58 MB/s
Benchmark40960-2    10000000    147 ns/op    108.82 MB/s
*/
|
||||
const prefetchN = 512 // bytes requested from RandBytes on each refill of the prefetch buffer
|
||||
|
||||
// randPrefetcherT hands out random bytes from an in-memory buffer.
// The embedded mutex serializes access to buf; read takes it before
// touching the buffer.
type randPrefetcherT struct {
	// Guards buf.
	sync.Mutex

	// Random bytes that have been fetched but not yet handed out.
	buf bytes.Buffer
}
|
||||
|
||||
func (r *randPrefetcherT) read(want int) (out []byte) {
|
||||
out = make([]byte, want)
|
||||
r.Lock()
|
||||
// Note: don't use defer, it slows us down!
|
||||
have, err := r.buf.Read(out)
|
||||
if have == want && err == nil {
|
||||
r.Unlock()
|
||||
return out
|
||||
}
|
||||
// Buffer was empty -> re-fill
|
||||
r.buf.Reset()
|
||||
r.buf.Write(RandBytes(prefetchN))
|
||||
have, err = r.buf.Read(out)
|
||||
if have != want || err != nil {
|
||||
log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
|
||||
}
|
||||
r.Unlock()
|
||||
return out
|
||||
}
|
||||
|
||||
// randPrefetcher is the package-level prefetcher instance; nonceGenerator.Get
// calls its read method to obtain nonces.
var randPrefetcher randPrefetcherT
|
40
internal/cryptocore/randprefetch_test.go
Normal file
40
internal/cryptocore/randprefetch_test.go
Normal file
@ -0,0 +1,40 @@
|
||||
package cryptocore
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/flate"
|
||||
"runtime"
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
|
||||
// that the result is incompressible
|
||||
func TestRandPrefetch(t *testing.T) {
|
||||
runtime.GOMAXPROCS(10)
|
||||
p := 100
|
||||
l := 200
|
||||
vec := make([][]byte, p)
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < p; i++ {
|
||||
wg.Add(1)
|
||||
go func(i int) {
|
||||
var tmp []byte
|
||||
for x := 0; x < l; x++ {
|
||||
tmp = append(tmp, randPrefetcher.read(l)...)
|
||||
}
|
||||
vec[i] = tmp
|
||||
wg.Done()
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
var b bytes.Buffer
|
||||
fw, _ := flate.NewWriter(&b, flate.BestCompression)
|
||||
for _, v := range vec {
|
||||
fw.Write(v)
|
||||
}
|
||||
fw.Close()
|
||||
if b.Len() < p*l*l {
|
||||
t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
|
||||
}
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user