diff --git a/pkg/abi/linux/fs.go b/pkg/abi/linux/fs.go index e1c4a13bc6..f7abb18b72 100644 --- a/pkg/abi/linux/fs.go +++ b/pkg/abi/linux/fs.go @@ -14,6 +14,12 @@ package linux +import ( + "math" + + "gvisor.dev/gvisor/pkg/hostarch" +) + // Filesystem types used in statfs(2). // // See linux/magic.h. @@ -127,3 +133,8 @@ const ( WHITEOUT_MODE = 0 WHITEOUT_DEV = 0 ) + +// MAX_RW_COUNT is the maximum size in bytes of a single read or write. +// Reads and writes that exceed this size may be truncated. +// (Linux: include/linux/fs.h:MAX_RW_COUNT) +var MAX_RW_COUNT = int(hostarch.PageRoundDown(uint32(math.MaxInt32))) diff --git a/pkg/sentry/kernel/BUILD b/pkg/sentry/kernel/BUILD index 11b6b4331c..607724d939 100644 --- a/pkg/sentry/kernel/BUILD +++ b/pkg/sentry/kernel/BUILD @@ -390,6 +390,7 @@ go_library( "//pkg/sentry/seccheck/points:points_go_proto", "//pkg/sentry/socket/netlink/port", "//pkg/sentry/socket/unix/transport", + "//pkg/sentry/state/stateio", "//pkg/sentry/time", "//pkg/sentry/unimpl", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", diff --git a/pkg/sentry/kernel/kernel_restore.go b/pkg/sentry/kernel/kernel_restore.go index 4b3bd86180..8a57d9d9c2 100644 --- a/pkg/sentry/kernel/kernel_restore.go +++ b/pkg/sentry/kernel/kernel_restore.go @@ -15,15 +15,14 @@ package kernel import ( - "bufio" "fmt" "io" "gvisor.dev/gvisor/pkg/cleanup" "gvisor.dev/gvisor/pkg/context" - "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/pgalloc" + "gvisor.dev/gvisor/pkg/sentry/state/stateio" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/timing" @@ -272,7 +271,7 @@ type AsyncMFLoader struct { // If timeline is provided, it will be used to track async page loading. // It takes ownership of the timeline, and will end it when done loading all // pages. -func NewAsyncMFLoader(pagesMetadata, pagesFile *fd.FD, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) *AsyncMFLoader { +func NewAsyncMFLoader(pagesMetadata io.ReadCloser, pagesFile stateio.AsyncReader, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) *AsyncMFLoader { mfl := &AsyncMFLoader{ privateMFsChan: make(chan map[string]*pgalloc.MemoryFile, 1), } @@ -283,27 +282,25 @@ func NewAsyncMFLoader(pagesMetadata, pagesFile *fd.FD, mainMF *pgalloc.MemoryFil return mfl } -func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.FD, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) { +func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadata io.ReadCloser, pagesFile stateio.AsyncReader, mainMF *pgalloc.MemoryFile, timeline *timing.Timeline) { defer timeline.End() - defer pagesMetadataFD.Close() - defer pagesFileFD.Close() + defer pagesMetadata.Close() cu := cleanup.Make(func() { mfl.metadataWg.Done() mfl.loadWg.Done() }) defer cu.Clean() - // //pkg/state/wire reads one byte at a time; buffer these reads to - // avoid making one syscall per read. For the "main" state file, this - // buffering is handled by statefile.NewReader() => compressio.Reader - // or compressio.NewSimpleReader(). 
- pagesMetadata := bufio.NewReader(pagesMetadataFD) - mfl.loadWg.Add(1) - apfl := pgalloc.StartAsyncPagesFileLoad(int32(pagesFileFD.FD()), func(err error) { + apfl, err := pgalloc.StartAsyncPagesFileLoad(pagesFile, func(err error) { defer mfl.loadWg.Done() mfl.loadErr = err - }, timeline) + }, timeline) // transfers ownership of pagesFile + if err != nil { + mfl.loadWg.Done() + log.Warningf("Failed to start async page loading: %v", err) + return + } cu.Add(apfl.MemoryFilesDone) opts := pgalloc.LoadOpts{ @@ -314,7 +311,7 @@ func (mfl *AsyncMFLoader) backgroundGoroutine(pagesMetadataFD, pagesFileFD *fd.F timeline.Reached("loading mainMF") log.Infof("Loading metadata for main MemoryFile: %p", mainMF) ctx := context.Background() - err := mainMF.LoadFrom(ctx, pagesMetadata, &opts) + err = mainMF.LoadFrom(ctx, pagesMetadata, &opts) mfl.metadataErr = err mfl.mainMetadataErr = err mfl.mainMFStartWg.Done() diff --git a/pkg/sentry/kernel/task_usermem.go b/pkg/sentry/kernel/task_usermem.go index 6d32c8bc67..cda826f3cc 100644 --- a/pkg/sentry/kernel/task_usermem.go +++ b/pkg/sentry/kernel/task_usermem.go @@ -28,11 +28,6 @@ import ( const iovecLength = 16 -// MAX_RW_COUNT is the maximum size in bytes of a single read or write. -// Reads and writes that exceed this size may be silently truncated. -// (Linux: include/linux/fs.h:MAX_RW_COUNT) -var MAX_RW_COUNT = int(hostarch.Addr(math.MaxInt32).RoundDown()) - // Activate ensures that the task has an active address space. func (t *Task) Activate() { if mm := t.MemoryManager(); mm != nil { @@ -190,7 +185,7 @@ func copyInIovec(ctx marshal.CopyContext, t *Task, addr hostarch.Addr) (hostarch if err != nil { return hostarch.AddrRangeSeq{}, err } - return hostarch.AddrRangeSeqOf(ar).TakeFirst(MAX_RW_COUNT), nil + return hostarch.AddrRangeSeqOf(ar).TakeFirst(linux.MAX_RW_COUNT), nil } // copyInIovecs copies an array of numIovecs struct iovecs from the memory @@ -243,7 +238,7 @@ func copyInIovecs(ctx marshal.CopyContext, t *Task, addr hostarch.Addr, numIovec var total uint64 for i := range dst { dstlen := uint64(dst[i].Length()) - if rem := uint64(MAX_RW_COUNT) - total; rem < dstlen { + if rem := uint64(linux.MAX_RW_COUNT) - total; rem < dstlen { dst[i].End -= hostarch.Addr(dstlen - rem) dstlen = rem } @@ -288,8 +283,8 @@ func makeIovec(ctx marshal.CopyContext, t *Task, addr hostarch.Addr, b []byte) ( // access_ok() in fs/read_write.c:vfs_read/vfs_write, and overflowing address // ranges are truncated to MAX_RW_COUNT by fs/read_write.c:rw_verify_area().) 
func (t *Task) SingleIOSequence(addr hostarch.Addr, length int, opts usermem.IOOpts) (usermem.IOSequence, error) { - if length > MAX_RW_COUNT { - length = MAX_RW_COUNT + if length > linux.MAX_RW_COUNT { + length = linux.MAX_RW_COUNT } ar, ok := t.MemoryManager().CheckIORange(addr, int64(length)) if !ok { diff --git a/pkg/sentry/pgalloc/BUILD b/pkg/sentry/pgalloc/BUILD index 02263596c1..d8c0721747 100644 --- a/pkg/sentry/pgalloc/BUILD +++ b/pkg/sentry/pgalloc/BUILD @@ -166,7 +166,6 @@ go_library( visibility = ["//pkg/sentry:internal"], deps = [ "//pkg/abi/linux", - "//pkg/aio", "//pkg/atomicbitops", "//pkg/bitmap", "//pkg/context", @@ -180,6 +179,7 @@ go_library( "//pkg/sentry/arch", "//pkg/sentry/hostmm", "//pkg/sentry/memmap", + "//pkg/sentry/state/stateio", "//pkg/sentry/usage", "//pkg/state", "//pkg/state/wire", diff --git a/pkg/sentry/pgalloc/pgalloc_unsafe.go b/pkg/sentry/pgalloc/pgalloc_unsafe.go index 48fb3964f3..50da2040dc 100644 --- a/pkg/sentry/pgalloc/pgalloc_unsafe.go +++ b/pkg/sentry/pgalloc/pgalloc_unsafe.go @@ -37,10 +37,6 @@ func mincore(s []byte, buf []byte, off uint64, wasCommitted bool) error { return nil } -func sliceFromIovec(iov unix.Iovec) []byte { - return unsafe.Slice(iov.Base, iov.Len) -} - func canMergeIovecAndSlice(iov unix.Iovec, bs []byte) bool { return uintptr(unsafe.Pointer(iov.Base))+uintptr(iov.Len) == uintptr(unsafe.Pointer(unsafe.SliceData(bs))) } diff --git a/pkg/sentry/pgalloc/save_restore.go b/pkg/sentry/pgalloc/save_restore.go index 2135d9529b..19127addc8 100644 --- a/pkg/sentry/pgalloc/save_restore.go +++ b/pkg/sentry/pgalloc/save_restore.go @@ -25,7 +25,6 @@ import ( "time" "golang.org/x/sys/unix" - "gvisor.dev/gvisor/pkg/aio" "gvisor.dev/gvisor/pkg/atomicbitops" "gvisor.dev/gvisor/pkg/bitmap" "gvisor.dev/gvisor/pkg/errors/linuxerr" @@ -35,6 +34,7 @@ import ( "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/ringdeque" "gvisor.dev/gvisor/pkg/sentry/memmap" + "gvisor.dev/gvisor/pkg/sentry/state/stateio" "gvisor.dev/gvisor/pkg/sentry/usage" "gvisor.dev/gvisor/pkg/state" "gvisor.dev/gvisor/pkg/state/wire" @@ -302,7 +302,8 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts) f.chunks.Store(&chunks) mfTimeline.Reached("metadata loaded") log.Infof("MemoryFile(%p): loaded metadata in %s", f, time.Since(timeMetadataStart)) - if err := f.file.Truncate(int64(len(chunks)) * chunkSize); err != nil { + fileSize := uint64(len(chunks)) * chunkSize + if err := f.file.Truncate(int64(fileSize)); err != nil { return fmt.Errorf("failed to truncate MemoryFile: %w", err) } // Obtain chunk mappings, then madvise them concurrently with loading data. @@ -315,7 +316,7 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts) m, _, errno := unix.Syscall6( unix.SYS_MMAP, 0, - uintptr(len(chunks)*chunkSize), + uintptr(fileSize), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED, f.file.Fd(), @@ -349,9 +350,18 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts) // been provided. 
var amfl *asyncMemoryFileLoad if opts.PagesFile != nil { + var df stateio.DestinationFile + if opts.PagesFile.ar.NeedRegisterDestinationFD() { + var err error + df, err = opts.PagesFile.ar.RegisterDestinationFD(int32(f.file.Fd()), fileSize, f.getClientFileRangeSettings(fileSize)) + if err != nil { + return fmt.Errorf("failed to register MemoryFile with pages file: %w", err) + } + } amfl = &asyncMemoryFileLoad{ f: f, pf: opts.PagesFile, + df: df, timeline: mfTimeline.Transfer(), } amfl.pf.amflsMu.Lock() @@ -456,18 +466,6 @@ func (f *MemoryFile) LoadFrom(ctx context.Context, r io.Reader, opts *LoadOpts) return nil } -const ( - // When a pages file is provided, reads from it will be issued - // asynchronously via an aio.Queue of capacity aplQueueCapacity, and each - // read will be of size aplReadMaxBytes when possible; reads may be smaller - // in some circumstances but will never be larger. - // TODO: Pass these via LoadOpts and make them flag-controlled. - aplReadMaxBytes = 256 * 1024 - aplQueueCapacity = 128 - - aplOpMaxIovecs = aplReadMaxBytes / hostarch.PageSize -) - // AsyncPagesFileLoad holds async page loading state for a single pages file. type AsyncPagesFileLoad struct { // loadOff is the offset in the pages file from which the next page should @@ -530,12 +528,15 @@ type AsyncPagesFileLoad struct { timeline *timing.Timeline // immutable doneCallback func(error) // immutable - // q issues reads to the pages file. GoQueue is faster than LinuxQueue here - // since it can allocate and zero MemoryFile pages in parallel. q is + // ar is the pages file. ar is immutable. + ar stateio.AsyncReader + + // maxReadBytes is hostarch.PageRoundDown(ar.MaxReadBytes()), cached to + // avoid interface method calls and recomputation. maxReadBytes is // immutable. - q *aio.GoQueue + maxReadBytes uint64 - // qavail is unused capacity in q. + // qavail is unused capacity in ar. qavail int // The async page loader combines multiple loads with contiguous pages file @@ -546,15 +547,12 @@ type AsyncPagesFileLoad struct { curOp *aplOp curOpID uint32 - // fd is the host file descriptor for the pages file. - fd int32 // immutable - // opsBusy tracks which aplOps in ops are in use (correspond to // inflight operations or curOp). opsBusy bitmap.Bitmap // ops stores all aplOps. - ops [aplQueueCapacity]aplOp + ops []aplOp } // Possible events in AsyncPagesFileLoad.lfStatus: @@ -577,9 +575,10 @@ func (apfl *AsyncPagesFileLoad) err() error { // asyncMemoryFileLoad holds async page loading state for a single MemoryFile. type asyncMemoryFileLoad struct { - f *MemoryFile // immutable - pf *AsyncPagesFileLoad // immutable - timeline *timing.Timeline // immutable + f *MemoryFile // immutable + pf *AsyncPagesFileLoad // immutable + df stateio.DestinationFile // immutable + timeline *timing.Timeline // immutable // minUnloaded is the MemoryFile offset of the first unloaded byte. minUnloaded atomicbitops.Uint64 @@ -653,53 +652,53 @@ type aplOp struct { // amfl represents the MemoryFile being loaded. amfl *asyncMemoryFileLoad - // frs() = frsData[:frsLen] are the MemoryFile ranges being loaded. - frsData [aplOpMaxIovecs]memmap.FileRange - frsLen uint8 + // frs are the MemoryFile ranges being loaded. + frs []memmap.FileRange - // iovecsLen is described below, but stored here to minimize alignment - // padding. - iovecsLen uint8 + // iovecs contains mappings of frs. 
+	iovecs []unix.Iovec
 
-	// If tempRef is true, a temporary reference is held on pages in frs() that
+	// If tempRef is true, a temporary reference is held on pages in frs that
 	// should be dropped after completion.
 	tempRef bool
-
-	// iovecs() = iovecsData[:iovecsLen] contains mappings of frs().
-	iovecsData [aplOpMaxIovecs]unix.Iovec
 }
 
 func (op *aplOp) off() int64 {
 	return int64(op.end - op.total)
 }
 
-func (op *aplOp) frs() []memmap.FileRange {
-	return op.frsData[:op.frsLen]
-}
-
-func (op *aplOp) iovecs() []unix.Iovec {
-	return op.iovecsData[:op.iovecsLen]
-}
-
 // StartAsyncPagesFileLoad constructs asynchronous loading state for the pages
-// file with host file descriptor pagesFD. It does not take ownership of
-// pagesFD, which must remain valid until doneCallback is invoked.
-func StartAsyncPagesFileLoad(pagesFD int32, doneCallback func(error), timeline *timing.Timeline) *AsyncPagesFileLoad {
+// file ar. It takes ownership of ar, even if it returns a non-nil error.
+func StartAsyncPagesFileLoad(ar stateio.AsyncReader, doneCallback func(error), timeline *timing.Timeline) (*AsyncPagesFileLoad, error) {
+	maxReadBytes := hostarch.PageRoundDown(ar.MaxReadBytes())
+	if maxReadBytes == 0 {
+		ar.Close()
+		return nil, fmt.Errorf("stateio.AsyncReader.MaxReadBytes() (%d) must be at least one page", ar.MaxReadBytes())
+	}
+	maxParallel := ar.MaxParallel()
 	apfl := &AsyncPagesFileLoad{
 		timeline:     timeline.Fork("async page loading"),
 		doneCallback: doneCallback,
-		q:            aio.NewGoQueue(aplQueueCapacity),
-		qavail:       aplQueueCapacity,
-		fd:           pagesFD,
-		opsBusy:      bitmap.New(aplQueueCapacity),
+		ar:           ar,
+		maxReadBytes: maxReadBytes,
+		qavail:       maxParallel,
+		opsBusy:      bitmap.New(uint32(maxParallel)),
+		ops:          make([]aplOp, maxParallel),
 	}
 	// Mark ops in opsBusy that don't actually exist as permanently busy.
-	for i, n := aplQueueCapacity, apfl.opsBusy.Size(); i < n; i++ {
+	for i, n := maxParallel, apfl.opsBusy.Size(); i < n; i++ {
 		apfl.opsBusy.Add(uint32(i))
 	}
+	// Pre-allocate slices in ops.
+	maxRanges := ar.MaxRanges()
+	for i := range apfl.ops {
+		op := &apfl.ops[i]
+		op.frs = make([]memmap.FileRange, 0, maxRanges)
+		op.iovecs = make([]unix.Iovec, 0, maxRanges)
+	}
 	apfl.lfStatus.Init()
 	go apfl.main()
-	return apfl
+	return apfl, nil
 }
 
 // MemoryFilesDone must be called after calling LoadFrom() for all MemoryFiles
@@ -807,21 +806,21 @@ func (apfl *AsyncPagesFileLoad) enqueueCurOp() {
 	if op.total == 0 {
 		panic("invalid read of 0 bytes")
 	}
-	if op.total > aplReadMaxBytes {
-		panic(fmt.Sprintf("read of %d bytes exceeds per-read limit of %d bytes", op.total, aplReadMaxBytes))
+	if op.total > apfl.maxReadBytes {
+		panic(fmt.Sprintf("read of %d bytes exceeds per-read limit of %d bytes", op.total, apfl.maxReadBytes))
 	}
 	apfl.qavail--
 	apfl.curOp = nil
-	if op.iovecsLen == 1 {
-		// Perform a non-vectorized read to save an indirection (and
-		// userspace-to-kernelspace copy) in the aio.Queue implementation.
-		aio.Read(apfl.q, uint64(apfl.curOpID), apfl.fd, op.off(), sliceFromIovec(op.iovecsData[0]))
+	if len(op.frs) == 1 && len(op.iovecs) == 1 {
+		// Perform a non-vectorized read to save an indirection (and possible
+		// userspace-to-kernelspace copy) in the AsyncReader implementation.
+ apfl.ar.AddRead(int(apfl.curOpID), op.off(), op.amfl.df, op.frs[0], stateio.SliceFromIovec(op.iovecs[0])) } else { - aio.Readv(apfl.q, uint64(apfl.curOpID), apfl.fd, op.off(), op.iovecs()) + apfl.ar.AddReadv(int(apfl.curOpID), op.off(), op.total, op.amfl.df, op.frs, op.iovecs) } if logAwaitedLoads && !op.tempRef { - log.Infof("MemoryFile(%p): awaited opid %d start, read %d bytes: %v", op.amfl.f, apfl.curOpID, op.total, op.frs()) + log.Infof("MemoryFile(%p): awaited opid %d start, read %d bytes: %v", op.amfl.f, apfl.curOpID, op.total, op.frs) } } @@ -839,8 +838,8 @@ func (apfl *AsyncPagesFileLoad) enqueueRange(amfl *asyncMemoryFileLoad, fr memma apfl.opsBusy.Add(id) op := &apfl.ops[id] op.total = 0 - op.frsLen = 0 - op.iovecsLen = 0 + op.frs = op.frs[:0] + op.iovecs = op.iovecs[:0] apfl.curOp = op apfl.curOpID = id } @@ -870,12 +869,10 @@ func (apfl *AsyncPagesFileLoad) combine(amfl *asyncMemoryFileLoad, fr memmap.Fil return 0 } if op.amfl != amfl { - // Differing MemoryFile. We could handle this by making the - // asyncMemoryFileLoad per-FileRange, but this would bloat aplOp - // and should happen very infrequently. + // Differing MemoryFile. return 0 } - if int(op.frsLen) == len(op.frsData) && op.frsData[op.frsLen-1].End != fr.Start { + if len(op.frs) == cap(op.frs) && op.frs[len(op.frs)-1].End != fr.Start { // Non-contiguous in the MemoryFile, and we're out of space for // FileRanges. return 0 @@ -891,8 +888,8 @@ func (apfl *AsyncPagesFileLoad) combine(amfl *asyncMemoryFileLoad, fr memmap.Fil // Apply direct length limits. n := fr.Length() - if op.total+n >= aplReadMaxBytes { - n = aplReadMaxBytes - op.total + if op.total+n >= apfl.maxReadBytes { + n = apfl.maxReadBytes - op.total } if n == 0 { return 0 @@ -902,19 +899,20 @@ func (apfl *AsyncPagesFileLoad) combine(amfl *asyncMemoryFileLoad, fr memmap.Fil // Collect iovecs, which may further limit length. n = 0 amfl.f.forEachMappingSlice(fr, func(bs []byte) { - if op.iovecsLen > 0 { - if canMergeIovecAndSlice(op.iovecsData[op.iovecsLen-1], bs) { - op.iovecsData[op.iovecsLen-1].Len += uint64(len(bs)) + if len(op.iovecs) > 0 { + if canMergeIovecAndSlice(op.iovecs[len(op.iovecs)-1], bs) { + op.iovecs[len(op.iovecs)-1].Len += uint64(len(bs)) n += uint64(len(bs)) return } - if int(op.iovecsLen) == len(op.iovecsData) { + if len(op.iovecs) == cap(op.iovecs) { return } } - op.iovecsData[op.iovecsLen].Base = &bs[0] - op.iovecsData[op.iovecsLen].SetLen(len(bs)) - op.iovecsLen++ + op.iovecs = append(op.iovecs, unix.Iovec{ + Base: &bs[0], + Len: uint64(len(bs)), + }) n += uint64(len(bs)) }) if n == 0 { @@ -930,20 +928,22 @@ func (apfl *AsyncPagesFileLoad) combine(amfl *asyncMemoryFileLoad, fr memmap.Fil op.end += n op.total += n op.tempRef = tempRef - if op.frsLen > 0 && op.frsData[op.frsLen-1].End == fr.Start { - op.frsData[op.frsLen-1].End = fr.End + if len(op.frs) > 0 && op.frs[len(op.frs)-1].End == fr.Start { + op.frs[len(op.frs)-1].End = fr.End } else { - op.frsData[op.frsLen] = fr - op.frsLen++ + op.frs = append(op.frs, fr) } return n } func (apfl *AsyncPagesFileLoad) main() { - q := apfl.q defer func() { - // Destroy q first since this synchronously stops inflight I/O. - q.Destroy() + // Close ar first since this synchronously stops inflight I/O. + if err := apfl.ar.Close(); err != nil { + // Completed reads are complete irrespective of err, so log err + // rather than propagating it. 
+ log.Warningf("Async page loading: stateio.AsyncReader.Close failed: %v", err) + } apfl.timeline.End() // Wake up any remaining waiters so that they can observe apfl.err(). // Leave all segments in asyncMemoryFileLoad.unloaded so that new @@ -972,8 +972,9 @@ func (apfl *AsyncPagesFileLoad) main() { } }() + maxParallel := apfl.ar.MaxParallel() // Storage reused between main loop iterations: - var completions []aio.Completion + var completions []stateio.Completion var wakeups []*aplWaiter var decRefs []aplFileRange @@ -1140,7 +1141,7 @@ func (apfl *AsyncPagesFileLoad) main() { apfl.enqueueCurOp() } - if apfl.qavail == aplQueueCapacity { + if apfl.qavail == maxParallel { // We are out of work to do. ev := apfl.lfStatus.Wait() if ev&aplLFPending != 0 { @@ -1162,9 +1163,9 @@ func (apfl *AsyncPagesFileLoad) main() { // Wait for any number of reads to complete. var err error - completions, err = q.Wait(completions[:0], 1 /* minCompletions */) + completions, err = apfl.ar.Wait(completions[:0], 1 /* minCompletions */) if err != nil { - log.Warningf("Async page loading failed: aio.Queue.Wait failed: %v", err) + log.Warningf("Async page loading failed: stateio.AsyncReader.Wait failed: %v", err) apfl.mu.Lock() apfl.errVal.Store(linuxerr.EIO) apfl.mu.Unlock() @@ -1183,21 +1184,12 @@ func (apfl *AsyncPagesFileLoad) main() { // required to avoid lock recursion via dropping the last // reference => asyncMemoryFileLoad.cancelWasteLoad() => // apfl.mu.Lock(). - for _, fr := range op.frs() { + for _, fr := range op.frs { decRefs = append(decRefs, aplFileRange{op.amfl, fr}) } } - if err := c.Err(); err != nil { - log.Warningf("Async page loading failed: read for MemoryFile(%p) pages %v failed: %v", op.amfl.f, op.frs(), err) - apfl.errVal.Store(err) - apfl.mu.Unlock() - apfl.amflsMu.Unlock() - return - } - if uint64(c.Result) != op.total { - // TODO: Is this something we actually have to worry about? If - // so, we need to reissue the remainder of the read... - log.Warningf("Async page loading failed: read for MemoryFile(%p) pages %v (total %d bytes) returned %d bytes", op.amfl.f, op.frs(), op.total, c.Result) + if c.N != op.total { + log.Warningf("Async page loading failed: read for MemoryFile(%p) pages %v (total %d bytes) returned %d bytes, error: %v", op.amfl.f, op.frs, op.total, c.N, c.Err) apfl.errVal.Store(linuxerr.EIO) apfl.mu.Unlock() apfl.amflsMu.Unlock() @@ -1207,7 +1199,7 @@ func (apfl *AsyncPagesFileLoad) main() { amfl := op.amfl haveWaiters := false now := int64(0) - for _, fr := range op.frs() { + for _, fr := range op.frs { // All pages in fr have been started and were split around fr // when they were started (above), and fr.amfl.unloaded never // merges started segments. Thus, we don't need to split around @@ -1241,7 +1233,7 @@ func (apfl *AsyncPagesFileLoad) main() { } } if logAwaitedLoads && haveWaiters { - log.Infof("MemoryFile(%p): awaited opid %d complete, read %d bytes: %v", op.amfl.f, c.ID, op.total, op.frs()) + log.Infof("MemoryFile(%p): awaited opid %d complete, read %d bytes: %v", op.amfl.f, c.ID, op.total, op.frs) } // Keep amfl.minUnloaded up to date. We can only determine this // accurately if insertions into amfl.unloaded are complete. 
@@ -1331,3 +1323,29 @@ func (aplUnloadedSetFunctions) Split(fr memmap.FileRange, ul aplUnloadedInfo, sp } return ul, ul2 } + +func (f *MemoryFile) getClientFileRangeSettings(fileSize uint64) []stateio.ClientFileRangeSetting { + if !f.opts.AdviseHugepage && !f.opts.AdviseNoHugepage { + return nil + } + var cfrs []stateio.ClientFileRangeSetting + f.forEachChunk(memmap.FileRange{0, fileSize}, func(chunk *chunkInfo, chunkFR memmap.FileRange) bool { + if chunk.huge { + if f.opts.AdviseHugepage { + cfrs = append(cfrs, stateio.ClientFileRangeSetting{ + FileRange: chunkFR, + Property: stateio.PropertyHugepage, + }) + } + } else { + if f.opts.AdviseNoHugepage { + cfrs = append(cfrs, stateio.ClientFileRangeSetting{ + FileRange: chunkFR, + Property: stateio.PropertyNoHugepage, + }) + } + } + return true + }) + return cfrs +} diff --git a/pkg/sentry/state/stateio/BUILD b/pkg/sentry/state/stateio/BUILD new file mode 100644 index 0000000000..1f776e1ead --- /dev/null +++ b/pkg/sentry/state/stateio/BUILD @@ -0,0 +1,25 @@ +load("//tools:defs.bzl", "go_library") + +package( + default_applicable_licenses = ["//:license"], + licenses = ["notice"], +) + +go_library( + name = "stateio", + srcs = [ + "bufio.go", + "fdreader.go", + "stateio.go", + "stateio_impl_util.go", + "stateio_util_unsafe.go", + ], + visibility = ["//pkg/sentry:internal"], + deps = [ + "//pkg/abi/linux", + "//pkg/aio", + "//pkg/sentry/hostfd", + "//pkg/sentry/memmap", + "@org_golang_x_sys//unix:go_default_library", + ], +) diff --git a/pkg/sentry/state/stateio/bufio.go b/pkg/sentry/state/stateio/bufio.go new file mode 100644 index 0000000000..0281fae003 --- /dev/null +++ b/pkg/sentry/state/stateio/bufio.go @@ -0,0 +1,37 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stateio + +import ( + "bufio" + "io" +) + +// BufioReadCloser is a wrapper around bufio.Reader that implements io.Closer +// by closing the underlying io.ReadCloser. +type BufioReadCloser struct { + bufio.Reader + io.Closer +} + +// NewBufioReadCloser returns a new BufioReadCloser whose buffer has the +// default size. +func NewBufioReadCloser(rc io.ReadCloser) *BufioReadCloser { + brc := &BufioReadCloser{ + Closer: rc, + } + brc.Reader.Reset(rc) + return brc +} diff --git a/pkg/sentry/state/stateio/fdreader.go b/pkg/sentry/state/stateio/fdreader.go new file mode 100644 index 0000000000..4f27842a2a --- /dev/null +++ b/pkg/sentry/state/stateio/fdreader.go @@ -0,0 +1,173 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package stateio + +import ( + "io" + + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/aio" + "gvisor.dev/gvisor/pkg/sentry/hostfd" + "gvisor.dev/gvisor/pkg/sentry/memmap" +) + +// FDReader implements AsyncReader for a host file descriptor. +type FDReader struct { + NoRegisterClientFD + + fd int32 + maxReadBytes uint32 + maxRanges uint32 + // aio.GoQueue is preferred over aio.LinuxQueue for our use cases since it + // can allocate and zero destination pages in parallel. + q *aio.GoQueue + inflight []fdRead + cs []aio.Completion +} + +type fdRead struct { + off int64 + done uint64 + total uint64 + dst LocalClientRanges +} + +// NewFDReader returns a FDReader that reads from the given host +// file descriptor. It takes ownership of the file descriptor. +// +// Note that FDReader.MaxReadBytes()/MaxRanges() may be less than the specified +// maxReadBytes/maxRanges respectively, due to implementation constraints. +// +// Preconditions: +// - maxReadBytes > 0. +// - maxRanges > 0. +// - maxParallel > 0. +func NewFDReader(fd int32, maxReadBytes uint64, maxRanges, maxParallel int) *FDReader { + if maxReadBytes <= 0 { + panic("invalid maxReadBytes") + } + if maxRanges <= 0 { + panic("invalid maxRanges") + } + if maxParallel <= 0 { + panic("invalid maxParallel") + } + return &FDReader{ + fd: fd, + maxReadBytes: uint32(min(maxReadBytes, uint64(linux.MAX_RW_COUNT))), + maxRanges: uint32(min(maxRanges, hostfd.MaxReadWriteIov)), + q: aio.NewGoQueue(maxParallel), + inflight: make([]fdRead, maxParallel), + cs: make([]aio.Completion, 0, maxParallel), + } +} + +// Close implements AsyncReader.Close. +func (r *FDReader) Close() error { + r.q.Destroy() + return unix.Close(int(r.fd)) +} + +// MaxReadBytes implements AsyncReader.MaxReadBytes. +func (r *FDReader) MaxReadBytes() uint64 { + return uint64(r.maxReadBytes) +} + +// MaxRanges implements AsyncReader.MaxRanges. +func (r *FDReader) MaxRanges() int { + return int(r.maxRanges) +} + +// MaxParallel implements AsyncReader.MaxParallel. +func (r *FDReader) MaxParallel() int { + return len(r.inflight) +} + +// AddRead implements AsyncReader.AddRead. +func (r *FDReader) AddRead(id int, off int64, _ DestinationFile, _ memmap.FileRange, dstMap []byte) { + aio.Read(r.q, uint64(id), r.fd, off, dstMap) + r.inflight[id] = fdRead{ + off: off, + total: uint64(len(dstMap)), + dst: LocalClientMapping(dstMap), + } +} + +// AddReadv implements AsyncReader.AddReadv. +func (r *FDReader) AddReadv(id int, off int64, total uint64, _ DestinationFile, _ []memmap.FileRange, dstMaps []unix.Iovec) { + aio.Readv(r.q, uint64(id), r.fd, off, dstMaps) + r.inflight[id] = fdRead{ + off: off, + total: total, + dst: LocalClientMappings(dstMaps), + } +} + +// Wait implements AsyncReader.Wait. 
+func (r *FDReader) Wait(cs []Completion, minCompletions int) ([]Completion, error) { +retry: + numCompletions := 0 + aioCS, err := r.q.Wait(r.cs, minCompletions) + for _, aioC := range aioCS { + id := int(aioC.ID) + inflight := &r.inflight[id] + switch { + case aioC.Result < 0: + cs = append(cs, Completion{ + ID: id, + N: inflight.done, + Err: aioC.Err(), + }) + numCompletions++ + case aioC.Result == 0: + cs = append(cs, Completion{ + ID: id, + N: inflight.done, + Err: io.EOF, + }) + numCompletions++ + default: + n := uint64(aioC.Result) + done := inflight.done + n + if done == inflight.total { + cs = append(cs, Completion{ + ID: id, + N: done, + }) + numCompletions++ + } else { + // Need to continue the read to get a full read or error. + inflight.off += int64(n) + inflight.done = done + inflight.dst = inflight.dst.DropFirst(n) + if inflight.dst.Mapping != nil { + aio.Read(r.q, aioC.ID, r.fd, inflight.off, inflight.dst.Mapping) + } else { + aio.Readv(r.q, aioC.ID, r.fd, inflight.off, inflight.dst.Iovecs) + } + // Since r.q is an aio.GoQueue, aio.Read/Readv() => + // aio.GoQueue.Add() allows the enqueued read to execute + // immediately, so we don't need to call r.q.Wait() again + // unless we no longer have enough completions. + } + } + } + if numCompletions < minCompletions { + minCompletions -= numCompletions + goto retry + } + return cs, err +} diff --git a/pkg/sentry/state/stateio/stateio.go b/pkg/sentry/state/stateio/stateio.go new file mode 100644 index 0000000000..a3529cae1f --- /dev/null +++ b/pkg/sentry/state/stateio/stateio.go @@ -0,0 +1,164 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package stateio defines I/O types used by sentry save/restore. +package stateio + +import ( + "golang.org/x/sys/unix" + "gvisor.dev/gvisor/pkg/sentry/memmap" +) + +// AsyncReader represents a file supporting asynchronous random reads. +// +// MaxReadBytes, MaxRanges, MaxParallel, and NeedRegisterDestinationFD may be +// called concurrently. Only one goroutine may call RegisterDestinationFD at a +// time. Only one goroutine may call any of AddRead, AddReadv, or Wait at a +// time. However, RegisterDestinationFD and any one of AddRead, AddReadv, or +// Wait may be called concurrently. Close may not be called concurrently with +// any other methods, and no methods may be called after Close. +type AsyncReader interface { + // Close cancels inflight reads if possible, waits for uncanceled inflight + // reads to complete, and then releases resources owned by the AsyncReader. + Close() error + + // MaxReadBytes returns the maximum length of each read in bytes, which + // must be strictly positive. All calls to MaxReadBytes for a given + // AsyncReader must return the same value. + // + // Implementations should return the largest read size that is efficient, + // rather than the largest read size that is implementable, allowing + // callers to treat MaxReadBytes() as a target. 
+	MaxReadBytes() uint64
+
+	// MaxRanges returns the maximum number of FileRanges and Iovecs that may
+	// be passed in calls to AddReadv, which must be strictly positive. All
+	// calls to MaxRanges for a given AsyncReader must return the same value.
+	MaxRanges() int
+
+	// MaxParallel returns the maximum number of parallel reads that may be
+	// enqueued on this file, which must be strictly positive. All calls to
+	// MaxParallel for a given AsyncReader must return the same value.
+	MaxParallel() int
+
+	// NeedRegisterDestinationFD returns true if RegisterDestinationFD must be
+	// called to obtain DestinationFiles for read destinations. If
+	// NeedRegisterDestinationFD returns false, callers may pass a nil
+	// DestinationFile to AddRead and AddReadv, an empty FileRange to AddRead,
+	// and a nil FileRange slice to AddReadv. All calls to
+	// NeedRegisterDestinationFD for a given AsyncReader must return the same
+	// value.
+	//
+	// This feature exists to support implementations of AsyncReader in which
+	// reads take place in external processes. Implementations of AsyncReader
+	// that don't require this can embed NoRegisterClientFD to obtain an
+	// appropriate implementation of NeedRegisterDestinationFD and
+	// RegisterDestinationFD.
+	NeedRegisterDestinationFD() bool
+
+	// RegisterDestinationFD makes the first size bytes of the given host file
+	// descriptor a valid destination for reads from this file, and returns a
+	// DestinationFile representing it. The returned DestinationFile can only
+	// be used with the AsyncReader that returned it. fd does not need to
+	// remain valid beyond the call to RegisterDestinationFD.
+	//
+	// There is no way to unregister individual DestinationFiles; all
+	// DestinationFiles are invalidated by AsyncReader.Close.
+	//
+	// It is safe, though unnecessary, to call RegisterDestinationFD even if
+	// NeedRegisterDestinationFD returns false.
+	RegisterDestinationFD(fd int32, size uint64, settings []ClientFileRangeSetting) (DestinationFile, error)
+
+	// AddRead enqueues a read of size dstFR.Length() bytes, from the file
+	// starting at the given offset, to dstFile starting at dstFR.Start. dstMap
+	// must be a mapping of dstFR.
+	//
+	// Note that some AsyncReader implementations may not begin execution of
+	// enqueued reads until the following call to Wait.
+	//
+	// Preconditions:
+	// - 0 <= id < MaxParallel().
+	// - id must not be in use by any inflight read.
+	// - 0 < dstFR.Length() <= MaxReadBytes().
+	// - No call to Wait has returned a non-nil error.
+	AddRead(id int, off int64, dstFile DestinationFile, dstFR memmap.FileRange, dstMap []byte)
+
+	// AddReadv enqueues a read of size total, from the file starting at the
+	// given offset, to the dstFile ranges in dstFRs. dstMaps must be a mapping
+	// of dstFRs. The AsyncReader may retain dstFRs and dstMaps until the
+	// corresponding completion is returned by Wait; neither the caller nor the
+	// AsyncReader may mutate dstFRs or dstMaps during this time.
+	//
+	// Note that some AsyncReader implementations may not begin execution of
+	// enqueued reads until the following call to Wait.
+	//
+	// Preconditions:
+	// - 0 <= id < MaxParallel().
+	// - id must not be in use by any inflight read.
+	// - 0 < total <= MaxReadBytes().
+	// - total == the sum of FileRange.Length() over dstFRs.
+	// - No FileRange in dstFRs may have length 0.
+	// - No call to Wait has returned a non-nil error.
+ AddReadv(id int, off int64, total uint64, dstFile DestinationFile, dstFRs []memmap.FileRange, dstMaps []unix.Iovec) + + // Wait waits for at least minCompletions enqueued reads to complete, + // appends information for completed reads to cs, and returns the updated + // slice. + // + // Preconditions: + // - minCompletions <= the number of inflight reads. + // - No call to Wait has returned a non-nil error. + Wait(cs []Completion, minCompletions int) ([]Completion, error) +} + +// DestinationFile represents a file that has been registered for reads from an +// AsyncReader. +type DestinationFile any + +// ClientFileRangeSetting specifies properties of a range in a DestinationFile. +type ClientFileRangeSetting struct { + memmap.FileRange + Property ClientFileRangeProperty +} + +// ClientFileRangeProperty is the type of ClientFileRangeSetting.Property. +type ClientFileRangeProperty int + +const ( + // PropertyInvalid ensures that the zero value of ClientFileRangeProperty + // is invalid. + PropertyInvalid ClientFileRangeProperty = iota + + // PropertyHugepage indicates that allocations in the given range should use + // huge pages. + PropertyHugepage + + // PropertyNoHugepage indicates that allocations in the given range should + // not use huge pages. + PropertyNoHugepage +) + +// Completion indicates the result of a completed I/O operation. +type Completion struct { + // ID is the ID passed to AsyncReader.AddRead or AsyncReader.AddReadv. + ID int + + // N is the number of bytes for which I/O was successfully performed. Err + // is the error that terminated I/O after N bytes. + // + // Invariant: If N is less than the number of bytes for which I/O was + // submitted, Err is non-nil. + N uint64 + Err error +} diff --git a/pkg/sentry/state/stateio/stateio_impl_util.go b/pkg/sentry/state/stateio/stateio_impl_util.go new file mode 100644 index 0000000000..cbe9791fa6 --- /dev/null +++ b/pkg/sentry/state/stateio/stateio_impl_util.go @@ -0,0 +1,118 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stateio + +import ( + "golang.org/x/sys/unix" +) + +// This file contains utilities for implementors of AsyncReader. + +// NoRegisterClientFD implements AsyncReader.NeedRegisterDestinationFD and +// AsyncReader.RegisterDestinationFD for implementations of AsyncReader that +// don't require client FD registration. +type NoRegisterClientFD struct{} + +// NeedRegisterDestinationFD implements AsyncReader.NeedRegisterDestinationFD. +func (NoRegisterClientFD) NeedRegisterDestinationFD() bool { + return false +} + +// RegisterDestinationFD implements AsyncReader.RegisterDestinationFD. 
+func (NoRegisterClientFD) RegisterDestinationFD(fd int32, size uint64, settings []ClientFileRangeSetting) (DestinationFile, error) { + return nil, nil +} + +// LocalClientRanges holds mappings as passed to AsyncReader.AddRead or +// AsyncReader.AddReadv, for use by implementations that ignore the +// DestinationFile and FileRanges and instead use only the provided mappings. +type LocalClientRanges struct { + // At most one of the following is non-nil: + Mapping []byte + Iovecs []unix.Iovec +} + +// LocalClientMapping returns a LocalClientRanges representing the given +// mapping. +func LocalClientMapping(m []byte) LocalClientRanges { + return LocalClientRanges{Mapping: m} +} + +// LocalClientMappings returns a LocalClientRanges representing the given +// mappings. +func LocalClientMappings(iovecs []unix.Iovec) LocalClientRanges { + return LocalClientRanges{Iovecs: iovecs} +} + +// NumMappings returns the number of mappings represented by r. +func (r *LocalClientRanges) NumMappings() int { + if r.Mapping != nil { + return 1 + } + return len(r.Iovecs) +} + +// Mappings iterates the mappings represented by r. +func (r *LocalClientRanges) Mappings(yield func(i int, m []byte) bool) { + if r.Mapping != nil { + yield(0, r.Mapping) + return + } + for i, iov := range r.Iovecs { + if !yield(i, SliceFromIovec(iov)) { + return + } + } +} + +// DropFirst returns a LocalClientRanges equivalent to r, but with the first n +// bytes omitted. If n >= the total number of bytes in r, DropFirst returns a +// LocalClientRanges representing no mappings. +func (r *LocalClientRanges) DropFirst(n uint64) (r2 LocalClientRanges) { + for i, m := range r.Mappings { + if uint64(len(m)) <= n { + n -= uint64(len(m)) + continue + } + m = m[n:] + n = 0 + if r2.Iovecs == nil && i+1 == r.NumMappings() { + r2.Mapping = m + return + } + r2.Iovecs = append(r2.Iovecs, unix.Iovec{ + Base: &m[0], + Len: uint64(len(m)), + }) + } + return +} + +// CompletionChanWait implements AsyncReader.Wait by receiving from a channel +// of completions. +func CompletionChanWait(ch <-chan Completion, cs []Completion, minCompletions int) ([]Completion, error) { + for minCompletions > 0 { + cs = append(cs, <-ch) + minCompletions-- + } + for { + select { + case c := <-ch: + cs = append(cs, c) + default: + return cs, nil + } + } +} diff --git a/pkg/sentry/state/stateio/stateio_util_unsafe.go b/pkg/sentry/state/stateio/stateio_util_unsafe.go new file mode 100644 index 0000000000..74dab992c1 --- /dev/null +++ b/pkg/sentry/state/stateio/stateio_util_unsafe.go @@ -0,0 +1,27 @@ +// Copyright 2025 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package stateio + +import ( + "unsafe" + + "golang.org/x/sys/unix" +) + +// SliceFromIovec returns a byte slice representing the memory described by +// iov. 
+func SliceFromIovec(iov unix.Iovec) []byte { + return unsafe.Slice(iov.Base, iov.Len) +} diff --git a/pkg/sentry/syscalls/linux/sys_splice.go b/pkg/sentry/syscalls/linux/sys_splice.go index 34313cd725..db30625c2e 100644 --- a/pkg/sentry/syscalls/linux/sys_splice.go +++ b/pkg/sentry/syscalls/linux/sys_splice.go @@ -41,8 +41,8 @@ func Splice(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, if count == 0 { return 0, nil, nil } - if count > int64(kernel.MAX_RW_COUNT) { - count = int64(kernel.MAX_RW_COUNT) + if count > int64(linux.MAX_RW_COUNT) { + count = int64(linux.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL @@ -187,8 +187,8 @@ func Tee(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, *k if count == 0 { return 0, nil, nil } - if count > int64(kernel.MAX_RW_COUNT) { - count = int64(kernel.MAX_RW_COUNT) + if count > int64(linux.MAX_RW_COUNT) { + count = int64(linux.MAX_RW_COUNT) } if count < 0 { return 0, nil, linuxerr.EINVAL @@ -336,8 +336,8 @@ func Sendfile(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintpt if count == 0 { return 0, nil, nil } - if count > int64(kernel.MAX_RW_COUNT) { - count = int64(kernel.MAX_RW_COUNT) + if count > int64(linux.MAX_RW_COUNT) { + count = int64(linux.MAX_RW_COUNT) } // Copy data. diff --git a/runsc/boot/BUILD b/runsc/boot/BUILD index fcef66cea7..5697b4e740 100644 --- a/runsc/boot/BUILD +++ b/runsc/boot/BUILD @@ -49,6 +49,7 @@ go_library( "//pkg/fspath", "//pkg/fsutil", "//pkg/gomaxprocs", + "//pkg/hostarch", "//pkg/hostos", "//pkg/log", "//pkg/memutil", @@ -103,6 +104,7 @@ go_library( "//pkg/sentry/socket/unix", "//pkg/sentry/socket/unix/transport", "//pkg/sentry/state", + "//pkg/sentry/state/stateio", "//pkg/sentry/strace", "//pkg/sentry/time", "//pkg/sentry/unimpl:unimplemented_syscall_go_proto", diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index c7fa1bfdab..b4c1c77127 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -29,6 +29,7 @@ import ( "gvisor.dev/gvisor/pkg/control/server" "gvisor.dev/gvisor/pkg/fd" "gvisor.dev/gvisor/pkg/fspath" + "gvisor.dev/gvisor/pkg/hostarch" "gvisor.dev/gvisor/pkg/log" "gvisor.dev/gvisor/pkg/sentry/control" "gvisor.dev/gvisor/pkg/sentry/fsimpl/erofs" @@ -37,6 +38,7 @@ import ( "gvisor.dev/gvisor/pkg/sentry/socket/netstack" "gvisor.dev/gvisor/pkg/sentry/socket/plugin" "gvisor.dev/gvisor/pkg/sentry/state" + "gvisor.dev/gvisor/pkg/sentry/state/stateio" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/timing" "gvisor.dev/gvisor/pkg/urpc" @@ -560,18 +562,33 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { fileIdx := 1 if o.HavePagesFile { - pagesMetadata, err := o.ReleaseFD(fileIdx) + pagesMetadataFD, err := o.ReleaseFD(fileIdx) if err != nil { return err } fileIdx++ - pagesFile, err := o.ReleaseFD(fileIdx) + pagesFileFD, err := o.ReleaseFD(fileIdx) if err != nil { return err } fileIdx++ + // //pkg/state/wire reads one byte at a time; buffer these reads to + // avoid making one syscall per read. For the state file, this + // buffering is handled by statefile.NewReader() => compressio.Reader + // or compressio.NewSimpleReader(). + pagesMetadata := stateio.NewBufioReadCloser(pagesMetadataFD) + // Provision one range per page, which is the most that + // pgalloc.MemoryFile save/restore can require. + // TODO: Make these parameters configurable via arguments to `runsc restore`. 
+ const ( + pagesFileMaxReadBytes = 256 * 1024 + pagesFileMaxRanges = pagesFileMaxReadBytes / hostarch.PageSize + pagesFileMaxParallel = 128 + ) + pagesFile := stateio.NewFDReader(int32(pagesFileFD.Release()), pagesFileMaxReadBytes, pagesFileMaxRanges, pagesFileMaxParallel) + // This immediately starts loading the main MemoryFile asynchronously. cm.restorer.asyncMFLoader = kernel.NewAsyncMFLoader(pagesMetadata, pagesFile, cm.restorer.mainMF, timer.Fork("PagesFileLoader")) }
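
Usage sketch (illustrative only, not part of the diff above): a minimal standalone program driving the stateio.AsyncReader contract introduced here via FDReader. The file path, buffer size, and the read-size/range/parallelism limits are assumptions chosen for the example; the stateio calls follow the interface and FDReader definitions in this change.

package main

import (
	"fmt"
	"log"

	"golang.org/x/sys/unix"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/state/stateio"
)

func main() {
	// Open an arbitrary file to stand in for a pages file (assumed path).
	fd, err := unix.Open("/tmp/pages-file", unix.O_RDONLY, 0)
	if err != nil {
		log.Fatalf("open: %v", err)
	}

	// NewFDReader takes ownership of fd; Close releases it and the queue.
	// 256 KiB max reads, 64 ranges per readv, and 8 parallel reads are
	// example values.
	r := stateio.NewFDReader(int32(fd), 256*1024, 64, 8)
	defer r.Close()

	// FDReader embeds NoRegisterClientFD, so NeedRegisterDestinationFD is
	// false and the destination-file arguments may be nil/zero.
	buf := make([]byte, 4096)
	r.AddRead(0 /* id */, 0 /* off */, nil, memmap.FileRange{}, buf)

	// Wait for the single enqueued read; each Completion reports the id, the
	// number of bytes read, and any error (e.g. io.EOF on a short file).
	cs, err := r.Wait(nil, 1)
	if err != nil {
		log.Fatalf("Wait: %v", err)
	}
	for _, c := range cs {
		fmt.Printf("read id=%d: %d bytes, err=%v\n", c.ID, c.N, c.Err)
	}
}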