Skip to content

Commit 8bc1566

Browse files
committed
Fix the filesystem collector's mountTimeout not taking effect. This also ensures that a mount point is deleted from stuckMounts once its statfs call completes successfully in the goroutine. prometheus#2903 prometheus#3058
Signed-off-by: joey <[email protected]>
1 parent c6fa86c commit 8bc1566

File tree

1 file changed

+45
-44
lines changed

1 file changed

+45
-44
lines changed

collector/filesystem_linux.go

+45-44
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,27 @@ func (c *filesystemCollector) GetStats() ([]filesystemStats, error) {
109109
return stats, nil
110110
}
111111

112+
func (c *filesystemCollector) mountWatcher(mountPoint string, buf *unix.Statfs_t, successCh chan struct{}, errCh chan error) {
113+
err := unix.Statfs(mountPoint, buf)
114+
defer func() {
115+
close(successCh)
116+
close(errCh)
117+
}()
118+
if err != nil {
119+
level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(mountPoint), "err", err)
120+
errCh <- err
121+
return
122+
}
123+
stuckMountsMtx.Lock()
124+
successCh <- struct{}{}
125+
// If the mount has been marked as stuck, unmark it and log it's recovery.
126+
if _, ok := stuckMounts[mountPoint]; ok {
127+
level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", mountPoint)
128+
delete(stuckMounts, mountPoint)
129+
}
130+
stuckMountsMtx.Unlock()
131+
}
132+
112133
func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemStats {
113134
var ro float64
114135
for _, option := range strings.Split(labels.options, ",") {
@@ -118,63 +139,43 @@ func (c *filesystemCollector) processStat(labels filesystemLabels) filesystemSta
118139
}
119140
}
120141

121-
success := make(chan struct{})
122-
go stuckMountWatcher(labels.mountPoint, success, c.logger)
123-
124142
buf := new(unix.Statfs_t)
125-
err := unix.Statfs(rootfsFilePath(labels.mountPoint), buf)
126-
stuckMountsMtx.Lock()
127-
close(success)
143+
success := make(chan struct{}, 1)
144+
errCh := make(chan error, 1)
128145

129-
// If the mount has been marked as stuck, unmark it and log its recovery.
130-
if _, ok := stuckMounts[labels.mountPoint]; ok {
131-
level.Debug(c.logger).Log("msg", "Mount point has recovered, monitoring will resume", "mountpoint", labels.mountPoint)
132-
delete(stuckMounts, labels.mountPoint)
133-
}
134-
stuckMountsMtx.Unlock()
146+
mountCheckTimer := time.NewTimer(*mountTimeout)
147+
defer mountCheckTimer.Stop()
135148

136-
if err != nil {
137-
labels.deviceError = err.Error()
138-
level.Debug(c.logger).Log("msg", "Error on statfs() system call", "rootfs", rootfsFilePath(labels.mountPoint), "err", err)
139-
return filesystemStats{
140-
labels: labels,
141-
deviceError: 1,
142-
ro: ro,
143-
}
144-
}
149+
go c.mountWatcher(labels.mountPoint, buf, success, errCh)
145150

146-
return filesystemStats{
147-
labels: labels,
148-
size: float64(buf.Blocks) * float64(buf.Bsize),
149-
free: float64(buf.Bfree) * float64(buf.Bsize),
150-
avail: float64(buf.Bavail) * float64(buf.Bsize),
151-
files: float64(buf.Files),
152-
filesFree: float64(buf.Ffree),
153-
ro: ro,
151+
res := filesystemStats{
152+
labels: labels,
153+
ro: ro,
154154
}
155-
}
156155

157-
// stuckMountWatcher listens on the given success channel and if the channel closes
158-
// then the watcher does nothing. If instead the timeout is reached, the
159-
// mount point that is being watched is marked as stuck.
160-
func stuckMountWatcher(mountPoint string, success chan struct{}, logger log.Logger) {
161-
mountCheckTimer := time.NewTimer(*mountTimeout)
162-
defer mountCheckTimer.Stop()
163156
select {
164157
case <-success:
165-
// Success
158+
res.size = float64(buf.Blocks) * float64(buf.Bsize)
159+
res.free = float64(buf.Bfree) * float64(buf.Bsize)
160+
res.avail = float64(buf.Bavail) * float64(buf.Bsize)
161+
res.files = float64(buf.Files)
162+
res.filesFree = float64(buf.Ffree)
163+
case err := <-errCh:
164+
labels.deviceError = err.Error()
165+
res.deviceError = 1
166166
case <-mountCheckTimer.C:
167167
// Timed out, mark mount as stuck
168168
stuckMountsMtx.Lock()
169-
select {
170-
case <-success:
171-
// Success came in just after the timeout was reached, don't label the mount as stuck
172-
default:
173-
level.Debug(logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", mountPoint)
174-
stuckMounts[mountPoint] = struct{}{}
175-
}
169+
level.Debug(c.logger).Log("msg", "Mount point timed out, it is being labeled as stuck and will not be monitored", "mountpoint", labels.mountPoint)
170+
stuckMounts[labels.mountPoint] = struct{}{}
176171
stuckMountsMtx.Unlock()
172+
labels.deviceError = "mountpoint timeout"
173+
res.deviceError = 1
174+
default:
175+
177176
}
177+
178+
return res
178179
}
179180

180181
func mountPointDetails(logger log.Logger) ([]filesystemLabels, error) {

0 commit comments

Comments
 (0)