Skip to content

Commit

Permalink
Fix lint
Browse files Browse the repository at this point in the history
lint
  • Loading branch information
DiegoTavares committed Dec 12, 2024
1 parent 4ccf61c commit 4d87bf6
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 50 deletions.
3 changes: 2 additions & 1 deletion rqd/rqd/rqconstants.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@
if config.has_option(__override_section, "BACKUP_CACHE_PATH"):
BACKUP_CACHE_PATH = config.get(__override_section, "BACKUP_CACHE_PATH")
if config.has_option(__override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS"):
BACKUP_CACHE_TIME_TO_LIVE_SECONDS = config.getint(__override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS")
BACKUP_CACHE_TIME_TO_LIVE_SECONDS = config.getint(
__override_section, "BACKUP_CACHE_TIME_TO_LIVE_SECONDS")

__docker_mounts = "docker.mounts"
__docker_config = "docker.config"
Expand Down
98 changes: 53 additions & 45 deletions rqd/rqd/rqcore.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@ def recoverCache(self):
# pylint: enable=no-member

running_frame.frameAttendantThread.start()
except:
# pylint: disable=broad-except
except Exception:
pass
# Ignore frames that got corrupted

Expand Down Expand Up @@ -1409,8 +1410,8 @@ def setup(self):
self.rqlog.waitForFile()
# pylint: disable=broad-except
except Exception as e:
err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e)
raise RuntimeError(err)
err = "Unable to write to %s due to %s" % (runFrame.log_dir_file, e)
raise RuntimeError(err)
finally:
rqd.rqutil.permissionsLow()

Expand All @@ -1422,7 +1423,7 @@ def run(self):
"""Thread initialization"""
if self.recovery_mode:
self.runRecovery()
return;
return

log.info("Monitor frame started for frameId=%s", self.frameId)

Expand Down Expand Up @@ -1459,6 +1460,7 @@ def run(self):
self.postFrameAction()

def postFrameAction(self):
"""Action to be executed after a frame completes its execution"""
self.rqCore.releaseCores(self.runFrame.num_cores,
self.runFrame.attributes.get('CPU_LIST'),
self.runFrame.attributes.get('GPU_LIST')
Expand Down Expand Up @@ -1496,6 +1498,11 @@ def recoverDocker(self):
self.__createEnvVariables()
self.__writeHeader()

tempStatFile = "%srqd-stat-%s-%s" % (self.rqCore.machine.getTempPath(),
frameInfo.frameId,
time.time())
self._tempLocations.append(tempStatFile)

try:
log_stream = None
with self.rqCore.docker_lock:
Expand Down Expand Up @@ -1573,7 +1580,8 @@ def recoverDocker(self):
frameInfo.exitSignal = 0

# Log frame completion info
log.warning("Frame %s.%s(%s) with pid %s finished on container %s with exitStatus %s %s ",
log.warning(
"Frame %s.%s(%s) with pid %s finished on container %s with exitStatus %s %s",
runFrame.job_name,
runFrame.frame_name,
frameInfo.frameId,
Expand All @@ -1596,43 +1604,43 @@ def recoverDocker(self):
self.__cleanup()

def runRecovery(self):
"""Recover a frame that was running before this instance started"""
if not self.recovery_mode:
return;

log.info("Monitor recovered frame started for frameId=%s", self.frameId)

runFrame = self.runFrame
run_on_docker = self.rqCore.docker is not None

# pylint: disable=too-many-nested-blocks
try:
self.setup()
# Store frame in cache and register servant
self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo)

if run_on_docker:
self.recoverDocker()
elif platform.system() == "Linux":
# TODO
pass
elif platform.system() == "Windows":
# TODO
pass
elif platform.system() == "Darwin":
# TODO
pass
else:
self.runUnknown()

# pylint: disable=broad-except
except Exception:
log.critical(
"Failed launchFrame: For %s due to: \n%s",
runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info())))
# Notifies the cuebot that there was an error launching
self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH
# Delay keeps the cuebot from spamming failing booking requests
time.sleep(10)
finally:
self.postFrameAction()
"""Recover a frame that was running before this instance started"""
if not self.recovery_mode:
return

log.info("Monitor recovered frame started for frameId=%s", self.frameId)

runFrame = self.runFrame
run_on_docker = self.rqCore.docker is not None

# pylint: disable=too-many-nested-blocks
try:
self.setup()
# Store frame in cache and register servant
self.rqCore.storeFrame(runFrame.frame_id, self.frameInfo)

if run_on_docker:
self.recoverDocker()
elif platform.system() == "Linux":
# TODO
pass
elif platform.system() == "Windows":
# TODO
pass
elif platform.system() == "Darwin":
# TODO
pass
else:
self.runUnknown()

# pylint: disable=broad-except
except Exception:
log.critical(
"Failed launchFrame: For %s due to: \n%s",
runFrame.frame_id, ''.join(traceback.format_exception(*sys.exc_info())))
# Notifies the cuebot that there was an error launching
self.frameInfo.exitStatus = rqd.rqconstants.EXITSTATUS_FOR_FAILED_LAUNCH
# Delay keeps the cuebot from spamming failing booking requests
time.sleep(10)
finally:
self.postFrameAction()
8 changes: 4 additions & 4 deletions rqd/tests/rqcore_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,19 +666,19 @@ def test_recoverCache_validBackup(self, attendant_patch):
num_cores = 4
)
running_frame = rqd.rqnetwork.RunningFrame(self.rqcore, frame)
self.rqcore.cores.idle_cores = 8
self.rqcore.storeFrame(frameId, running_frame)
self.rqcore.cores.idle_cores = 8
self.rqcore.cores.booked_cores = 0
self.rqcore.backupCache()
self.__cache = {}
self.rqcore._RqCore__cache = {}
self.rqcore.recoverCache()
self.assertIn('frame123', self.rqcore._RqCore__cache)
self.assertEqual(4, self.rqcore.cores.idle_cores)
self.assertEqual(4, self.rqcore.cores.booked_cores)

def test_recoverCache_invalidFrame(self):
"""Test recoverCache handles a backup file that does not contain valid frame data"""
self.rqcore.backup_cache_path = 'cache.dat'
with open(self.rqcore.backup_cache_path, "w") as f:
with open(self.rqcore.backup_cache_path, "w", encoding='utf-8') as f:
f.write("this is not a run frame")

self.rqcore.recoverCache()
Expand Down

0 comments on commit 4d87bf6

Please sign in to comment.