From fcbad94b168eaead39ceff5e0f4c95e7415e19ed Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Sun, 24 Aug 2025 17:17:51 +0200 Subject: [PATCH 01/19] retry missing jobs first --- .../localci/LocalCIEventListenerService.java | 25 ++++++++++++++++--- .../localci/LocalCITriggerService.java | 21 +++++++++++++--- .../icl/LocalCIIntegrationTest.java | 7 +++++- 3 files changed, 45 insertions(+), 8 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index 302398851103..a6321d0b9102 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -25,6 +25,8 @@ import de.tum.cit.aet.artemis.communication.service.notifications.MailService; import de.tum.cit.aet.artemis.core.domain.User; import de.tum.cit.aet.artemis.core.service.user.UserService; +import de.tum.cit.aet.artemis.exercise.repository.ParticipationRepository; +import de.tum.cit.aet.artemis.programming.domain.ProgrammingExerciseParticipation; import de.tum.cit.aet.artemis.programming.domain.build.BuildJob; import de.tum.cit.aet.artemis.programming.domain.build.BuildStatus; import de.tum.cit.aet.artemis.programming.dto.SubmissionProcessingDTO; @@ -61,14 +63,21 @@ public class LocalCIEventListenerService { private final MailService mailService; + private final LocalCITriggerService localCITriggerService; + + private final ParticipationRepository participationRepository; + public LocalCIEventListenerService(DistributedDataAccessService distributedDataAccessService, LocalCIQueueWebsocketService localCIQueueWebsocketService, - BuildJobRepository buildJobRepository, ProgrammingMessagingService programmingMessagingService, UserService userService, MailService mailService) { + BuildJobRepository 
buildJobRepository, ProgrammingMessagingService programmingMessagingService, UserService userService, MailService mailService, + LocalCITriggerService localCITriggerService, ParticipationRepository participationRepository) { this.distributedDataAccessService = distributedDataAccessService; this.localCIQueueWebsocketService = localCIQueueWebsocketService; this.buildJobRepository = buildJobRepository; this.programmingMessagingService = programmingMessagingService; this.userService = userService; this.mailService = mailService; + this.localCITriggerService = localCITriggerService; + this.participationRepository = participationRepository; } /** @@ -124,11 +133,21 @@ public void checkPendingBuildJobsStatus() { continue; } log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); - // If the build job is in an unknown state, set it to missing and update the build start date - buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); + if (buildJob.getRetryCount() < 3) { + retryBuildJob(buildJob); + } + else { + buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); + } } } + private void retryBuildJob(BuildJob buildJob) { + var participation = participationRepository.findByIdElseThrow(buildJob.getParticipationId()); + localCITriggerService.triggerBuild((ProgrammingExerciseParticipation) participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), + buildJob.getRetryCount() + 1); + } + private boolean checkIfBuildJobIsStillBuilding(List processingJobIds, String buildJobId) { return processingJobIds.contains(buildJobId); } diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java index 56cf6fc9a53f..3f6ee0090499 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java +++ 
b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java @@ -137,7 +137,7 @@ public LocalCITriggerService(DistributedDataAccessService distributedDataAccessS */ @Override public void triggerBuild(ProgrammingExerciseParticipation participation, boolean triggerAll) throws LocalCIException { - triggerBuild(participation, null, null, triggerAll); + triggerBuild(participation, null, null, triggerAll, 0); } /** @@ -150,10 +150,23 @@ public void triggerBuild(ProgrammingExerciseParticipation participation, boolean */ @Override public void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo) throws LocalCIException { - triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false); + triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false, 0); } - private void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, boolean triggerAll) + /** + * Add a new build job item containing all relevant information necessary for the execution to the distributed build job queue. + * + * @param participation the participation of the repository which should be built and tested + * @param commitHashToBuild the commit hash of the commit that triggers the build. If it is null, the latest commit of the default branch will be built. + * @param triggeredByPushTo type of the repository that was pushed to and triggered the build job + * @param retryCount how often the build has been retried after it went missing + * @throws LocalCIException if the build job could not be added to the queue. 
+ */ + public void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, int retryCount) throws LocalCIException { + triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false, retryCount); + } + + private void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, boolean triggerAll, int retryCount) throws LocalCIException { log.info("Triggering build for participation {} and commit hash {}", participation.getId(), commitHashToBuild); @@ -207,7 +220,7 @@ else if (triggeredByPushTo.equals(RepositoryType.TESTS)) { BuildAgentDTO buildAgent = new BuildAgentDTO(null, null, null); BuildJobQueueItem buildJobQueueItem = new BuildJobQueueItem(buildJobId, participation.getBuildPlanId(), buildAgent, participation.getId(), courseId, - programmingExercise.getId(), 0, priority, null, repositoryInfo, jobTimingInfo, buildConfig, null); + programmingExercise.getId(), retryCount, priority, null, repositoryInfo, jobTimingInfo, buildConfig, null); // Save the build job before adding it to the queue to ensure it exists in the database. 
// This prevents potential race conditions where a build agent pulls the job from the queue very quickly before it is persisted, diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 9f7b6381b9be..70e7765b3b48 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -301,7 +301,6 @@ void testBuildJobTimeoutPersistence() { } } - @Disabled @Test @WithMockUser(username = TEST_PREFIX + "student1", roles = "USER") void testMissingBuildJobCheck() { @@ -330,6 +329,12 @@ void testMissingBuildJobCheck() { localCIEventListenerService.checkPendingBuildJobsStatus(); + hazelcastInstance.getQueue("buildJobQueue").clear(); + buildJob.setRetryCount(3); + buildJobRepository.save(buildJob); + + localCIEventListenerService.checkPendingBuildJobsStatus(); + buildJobOptional = buildJobRepository.findFirstByParticipationIdOrderByBuildStartDateDesc(studentParticipation.getId()); buildJob = buildJobOptional.orElseThrow(); assertThat(buildJob.getBuildStatus()).isEqualTo(BuildStatus.MISSING); From e4e858ac828ac4eca06b197fd99dcc42213570aa Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Mon, 25 Aug 2025 11:01:01 +0200 Subject: [PATCH 02/19] info log --- .../service/localci/LocalCIEventListenerService.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index a6321d0b9102..62940abf1354 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -134,7 +134,7 @@ public void 
checkPendingBuildJobsStatus() { } log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); if (buildJob.getRetryCount() < 3) { - retryBuildJob(buildJob); + retriggerBuildJob(buildJob); } else { buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); @@ -142,7 +142,8 @@ public void checkPendingBuildJobsStatus() { } } - private void retryBuildJob(BuildJob buildJob) { + private void retriggerBuildJob(BuildJob buildJob) { + log.info("Retriggering build job with id {} (retry count: {})", buildJob.getBuildJobId(), buildJob.getRetryCount() + 1); var participation = participationRepository.findByIdElseThrow(buildJob.getParticipationId()); localCITriggerService.triggerBuild((ProgrammingExerciseParticipation) participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), buildJob.getRetryCount() + 1); From 5abadad787ce254375799a3eba0211a8f4ca7799 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Mon, 25 Aug 2025 18:19:49 +0200 Subject: [PATCH 03/19] add extra schedule for retry of missing jobs --- .../localci/LocalCIEventListenerService.java | 22 +++++++++---------- .../localci/LocalCITriggerService.java | 5 +++++ .../icl/LocalCIIntegrationTest.java | 6 ----- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index 62940abf1354..f783cef77de7 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -133,20 +133,20 @@ public void checkPendingBuildJobsStatus() { continue; } log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); - if (buildJob.getRetryCount() < 3) { - retriggerBuildJob(buildJob); - } - else { - 
buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); - } + buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); } } - private void retriggerBuildJob(BuildJob buildJob) { - log.info("Retriggering build job with id {} (retry count: {})", buildJob.getBuildJobId(), buildJob.getRetryCount() + 1); - var participation = participationRepository.findByIdElseThrow(buildJob.getParticipationId()); - localCITriggerService.triggerBuild((ProgrammingExerciseParticipation) participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), - buildJob.getRetryCount() + 1); + @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS) + public void retryMissingJobs() { + log.debug("Checking for missing build jobs"); + List missingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.MISSING)); + for (BuildJob buildJob : missingBuildJobs) { + if (buildJob.getRetryCount() >= 3) { + continue; + } + localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId())); + } } private boolean checkIfBuildJobIsStillBuilding(List processingJobIds, String buildJobId) { diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java index 3f6ee0090499..9f7f33572d72 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java @@ -153,6 +153,11 @@ public void triggerBuild(ProgrammingExerciseParticipation participation, String triggerBuild(participation, commitHashToBuild, 
triggeredByPushTo, false, 0); } + public void retryBuildJob(BuildJob buildJob, ProgrammingExerciseParticipation participation) { + log.info("Retrying build for missing build job with id {} (retry count: {})", buildJob.getBuildJobId(), buildJob.getRetryCount() + 1); + triggerBuild(participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), buildJob.getRetryCount() + 1); + } + /** * Add a new build job item containing all relevant information necessary for the execution to the distributed build job queue. * diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 70e7765b3b48..76d76fb32b49 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -329,12 +329,6 @@ void testMissingBuildJobCheck() { localCIEventListenerService.checkPendingBuildJobsStatus(); - hazelcastInstance.getQueue("buildJobQueue").clear(); - buildJob.setRetryCount(3); - buildJobRepository.save(buildJob); - - localCIEventListenerService.checkPendingBuildJobsStatus(); - buildJobOptional = buildJobRepository.findFirstByParticipationIdOrderByBuildStartDateDesc(studentParticipation.getId()); buildJob = buildJobOptional.orElseThrow(); assertThat(buildJob.getBuildStatus()).isEqualTo(BuildStatus.MISSING); From a76ea4c6738613d1b750e5ac477c3516f6bc9845 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Mon, 25 Aug 2025 18:35:16 +0200 Subject: [PATCH 04/19] delay first schedule --- .../service/localci/LocalCIEventListenerService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index f783cef77de7..d4338c4bd463 100644 --- 
a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -137,7 +137,7 @@ public void checkPendingBuildJobsStatus() { } } - @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS) + @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:120}", timeUnit = TimeUnit.SECONDS) public void retryMissingJobs() { log.debug("Checking for missing build jobs"); List missingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.MISSING)); From 3ef9b357ed29ec086161ca0d2469e99093cf1eb7 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Tue, 26 Aug 2025 19:46:50 +0200 Subject: [PATCH 05/19] retry only batch of jobs --- .../repository/BuildJobRepository.java | 20 +++++++++++ .../localci/LocalCIEventListenerService.java | 34 ++++++++++++++++--- 2 files changed, 49 insertions(+), 5 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java index 800d8afd4615..f0ad1e1e8635 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java @@ -184,4 +184,24 @@ void updateBuildJobStatusWithBuildStartDate(@Param("buildJobId") String buildJob * @return the list of build jobs */ List findAllByBuildStatusIn(List statuses); + + /** + * Find all build jobs with the given build statuses in the given time range, ordered by submission date descending. 
+ * + * @param statuses the list of build statuses + * @param startTime earliest build submission time + * @param endTime latest build submission time + * @param pageable pagination information + * @return the list of build jobs + */ + @Query(""" + SELECT b + FROM BuildJob b + WHERE b.buildStatus IN :statuses + AND b.buildSubmissionDate >= :startTime + AND b.buildSubmissionDate <= :endTime + ORDER BY b.buildSubmissionDate DESC + """) + Slice findJobsByStatusesInTimeRange(@Param("statuses") List statuses, @Param("startTime") ZonedDateTime startTime, + @Param("endTime") ZonedDateTime endTime, Pageable pageable); } diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index d4338c4bd463..29ca7646ec03 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -11,6 +11,9 @@ import org.slf4j.LoggerFactory; import org.springframework.context.annotation.Lazy; import org.springframework.context.annotation.Profile; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Pageable; +import org.springframework.data.domain.Slice; import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; @@ -133,19 +136,40 @@ public void checkPendingBuildJobsStatus() { continue; } log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); + // If the build job is in an unknown state, set it to missing and update the build start date buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); } } - @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = 
"${artemis.continuous-integration.check-job-status-delay-seconds:120}", timeUnit = TimeUnit.SECONDS) + // todo move other service maybe? + @Scheduled(fixedRateString = "${artemis.continuous-integration.retry-missing-jobs-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.retry-missing-jobs-delay-seconds:120}", timeUnit = TimeUnit.SECONDS) public void retryMissingJobs() { - log.debug("Checking for missing build jobs"); - List missingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.MISSING)); - for (BuildJob buildJob : missingBuildJobs) { + log.debug("Checking for missing build jobs to retry"); + Pageable pageable = PageRequest.of(0, 50); + ZonedDateTime now = ZonedDateTime.now(); + ZonedDateTime oneHourAgo = now.minusHours(1); + + Slice missingJobSlice = buildJobRepository.findJobsByStatusesInTimeRange(List.of(BuildStatus.MISSING), oneHourAgo, now, pageable); + List missingJobs = missingJobSlice.getContent(); + log.debug("Processing {} missing build jobs to retry", missingJobs.size()); + + for (BuildJob buildJob : missingJobSlice.getContent()) { if (buildJob.getRetryCount() >= 3) { + log.warn("Build job with id {} for participation {} has reached the maximum number of retries and will not be retried.", buildJob.getBuildJobId(), + buildJob.getParticipationId()); continue; } - localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId())); + + try { + localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId())); + } + catch (Exception e) { + log.error("Failed to retry build job with id {} for participation {}", buildJob.getBuildJobId(), buildJob.getParticipationId(), e); + } + } + + if (missingJobSlice.hasNext()) { + log.debug("There are more missing jobs to process in the next scheduled run."); } } From 
3f5871108a93bfa307e6c7583808f904e9658c9c Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Tue, 26 Aug 2025 20:40:59 +0200 Subject: [PATCH 06/19] move missing job schedules to own service --- .../localci/LocalCIEventListenerService.java | 104 +----------- .../localci/LocalCIMissingJobService.java | 148 ++++++++++++++++++ ...mingIntegrationLocalCILocalVCTestBase.java | 4 + .../icl/LocalCIIntegrationTest.java | 2 +- 4 files changed, 154 insertions(+), 104 deletions(-) create mode 100644 src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java index 29ca7646ec03..cf1233454ef7 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIEventListenerService.java @@ -1,9 +1,7 @@ package de.tum.cit.aet.artemis.programming.service.localci; import java.time.ZonedDateTime; -import java.util.List; import java.util.Optional; -import java.util.concurrent.TimeUnit; import jakarta.annotation.PostConstruct; @@ -11,10 +9,6 @@ import org.slf4j.LoggerFactory; import org.springframework.context.annotation.Lazy; import org.springframework.context.annotation.Profile; -import org.springframework.data.domain.PageRequest; -import org.springframework.data.domain.Pageable; -import org.springframework.data.domain.Slice; -import org.springframework.scheduling.annotation.Scheduled; import org.springframework.stereotype.Service; import com.hazelcast.collection.ItemEvent; @@ -28,9 +22,6 @@ import de.tum.cit.aet.artemis.communication.service.notifications.MailService; import de.tum.cit.aet.artemis.core.domain.User; import de.tum.cit.aet.artemis.core.service.user.UserService; -import 
de.tum.cit.aet.artemis.exercise.repository.ParticipationRepository; -import de.tum.cit.aet.artemis.programming.domain.ProgrammingExerciseParticipation; -import de.tum.cit.aet.artemis.programming.domain.build.BuildJob; import de.tum.cit.aet.artemis.programming.domain.build.BuildStatus; import de.tum.cit.aet.artemis.programming.dto.SubmissionProcessingDTO; import de.tum.cit.aet.artemis.programming.repository.BuildJobRepository; @@ -66,21 +57,14 @@ public class LocalCIEventListenerService { private final MailService mailService; - private final LocalCITriggerService localCITriggerService; - - private final ParticipationRepository participationRepository; - public LocalCIEventListenerService(DistributedDataAccessService distributedDataAccessService, LocalCIQueueWebsocketService localCIQueueWebsocketService, - BuildJobRepository buildJobRepository, ProgrammingMessagingService programmingMessagingService, UserService userService, MailService mailService, - LocalCITriggerService localCITriggerService, ParticipationRepository participationRepository) { + BuildJobRepository buildJobRepository, ProgrammingMessagingService programmingMessagingService, UserService userService, MailService mailService) { this.distributedDataAccessService = distributedDataAccessService; this.localCIQueueWebsocketService = localCIQueueWebsocketService; this.buildJobRepository = buildJobRepository; this.programmingMessagingService = programmingMessagingService; this.userService = userService; this.mailService = mailService; - this.localCITriggerService = localCITriggerService; - this.participationRepository = participationRepository; } /** @@ -95,92 +79,6 @@ public void init() { distributedDataAccessService.getDistributedBuildAgentInformation().addEntryListener(new BuildAgentListener(), true); } - /** - * Periodically checks the status of pending build jobs and updates their status if they are missing. - *

- * This scheduled task ensures that build jobs which are stuck in the QUEUED or BUILDING state for too long - * are detected and marked as MISSING if their status cannot be verified. This helps prevent indefinite - * waiting states due to external failures or inconsistencies in the CI system. - *

- *

- * This mechanism is necessary because build jobs are managed externally, and various failure scenarios - * can lead to jobs being lost without Artemis being notified: - *

- *
    - *
  • Application crashes or restarts while build job was queued
  • - *
  • network issues leading to Hazelcast data loss
  • - *
  • Build agent crashes or is disconnected
  • - *
- */ - @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS) - public void checkPendingBuildJobsStatus() { - log.debug("Checking pending build jobs status"); - List pendingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.QUEUED, BuildStatus.BUILDING)); - ZonedDateTime now = ZonedDateTime.now(); - final int buildJobExpirationInMinutes = 5; // If a build job is older than 5 minutes, and it's status can't be determined, set it to missing - - var queuedJobs = distributedDataAccessService.getQueuedJobs(); - var processingJobs = distributedDataAccessService.getProcessingJobIds(); - - for (BuildJob buildJob : pendingBuildJobs) { - if (buildJob.getBuildSubmissionDate().isAfter(now.minusMinutes(buildJobExpirationInMinutes))) { - log.debug("Build job with id {} is too recent to check", buildJob.getBuildJobId()); - continue; - } - if (buildJob.getBuildStatus() == BuildStatus.QUEUED && checkIfBuildJobIsStillQueued(queuedJobs, buildJob.getBuildJobId())) { - log.debug("Build job with id {} is still queued", buildJob.getBuildJobId()); - continue; - } - if (checkIfBuildJobIsStillBuilding(processingJobs, buildJob.getBuildJobId())) { - log.debug("Build job with id {} is still building", buildJob.getBuildJobId()); - continue; - } - log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); - // If the build job is in an unknown state, set it to missing and update the build start date - buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); - } - } - - // todo move other service maybe? 
- @Scheduled(fixedRateString = "${artemis.continuous-integration.retry-missing-jobs-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.retry-missing-jobs-delay-seconds:120}", timeUnit = TimeUnit.SECONDS) - public void retryMissingJobs() { - log.debug("Checking for missing build jobs to retry"); - Pageable pageable = PageRequest.of(0, 50); - ZonedDateTime now = ZonedDateTime.now(); - ZonedDateTime oneHourAgo = now.minusHours(1); - - Slice missingJobSlice = buildJobRepository.findJobsByStatusesInTimeRange(List.of(BuildStatus.MISSING), oneHourAgo, now, pageable); - List missingJobs = missingJobSlice.getContent(); - log.debug("Processing {} missing build jobs to retry", missingJobs.size()); - - for (BuildJob buildJob : missingJobSlice.getContent()) { - if (buildJob.getRetryCount() >= 3) { - log.warn("Build job with id {} for participation {} has reached the maximum number of retries and will not be retried.", buildJob.getBuildJobId(), - buildJob.getParticipationId()); - continue; - } - - try { - localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId())); - } - catch (Exception e) { - log.error("Failed to retry build job with id {} for participation {}", buildJob.getBuildJobId(), buildJob.getParticipationId(), e); - } - } - - if (missingJobSlice.hasNext()) { - log.debug("There are more missing jobs to process in the next scheduled run."); - } - } - - private boolean checkIfBuildJobIsStillBuilding(List processingJobIds, String buildJobId) { - return processingJobIds.contains(buildJobId); - } - - private boolean checkIfBuildJobIsStillQueued(List queuedJobs, String buildJobId) { - return queuedJobs.stream().anyMatch(job -> job.id().equals(buildJobId)); - } - private class QueuedBuildJobItemListener implements ItemListener { @Override diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java 
b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java new file mode 100644 index 000000000000..9d85f0814856 --- /dev/null +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ -0,0 +1,148 @@ +package de.tum.cit.aet.artemis.programming.service.localci; + +import java.time.ZonedDateTime; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Lazy; +import org.springframework.context.annotation.Profile; +import org.springframework.data.domain.PageRequest; +import org.springframework.data.domain.Pageable; +import org.springframework.data.domain.Slice; +import org.springframework.scheduling.annotation.Scheduled; +import org.springframework.stereotype.Service; + +import de.tum.cit.aet.artemis.buildagent.dto.BuildJobQueueItem; +import de.tum.cit.aet.artemis.exercise.repository.ParticipationRepository; +import de.tum.cit.aet.artemis.programming.domain.ProgrammingExerciseParticipation; +import de.tum.cit.aet.artemis.programming.domain.build.BuildJob; +import de.tum.cit.aet.artemis.programming.domain.build.BuildStatus; +import de.tum.cit.aet.artemis.programming.repository.BuildJobRepository; + +/** + * Schedule service for detecting and retrying missing build jobs in the LocalCI system + */ +@Lazy +@Service +@Profile("localci & scheduling") +public class LocalCIMissingJobService { + + private static final Logger log = LoggerFactory.getLogger(LocalCIMissingJobService.class); + + private final BuildJobRepository buildJobRepository; + + private final LocalCITriggerService localCITriggerService; + + private final ParticipationRepository participationRepository; + + private final DistributedDataAccessService distributedDataAccessService; + + @Value("${artemis.continuous-integration.max-missing-job-retries:3}") + 
private int maxMissingJobRetries; + + public LocalCIMissingJobService(BuildJobRepository buildJobRepository, LocalCITriggerService localCITriggerService, ParticipationRepository participationRepository, + DistributedDataAccessService distributedDataAccessService) { + this.buildJobRepository = buildJobRepository; + this.localCITriggerService = localCITriggerService; + this.participationRepository = participationRepository; + this.distributedDataAccessService = distributedDataAccessService; + } + + /** + * Periodically checks the status of pending build jobs and updates their status if they are missing. + *

+ * This scheduled task ensures that build jobs which are stuck in the QUEUED or BUILDING state for too long + * are detected and marked as MISSING if their status cannot be verified. This helps prevent indefinite + * waiting states due to external failures or inconsistencies in the CI system. + *

+ *

+ * This mechanism is necessary because build jobs are managed externally, and various failure scenarios + * can lead to jobs being lost without Artemis being notified: + *

+ *
    + *
  • Application crashes or restarts while a build job was queued
  • + *
  • Network issues leading to Hazelcast data loss
  • + *
  • Build agent crashes or is disconnected
  • + *
+ */ + @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS) + public void checkPendingBuildJobsStatus() { + log.debug("Checking pending build jobs status"); + List pendingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.QUEUED, BuildStatus.BUILDING)); + ZonedDateTime now = ZonedDateTime.now(); + final int buildJobExpirationInMinutes = 5; // If a build job is older than 5 minutes, and it's status can't be determined, set it to missing + + var queuedJobs = distributedDataAccessService.getQueuedJobs(); + var processingJobs = distributedDataAccessService.getProcessingJobIds(); + + for (BuildJob buildJob : pendingBuildJobs) { + if (buildJob.getBuildSubmissionDate().isAfter(now.minusMinutes(buildJobExpirationInMinutes))) { + log.debug("Build job with id {} is too recent to check", buildJob.getBuildJobId()); + continue; + } + if (buildJob.getBuildStatus() == BuildStatus.QUEUED && checkIfBuildJobIsStillQueued(queuedJobs, buildJob.getBuildJobId())) { + log.debug("Build job with id {} is still queued", buildJob.getBuildJobId()); + continue; + } + if (checkIfBuildJobIsStillBuilding(processingJobs, buildJob.getBuildJobId())) { + log.debug("Build job with id {} is still building", buildJob.getBuildJobId()); + continue; + } + log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId()); + // If the build job is in an unknown state, set it to missing and update the build start date + buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING); + } + } + + /** + * Periodically retries missing build jobs. + * R + * retrieves a slice of missing build jobs from the last hour and attempts to retry them. + * If a build job has reached the maximum number of retries, it will not be retried again. 
+ */ + @Scheduled(fixedRateString = "${artemis.continuous-integration.retry-missing-jobs-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.retry-missing-jobs-delay-seconds:120}", timeUnit = TimeUnit.SECONDS) + public void retryMissingJobs() { + log.debug("Checking for missing build jobs to retry"); + + Slice missingJobsSlice = getMissingJobsSliceOfLastHour(50); + List missingJobs = missingJobsSlice.getContent(); + log.debug("Processing {} missing build jobs to retry", missingJobs.size()); + + for (BuildJob buildJob : missingJobs) { + if (buildJob.getRetryCount() >= maxMissingJobRetries) { + log.warn("Build job with id {} for participation {} has reached the maximum number of {} retries and will not be retried.", buildJob.getBuildJobId(), + maxMissingJobRetries, buildJob.getParticipationId()); + continue; + } + + try { + localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId())); + } + catch (Exception e) { + log.error("Failed to retry build job with id {} for participation {}", buildJob.getBuildJobId(), buildJob.getParticipationId(), e); + } + } + + if (missingJobsSlice.hasNext()) { + log.debug("There are more missing jobs to process in the next scheduled run."); + } + } + + private boolean checkIfBuildJobIsStillBuilding(List processingJobIds, String buildJobId) { + return processingJobIds.contains(buildJobId); + } + + private boolean checkIfBuildJobIsStillQueued(List queuedJobs, String buildJobId) { + return queuedJobs.stream().anyMatch(job -> job.id().equals(buildJobId)); + } + + private Slice getMissingJobsSliceOfLastHour(int maxResults) { + Pageable pageable = PageRequest.of(0, maxResults); + ZonedDateTime now = ZonedDateTime.now(); + ZonedDateTime oneHourAgo = now.minusHours(1); + return buildJobRepository.findJobsByStatusesInTimeRange(List.of(BuildStatus.MISSING), oneHourAgo, now, pageable); + } +} diff --git 
a/src/test/java/de/tum/cit/aet/artemis/programming/AbstractProgrammingIntegrationLocalCILocalVCTestBase.java b/src/test/java/de/tum/cit/aet/artemis/programming/AbstractProgrammingIntegrationLocalCILocalVCTestBase.java index 46d101f2dce9..ecfc79647df5 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/AbstractProgrammingIntegrationLocalCILocalVCTestBase.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/AbstractProgrammingIntegrationLocalCILocalVCTestBase.java @@ -34,6 +34,7 @@ import de.tum.cit.aet.artemis.programming.repository.VcsAccessLogRepository; import de.tum.cit.aet.artemis.programming.service.BuildLogEntryService; import de.tum.cit.aet.artemis.programming.service.ParticipationVcsAccessTokenService; +import de.tum.cit.aet.artemis.programming.service.localci.LocalCIMissingJobService; import de.tum.cit.aet.artemis.programming.service.localci.LocalCIResultService; import de.tum.cit.aet.artemis.programming.service.localci.LocalCITriggerService; import de.tum.cit.aet.artemis.programming.service.localvc.LocalVCServletService; @@ -112,6 +113,9 @@ protected void mockBuildAgentServices() { @Autowired protected LocalCITriggerService localCITriggerService; + @Autowired + protected LocalCIMissingJobService localCIMissingJobService; + @Autowired protected ParticipationVcsAccessTokenService participationVcsAccessTokenService; diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 76d76fb32b49..01d376bf8d8a 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -327,7 +327,7 @@ void testMissingBuildJobCheck() { hazelcastInstance.getQueue("buildJobQueue").clear(); - localCIEventListenerService.checkPendingBuildJobsStatus(); + localCIMissingJobService.checkPendingBuildJobsStatus(); buildJobOptional = 
buildJobRepository.findFirstByParticipationIdOrderByBuildStartDateDesc(studentParticipation.getId()); buildJob = buildJobOptional.orElseThrow(); From d9e1d2a7493944e1f79ea1dfa6d9a6b71be1f60f Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Tue, 26 Aug 2025 22:06:19 +0200 Subject: [PATCH 07/19] try isolated ci tests --- .../icl/LocalCIIntegrationTest.java | 50 +++++++++++++++++++ .../BuildJobTestRepository.java | 2 + 2 files changed, 52 insertions(+) diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 01d376bf8d8a..2e0c2153aabd 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -52,6 +52,7 @@ import org.junit.jupiter.api.TestInstance; import org.junit.jupiter.api.parallel.Execution; import org.junit.jupiter.api.parallel.ExecutionMode; +import org.junit.jupiter.api.parallel.Isolated; import org.mockito.ArgumentMatcher; import org.mockito.Mockito; import org.springframework.beans.factory.annotation.Value; @@ -105,6 +106,7 @@ // concurrently. For example, it prevents overloading the LocalCI's result processing system with too many build job results at the same time, which could lead to flaky tests // or timeouts. By keeping everything in the same thread, we maintain more predictable and stable test behavior, while not increasing the test execution time significantly. 
@Execution(ExecutionMode.SAME_THREAD) +@Isolated class LocalCIIntegrationTest extends AbstractProgrammingIntegrationLocalCILocalVCTestBase { private static final String TEST_PREFIX = "localciint"; @@ -132,6 +134,9 @@ protected String getTestPrefix() { @Value("${artemis.continuous-integration.build-agent.display-name:}") private String buildAgentDisplayName; + @Value("${artemis.continuous-integration.max-missing-job-retries:3}") + private int maxMissingJobRetries; + @BeforeAll void setupAll() { buildJobRepository.deleteAll(); @@ -337,6 +342,51 @@ void testMissingBuildJobCheck() { sharedQueueProcessingService.init(); } + @Test + @WithMockUser(username = TEST_PREFIX + "student1", roles = "USER") + void testMissingBuildJobRetry() { + ProgrammingExerciseStudentParticipation studentParticipation = localVCLocalCITestService.createParticipation(programmingExercise, student1Login); + processNewPush(commitHash, studentAssignmentRepository.remoteBareGitRepo.getRepository(), userTestRepository.getUserWithGroupsAndAuthorities()); + + await().until(() -> { + Optional buildJobOptional = buildJobRepository.findFirstByParticipationIdOrderByBuildStartDateDesc(studentParticipation.getId()); + return buildJobOptional.isPresent() && buildJobOptional.get().getBuildStatus() == BuildStatus.QUEUED; + }); + + BuildJob buildJob = buildJobRepository.findFirstByParticipationIdOrderByBuildStartDateDesc(studentParticipation.getId()).orElseThrow(); + buildJob.setBuildStatus(BuildStatus.MISSING); + buildJob.setBuildSubmissionDate(ZonedDateTime.now().minusMinutes(10)); + buildJobRepository.save(buildJob); + + localCIMissingJobService.retryMissingJobs(); + + // job for participation should be retried so retry count should be 1 and status QUEUED + await().until(() -> { + Optional buildJobOptional = buildJobRepository.findFirstByParticipationIdOrderByBuildJobIdDesc(buildJob.getParticipationId()); + if (buildJobOptional.isEmpty()) { + return false; + } + BuildJob retryedBuildJob = 
buildJobOptional.get(); + return retryedBuildJob.getBuildStatus() == BuildStatus.QUEUED && retryedBuildJob.getRetryCount() == 1; + }); + } + + @Test + void testMissingBuildJobRetryLimit() { + BuildJob buildJob = new BuildJob(); + buildJob.setBuildSubmissionDate(ZonedDateTime.now().minusMinutes(10)); + buildJob.setBuildStatus(BuildStatus.MISSING); + buildJob.setRetryCount(maxMissingJobRetries); + buildJob.setParticipationId(1L); + buildJobRepository.save(buildJob); + + localCIMissingJobService.retryMissingJobs(); + + buildJob = buildJobRepository.findFirstByParticipationIdOrderByBuildJobIdDesc(buildJob.getParticipationId()).orElseThrow(); + assertThat(buildJob.getBuildStatus()).isEqualTo(BuildStatus.MISSING); + assertThat(buildJob.getRetryCount()).isEqualTo(3); + } + @Test @WithMockUser(username = TEST_PREFIX + "student1", roles = "USER") void testInvalidLocalVCRepositoryUri() { diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/test_repository/BuildJobTestRepository.java b/src/test/java/de/tum/cit/aet/artemis/programming/test_repository/BuildJobTestRepository.java index 2746fe81ecb1..8980a3e6c989 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/test_repository/BuildJobTestRepository.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/test_repository/BuildJobTestRepository.java @@ -18,4 +18,6 @@ public interface BuildJobTestRepository extends BuildJobRepository { Optional findBuildJobByResult(Result result); Optional findFirstByParticipationIdOrderByBuildStartDateDesc(Long participationId); + + Optional findFirstByParticipationIdOrderByBuildJobIdDesc(Long participationId); } From 1d305bec1a282c43642f58c21495044e0e6327d0 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 4 Sep 2025 17:53:04 +0200 Subject: [PATCH 08/19] improve test --- .../programming/icl/LocalCIIntegrationTest.java | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git 
a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 2e0c2153aabd..5023fcad139c 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -330,7 +330,7 @@ void testMissingBuildJobCheck() { buildJob.setBuildSubmissionDate(ZonedDateTime.now().minusMinutes(6)); buildJobRepository.save(buildJob); - hazelcastInstance.getQueue("buildJobQueue").clear(); + queuedJobs.clear(); localCIMissingJobService.checkPendingBuildJobsStatus(); @@ -369,6 +369,8 @@ void testMissingBuildJobRetry() { BuildJob retryedBuildJob = buildJobOptional.get(); return retryedBuildJob.getBuildStatus() == BuildStatus.QUEUED && retryedBuildJob.getRetryCount() == 1; }); + processingJobs.clear(); + queuedJobs.clear(); } @Test @@ -378,13 +380,15 @@ void testMissingBuildJobRetryLimit() { buildJob.setBuildStatus(BuildStatus.MISSING); buildJob.setRetryCount(maxMissingJobRetries); buildJob.setParticipationId(1L); - buildJobRepository.save(buildJob); + buildJob = buildJobRepository.save(buildJob); localCIMissingJobService.retryMissingJobs(); - buildJob = buildJobRepository.findFirstByParticipationIdOrderByBuildJobIdDesc(buildJob.getParticipationId()).orElseThrow(); - assertThat(buildJob.getBuildStatus()).isEqualTo(BuildStatus.MISSING); - assertThat(buildJob.getRetryCount()).isEqualTo(3); + // latest build job for the participation should be the same because no retry over the limit + BuildJob latestJob = buildJobRepository.findFirstByParticipationIdOrderByBuildJobIdDesc(buildJob.getParticipationId()).orElseThrow(); + assertThat(latestJob.getBuildJobId()).isEqualTo(buildJob.getBuildJobId()); + assertThat(latestJob.getBuildStatus()).isEqualTo(BuildStatus.MISSING); + assertThat(latestJob.getRetryCount()).isEqualTo(maxMissingJobRetries); } @Test @@ -801,9 +805,8 @@ void 
testPauseAndResumeBuildAgent() { processNewPush(commitHash, studentAssignmentRepository.remoteBareGitRepo.getRepository(), userTestRepository.getUserWithGroupsAndAuthorities()); await().until(() -> { - IQueue buildQueue = hazelcastInstance.getQueue("buildJobQueue"); IMap buildJobMap = hazelcastInstance.getMap("processingJobs"); - BuildJobQueueItem buildJobQueueItem = buildQueue.peek(); + BuildJobQueueItem buildJobQueueItem = queuedJobs.peek(); return buildJobQueueItem != null && buildJobQueueItem.buildConfig().commitHashToBuild().equals(commitHash) && !buildJobMap.containsKey(buildJobQueueItem.id()); }); From 707b3eb26c9f8e222dd5899f9ec4566afa0b3dae Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 4 Sep 2025 22:42:28 +0200 Subject: [PATCH 09/19] guard query --- .../repository/BuildJobRepository.java | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java index f0ad1e1e8635..3b32dcb2265a 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java @@ -14,6 +14,7 @@ import org.springframework.context.annotation.Profile; import org.springframework.data.domain.Pageable; import org.springframework.data.domain.Slice; +import org.springframework.data.domain.SliceImpl; import org.springframework.data.jpa.repository.EntityGraph; import org.springframework.data.jpa.repository.JpaSpecificationExecutor; import org.springframework.data.jpa.repository.Modifying; @@ -202,6 +203,23 @@ void updateBuildJobStatusWithBuildStartDate(@Param("buildJobId") String buildJob AND b.buildSubmissionDate <= :endTime ORDER BY b.buildSubmissionDate DESC """) - Slice findJobsByStatusesInTimeRange(@Param("statuses") List statuses, @Param("startTime") ZonedDateTime startTime, + 
Slice queryFindJobsByStatusesInTimeRange(@Param("statuses") List statuses, @Param("startTime") ZonedDateTime startTime, @Param("endTime") ZonedDateTime endTime, Pageable pageable); + + /** + * Returns a slice of build jobs submitted within the given time range whose buildStatus is contained in the provided list, ordered by submission date descending. + * If the list of statuses is null or empty, an empty slice is returned. + * + * @param statuses list of build statuses; may be null or empty + * @param startTime earliest build submission time (inclusive) + * @param endTime latest build submission time (inclusive) + * @param pageable pagination information + * @return slice of matching build jobs + */ + default Slice findJobsByStatusesInTimeRange(List statuses, ZonedDateTime startTime, ZonedDateTime endTime, Pageable pageable) { + if (statuses == null || statuses.isEmpty()) { + return new SliceImpl<>(List.of(), pageable, false); + } + return queryFindJobsByStatusesInTimeRange(statuses, startTime, endTime, pageable); + } } From c49c3e59c789a62ce7dcd4cb3a3264c3adbdd365 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 4 Sep 2025 22:44:26 +0200 Subject: [PATCH 10/19] null check submissionDate --- .../programming/service/localci/LocalCIMissingJobService.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java index 9d85f0814856..9a6174d2c784 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ -79,7 +79,8 @@ public void checkPendingBuildJobsStatus() { var processingJobs = distributedDataAccessService.getProcessingJobIds(); for (BuildJob buildJob : pendingBuildJobs) { - if 
(buildJob.getBuildSubmissionDate().isAfter(now.minusMinutes(buildJobExpirationInMinutes))) { + var submissionDate = buildJob.getBuildSubmissionDate(); + if (submissionDate == null || submissionDate.isAfter(now.minusMinutes(buildJobExpirationInMinutes))) { log.debug("Build job with id {} is too recent to check", buildJob.getBuildJobId()); continue; } From dcde1f189203aab848fda6908b10636543138723 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 4 Sep 2025 22:48:14 +0200 Subject: [PATCH 11/19] fix log order --- .../programming/service/localci/LocalCIMissingJobService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java index 9a6174d2c784..b25139e1ab8b 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ -115,7 +115,7 @@ public void retryMissingJobs() { for (BuildJob buildJob : missingJobs) { if (buildJob.getRetryCount() >= maxMissingJobRetries) { log.warn("Build job with id {} for participation {} has reached the maximum number of {} retries and will not be retried.", buildJob.getBuildJobId(), - maxMissingJobRetries, buildJob.getParticipationId()); + buildJob.getParticipationId(), maxMissingJobRetries); continue; } From 743392ac727432ae43d943d48d5baf4c47330a0a Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Fri, 5 Sep 2025 00:47:06 +0200 Subject: [PATCH 12/19] improve retry logic --- .../repository/BuildJobRepository.java | 40 +++++++++---------- .../localci/LocalCIMissingJobService.java | 14 +++++-- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java 
b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java index 3b32dcb2265a..3b3df7ee6384 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/repository/BuildJobRepository.java @@ -14,7 +14,6 @@ import org.springframework.context.annotation.Profile; import org.springframework.data.domain.Pageable; import org.springframework.data.domain.Slice; -import org.springframework.data.domain.SliceImpl; import org.springframework.data.jpa.repository.EntityGraph; import org.springframework.data.jpa.repository.JpaSpecificationExecutor; import org.springframework.data.jpa.repository.Modifying; @@ -187,39 +186,40 @@ void updateBuildJobStatusWithBuildStartDate(@Param("buildJobId") String buildJob List findAllByBuildStatusIn(List statuses); /** - * Find all build jobs with the given build statuses in the given time range, ordered by submission date descending. + * Returns a slice of missing build jobs submitted within the given time range for whose participation no newer job exists, ordered by submission date descending. 
* - * @param statuses the list of build statuses * @param startTime earliest build submission time * @param endTime latest build submission time * @param pageable pagination information - * @return the list of build jobs + * @return slice of matching build jobs */ @Query(""" SELECT b FROM BuildJob b - WHERE b.buildStatus IN :statuses + WHERE b.buildStatus = de.tum.cit.aet.artemis.programming.domain.build.BuildStatus.MISSING AND b.buildSubmissionDate >= :startTime AND b.buildSubmissionDate <= :endTime + AND NOT EXISTS ( + SELECT 1 + FROM BuildJob b2 + WHERE b2.participationId = b.participationId + AND b2.buildSubmissionDate > b.buildSubmissionDate + ) ORDER BY b.buildSubmissionDate DESC """) - Slice queryFindJobsByStatusesInTimeRange(@Param("statuses") List statuses, @Param("startTime") ZonedDateTime startTime, - @Param("endTime") ZonedDateTime endTime, Pageable pageable); + Slice findMissingJobsToRetryInTimeRange(@Param("startTime") ZonedDateTime startTime, @Param("endTime") ZonedDateTime endTime, Pageable pageable); /** - * Returns a slice of build jobs submitted within the given time range whose buildStatus is contained in the provided list, ordered by submission date descending. - * If the list of statuses is null or empty, an empty slice is returned. 
+ * Increment the retry count of a build job by 1 * - * @param statuses list of build statuses; may be null or empty - * @param startTime earliest build submission time (inclusive) - * @param endTime latest build submission time (inclusive) - * @param pageable pagination information - * @return slice of matching build jobs + * @param buildJobId the ID of the build job */ - default Slice findJobsByStatusesInTimeRange(List statuses, ZonedDateTime startTime, ZonedDateTime endTime, Pageable pageable) { - if (statuses == null || statuses.isEmpty()) { - return new SliceImpl<>(List.of(), pageable, false); - } - return queryFindJobsByStatusesInTimeRange(statuses, startTime, endTime, pageable); - } + @Modifying + @Transactional + @Query(""" + UPDATE BuildJob b + SET b.retryCount = b.retryCount + 1 + WHERE b.buildJobId = :buildJobId + """) + void incrementRetryCount(@Param("buildJobId") String buildJobId); } diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java index b25139e1ab8b..828d096e782b 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ -108,7 +108,7 @@ public void checkPendingBuildJobsStatus() { public void retryMissingJobs() { log.debug("Checking for missing build jobs to retry"); - Slice missingJobsSlice = getMissingJobsSliceOfLastHour(50); + Slice missingJobsSlice = getMissingJobsToRetrySliceOfLastHour(50); List missingJobs = missingJobsSlice.getContent(); log.debug("Processing {} missing build jobs to retry", missingJobs.size()); @@ -118,9 +118,9 @@ public void retryMissingJobs() { buildJob.getParticipationId(), maxMissingJobRetries); continue; } - try { localCITriggerService.retryBuildJob(buildJob, (ProgrammingExerciseParticipation) 
participationRepository.findByIdElseThrow(buildJob.getParticipationId())); + buildJobRepository.incrementRetryCount(buildJob.getBuildJobId()); } catch (Exception e) { log.error("Failed to retry build job with id {} for participation {}", buildJob.getBuildJobId(), buildJob.getParticipationId(), e); @@ -140,10 +140,16 @@ private boolean checkIfBuildJobIsStillQueued(List queuedJobs, return queuedJobs.stream().anyMatch(job -> job.id().equals(buildJobId)); } - private Slice getMissingJobsSliceOfLastHour(int maxResults) { + /** + * Retrieves a slice of missing build jobs submitted within the last hour that do not have a newer job for the same participation. + * + * @param maxResults the maximum number of results to retrieve + * @return a slice of missing build jobs + */ + private Slice getMissingJobsToRetrySliceOfLastHour(int maxResults) { Pageable pageable = PageRequest.of(0, maxResults); ZonedDateTime now = ZonedDateTime.now(); ZonedDateTime oneHourAgo = now.minusHours(1); - return buildJobRepository.findJobsByStatusesInTimeRange(List.of(BuildStatus.MISSING), oneHourAgo, now, pageable); + return buildJobRepository.findMissingJobsToRetryInTimeRange(oneHourAgo, now, pageable); } } From 0e3206ffc221ed6f56382426cec23074298360e2 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Fri, 5 Sep 2025 00:50:41 +0200 Subject: [PATCH 13/19] improve test --- .../aet/artemis/programming/icl/LocalCIIntegrationTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java index 5023fcad139c..e95cdda70536 100644 --- a/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java +++ b/src/test/java/de/tum/cit/aet/artemis/programming/icl/LocalCIIntegrationTest.java @@ -366,8 +366,8 @@ void testMissingBuildJobRetry() { if (buildJobOptional.isEmpty()) { return false; } - BuildJob retryedBuildJob = 
buildJobOptional.get(); - return retryedBuildJob.getBuildStatus() == BuildStatus.QUEUED && retryedBuildJob.getRetryCount() == 1; + BuildJob retriedBuildJob = buildJobOptional.get(); + return (retriedBuildJob.getBuildStatus() == BuildStatus.QUEUED || retriedBuildJob.getBuildStatus() == BuildStatus.BUILDING) && retriedBuildJob.getRetryCount() == 1; }); processingJobs.clear(); queuedJobs.clear(); From 3ec393b6fdb6e68414d905e2a9f806998e257048 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Fri, 5 Sep 2025 00:53:39 +0200 Subject: [PATCH 14/19] LocalCIException --- .../programming/service/localci/LocalCITriggerService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java index 9f7f33572d72..bedffb26ecc3 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCITriggerService.java @@ -153,7 +153,7 @@ public void triggerBuild(ProgrammingExerciseParticipation participation, String triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false, 0); } - public void retryBuildJob(BuildJob buildJob, ProgrammingExerciseParticipation participation) { + public void retryBuildJob(BuildJob buildJob, ProgrammingExerciseParticipation participation) throws LocalCIException { log.info("Retrying build for missing build job with id {} (retry count: {})", buildJob.getBuildJobId(), buildJob.getRetryCount() + 1); triggerBuild(participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), buildJob.getRetryCount() + 1); } From 071ebc044b5dd53722812b9dd5119ea95f76ef63 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Fri, 5 Sep 2025 11:45:58 +0200 Subject: [PATCH 15/19] config default max-missing-job-retries --- src/main/resources/config/application-localci.yml 
| 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/resources/config/application-localci.yml b/src/main/resources/config/application-localci.yml index 3bb18bae0874..aed90205a7c6 100644 --- a/src/main/resources/config/application-localci.yml +++ b/src/main/resources/config/application-localci.yml @@ -37,3 +37,4 @@ artemis: default: 120 # The default number of seconds that will be used if the instructor does not specify a value. This is the default value that will be shown in the slider when creating or editing a programming exercise. # Max value also defines the max timeout for the build. Meaning that builds that exceed this time will be automatically interrupted. max: 240 # The maximum number of seconds that the instructor can set for the build timeout. This is the maximum value that will be shown in the slider when creating or editing a programming exercise. + max-missing-job-retries: 3 # The Maximium number of retries for jobs that are missing (jobs that got persisted but are neither still running, queued or completed) From 9d47fd9080b5607cac21a58b332eac4d0ad034bf Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Fri, 5 Sep 2025 12:37:23 +0200 Subject: [PATCH 16/19] add composite indizes to buildJob table --- .../changelog/20250905120000_changelog.xml | 22 +++++++++++++++++++ .../resources/config/liquibase/master.xml | 1 + 2 files changed, 23 insertions(+) create mode 100644 src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml diff --git a/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml b/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml new file mode 100644 index 000000000000..b229f69c2c70 --- /dev/null +++ b/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml @@ -0,0 +1,22 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/src/main/resources/config/liquibase/master.xml b/src/main/resources/config/liquibase/master.xml index 52eccb9d875f..410da416c192 100644 
--- a/src/main/resources/config/liquibase/master.xml +++ b/src/main/resources/config/liquibase/master.xml @@ -41,6 +41,7 @@ + From 1f039f224415a798f5cc596df1a64fa3e00d1b0d Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 11 Sep 2025 10:38:55 +0200 Subject: [PATCH 17/19] Revert "add composite indizes to buildJob table" This reverts commit 9d47fd9080b5607cac21a58b332eac4d0ad034bf. --- .../changelog/20250905120000_changelog.xml | 22 ------------------- .../resources/config/liquibase/master.xml | 1 - 2 files changed, 23 deletions(-) delete mode 100644 src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml diff --git a/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml b/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml deleted file mode 100644 index b229f69c2c70..000000000000 --- a/src/main/resources/config/liquibase/changelog/20250905120000_changelog.xml +++ /dev/null @@ -1,22 +0,0 @@ - - - - - - - - - - - - - - - - - - - - diff --git a/src/main/resources/config/liquibase/master.xml b/src/main/resources/config/liquibase/master.xml index 410da416c192..52eccb9d875f 100644 --- a/src/main/resources/config/liquibase/master.xml +++ b/src/main/resources/config/liquibase/master.xml @@ -41,7 +41,6 @@ - From 68fa6145b8bb2f30d2afe76dce2280b7d0ee63f1 Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 11 Sep 2025 10:40:31 +0200 Subject: [PATCH 18/19] temp debug log --- .../programming/service/localci/LocalCIMissingJobService.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java index 828d096e782b..7ea9737439cc 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ 
-108,7 +108,10 @@ public void checkPendingBuildJobsStatus() { public void retryMissingJobs() { log.debug("Checking for missing build jobs to retry"); + var start = System.currentTimeMillis(); Slice missingJobsSlice = getMissingJobsToRetrySliceOfLastHour(50); + log.debug("Retrieving missing jobs took {} ms", System.currentTimeMillis() - start); + List missingJobs = missingJobsSlice.getContent(); log.debug("Processing {} missing build jobs to retry", missingJobs.size()); From 1e636cd821de768a846e586249b84fb0b54d38ee Mon Sep 17 00:00:00 2001 From: jfr2102 Date: Thu, 11 Sep 2025 19:29:34 +0200 Subject: [PATCH 19/19] Revert "temp debug log" This reverts commit 68fa6145b8bb2f30d2afe76dce2280b7d0ee63f1. --- .../programming/service/localci/LocalCIMissingJobService.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java index 7ea9737439cc..828d096e782b 100644 --- a/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java +++ b/src/main/java/de/tum/cit/aet/artemis/programming/service/localci/LocalCIMissingJobService.java @@ -108,10 +108,7 @@ public void checkPendingBuildJobsStatus() { public void retryMissingJobs() { log.debug("Checking for missing build jobs to retry"); - var start = System.currentTimeMillis(); Slice missingJobsSlice = getMissingJobsToRetrySliceOfLastHour(50); - log.debug("Retrieving missing jobs took {} ms", System.currentTimeMillis() - start); - List missingJobs = missingJobsSlice.getContent(); log.debug("Processing {} missing build jobs to retry", missingJobs.size());