Skip to content
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -184,4 +184,42 @@ void updateBuildJobStatusWithBuildStartDate(@Param("buildJobId") String buildJob
* @return the list of build jobs
*/
List<BuildJob> findAllByBuildStatusIn(List<BuildStatus> statuses);

/**
 * Returns a slice of build jobs with status MISSING that were submitted within the given time range (both bounds inclusive)
 * and for whose participation no build job with a later submission date exists, ordered by submission date descending.
 * <p>
 * The "no newer job" condition does not filter by build status: a missing job is excluded as soon as any build job with a
 * later submission date exists for the same participation.
 *
 * @param startTime earliest build submission time (inclusive)
 * @param endTime   latest build submission time (inclusive)
 * @param pageable  pagination information
 * @return slice of matching build jobs
 */
@Query("""
SELECT b
FROM BuildJob b
WHERE b.buildStatus = de.tum.cit.aet.artemis.programming.domain.build.BuildStatus.MISSING
AND b.buildSubmissionDate >= :startTime
AND b.buildSubmissionDate <= :endTime
AND NOT EXISTS (
SELECT 1
FROM BuildJob b2
WHERE b2.participationId = b.participationId
AND b2.buildSubmissionDate > b.buildSubmissionDate
)
ORDER BY b.buildSubmissionDate DESC
""")
Slice<BuildJob> findMissingJobsToRetryInTimeRange(@Param("startTime") ZonedDateTime startTime, @Param("endTime") ZonedDateTime endTime, Pageable pageable);

/**
 * Increments the retry count of the build job with the given build job ID by 1.
 * <p>
 * The increment is executed as a single JPQL update statement ({@code retryCount = retryCount + 1}),
 * so the current value is not read into the application first.
 *
 * @param buildJobId the ID of the build job whose retry count should be incremented
 */
@Modifying
@Transactional
@Query("""
UPDATE BuildJob b
SET b.retryCount = b.retryCount + 1
WHERE b.buildJobId = :buildJobId
""")
void incrementRetryCount(@Param("buildJobId") String buildJobId);
}
Original file line number Diff line number Diff line change
@@ -1,9 +1,7 @@
package de.tum.cit.aet.artemis.programming.service.localci;

import java.time.ZonedDateTime;
import java.util.List;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import jakarta.annotation.PostConstruct;

Expand All @@ -25,7 +23,6 @@
import de.tum.cit.aet.artemis.communication.service.notifications.MailService;
import de.tum.cit.aet.artemis.core.domain.User;
import de.tum.cit.aet.artemis.core.service.user.UserService;
import de.tum.cit.aet.artemis.programming.domain.build.BuildJob;
import de.tum.cit.aet.artemis.programming.domain.build.BuildStatus;
import de.tum.cit.aet.artemis.programming.dto.SubmissionProcessingDTO;
import de.tum.cit.aet.artemis.programming.repository.BuildJobRepository;
Expand Down Expand Up @@ -87,52 +84,6 @@ public void init() {
distributedDataAccessService.getDistributedBuildAgentInformation().addEntryListener(new BuildAgentListener(), true);
}

/**
 * Periodically checks the status of pending build jobs and updates their status if they are missing.
 * <p>
 * This scheduled task ensures that build jobs which are stuck in the QUEUED or BUILDING state for too long
 * are detected and marked as MISSING if their status cannot be verified. This helps prevent indefinite
 * waiting states due to external failures or inconsistencies in the CI system.
 * </p>
 * <p>
 * This mechanism is necessary because build jobs are managed externally, and various failure scenarios
 * can lead to jobs being lost without Artemis being notified:
 * </p>
 * <ul>
 * <li>Application crashes or restarts while build job was queued</li>
 * <li>network issues leading to Hazelcast data loss</li>
 * <li>Build agent crashes or is disconnected</li>
 * </ul>
 * <p>
 * Build jobs without a submission date are skipped, because their age cannot be determined.
 * </p>
 */
@Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS)
public void checkPendingBuildJobsStatus() {
    log.debug("Checking pending build jobs status");
    List<BuildJob> pendingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.QUEUED, BuildStatus.BUILDING));
    final int buildJobExpirationInMinutes = 5; // If a build job is older than 5 minutes, and its status can't be determined, set it to missing
    // Jobs submitted after this point in time are considered too recent to judge; computed once for the whole run.
    final ZonedDateTime expirationThreshold = ZonedDateTime.now().minusMinutes(buildJobExpirationInMinutes);

    var queuedJobs = distributedDataAccessService.getQueuedJobs();
    var processingJobs = distributedDataAccessService.getProcessingJobIds();

    for (BuildJob buildJob : pendingBuildJobs) {
        ZonedDateTime submissionDate = buildJob.getBuildSubmissionDate();
        if (submissionDate == null) {
            // Without a submission date the age is unknown; skip instead of failing the whole run with a NullPointerException.
            log.debug("Build job with id {} has no submission date and cannot be checked", buildJob.getBuildJobId());
            continue;
        }
        if (submissionDate.isAfter(expirationThreshold)) {
            log.debug("Build job with id {} is too recent to check", buildJob.getBuildJobId());
            continue;
        }
        if (buildJob.getBuildStatus() == BuildStatus.QUEUED && checkIfBuildJobIsStillQueued(queuedJobs, buildJob.getBuildJobId())) {
            log.debug("Build job with id {} is still queued", buildJob.getBuildJobId());
            continue;
        }
        if (checkIfBuildJobIsStillBuilding(processingJobs, buildJob.getBuildJobId())) {
            log.debug("Build job with id {} is still building", buildJob.getBuildJobId());
            continue;
        }
        log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId());
        // The job is neither queued nor being processed, so its state is unknown: mark it as missing.
        buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING);
    }
}

/**
* Processes the queued results from the distributed build result queue every minute.
* This is a fallback mechanism to ensure that no results are left unprocessed in the queue e.g. if listener events are lost
Expand All @@ -156,14 +107,6 @@ public void processQueuedResults() {
}
}

/**
 * Tells whether the given build job id is contained in the list of ids of jobs that are currently being processed.
 *
 * @param processingJobIds the ids of the jobs that are currently being processed
 * @param buildJobId       the id of the build job to look for
 * @return true if the id is contained in the list, false otherwise
 */
private boolean checkIfBuildJobIsStillBuilding(List<String> processingJobIds, String buildJobId) {
    for (String processingJobId : processingJobIds) {
        // Same element comparison that List#contains specifies, including the null case.
        boolean sameId = buildJobId == null ? processingJobId == null : buildJobId.equals(processingJobId);
        if (sameId) {
            return true;
        }
    }
    return false;
}

/**
 * Tells whether one of the given queued jobs has the given build job id.
 *
 * @param queuedJobs the queue items to search
 * @param buildJobId the id of the build job to look for
 * @return true if a queue item with this id exists, false otherwise
 */
private boolean checkIfBuildJobIsStillQueued(List<BuildJobQueueItem> queuedJobs, String buildJobId) {
    for (BuildJobQueueItem queuedJob : queuedJobs) {
        if (queuedJob.id().equals(buildJobId)) {
            return true;
        }
    }
    return false;
}

private class QueuedBuildJobItemListener implements ItemListener<BuildJobQueueItem> {

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package de.tum.cit.aet.artemis.programming.service.localci;

import java.time.ZonedDateTime;
import java.util.List;
import java.util.concurrent.TimeUnit;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Lazy;
import org.springframework.context.annotation.Profile;
import org.springframework.data.domain.PageRequest;
import org.springframework.data.domain.Pageable;
import org.springframework.data.domain.Slice;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;

import de.tum.cit.aet.artemis.buildagent.dto.BuildJobQueueItem;
import de.tum.cit.aet.artemis.exercise.repository.ParticipationRepository;
import de.tum.cit.aet.artemis.programming.domain.ProgrammingExerciseParticipation;
import de.tum.cit.aet.artemis.programming.domain.build.BuildJob;
import de.tum.cit.aet.artemis.programming.domain.build.BuildStatus;
import de.tum.cit.aet.artemis.programming.repository.BuildJobRepository;

/**
 * Scheduled service for detecting and retrying missing build jobs in the LocalCI system.
 * <p>
 * {@link #checkPendingBuildJobsStatus()} marks pending build jobs whose state can no longer be verified as
 * {@link BuildStatus#MISSING}; {@link #retryMissingJobs()} re-triggers recently missing build jobs up to a configurable
 * maximum number of retries.
 */
@Lazy
@Service
@Profile("localci & scheduling")
public class LocalCIMissingJobService {

    private static final Logger log = LoggerFactory.getLogger(LocalCIMissingJobService.class);

    /** If a build job is older than this many minutes and its status can't be determined, it is set to missing. */
    private static final int BUILD_JOB_EXPIRATION_IN_MINUTES = 5;

    /** Maximum number of missing build jobs that are fetched and processed per scheduled retry run. */
    private static final int MAX_MISSING_JOBS_PER_RUN = 50;

    private final BuildJobRepository buildJobRepository;

    private final LocalCITriggerService localCITriggerService;

    private final ParticipationRepository participationRepository;

    private final DistributedDataAccessService distributedDataAccessService;

    @Value("${artemis.continuous-integration.max-missing-job-retries:3}")
    private int maxMissingJobRetries;

    public LocalCIMissingJobService(BuildJobRepository buildJobRepository, LocalCITriggerService localCITriggerService, ParticipationRepository participationRepository,
            DistributedDataAccessService distributedDataAccessService) {
        this.buildJobRepository = buildJobRepository;
        this.localCITriggerService = localCITriggerService;
        this.participationRepository = participationRepository;
        this.distributedDataAccessService = distributedDataAccessService;
    }

    /**
     * Periodically checks the status of pending build jobs and updates their status if they are missing.
     * <p>
     * This scheduled task ensures that build jobs which are stuck in the QUEUED or BUILDING state for too long
     * are detected and marked as MISSING if their status cannot be verified. This helps prevent indefinite
     * waiting states due to external failures or inconsistencies in the CI system.
     * </p>
     * <p>
     * This mechanism is necessary because build jobs are managed externally, and various failure scenarios
     * can lead to jobs being lost without Artemis being notified:
     * </p>
     * <ul>
     * <li>Application crashes or restarts while build job was queued</li>
     * <li>network issues leading to Hazelcast data loss</li>
     * <li>Build agent crashes or is disconnected</li>
     * </ul>
     * <p>
     * Build jobs without a submission date are skipped, because their age cannot be determined.
     * </p>
     */
    @Scheduled(fixedRateString = "${artemis.continuous-integration.check-job-status-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.check-job-status-delay-seconds:60}", timeUnit = TimeUnit.SECONDS)
    public void checkPendingBuildJobsStatus() {
        log.debug("Checking pending build jobs status");
        List<BuildJob> pendingBuildJobs = buildJobRepository.findAllByBuildStatusIn(List.of(BuildStatus.QUEUED, BuildStatus.BUILDING));
        if (pendingBuildJobs.isEmpty()) {
            // Nothing to verify, so the queued and processing jobs do not need to be fetched at all.
            return;
        }

        // Jobs submitted after this point in time are considered too recent to judge; computed once for the whole run.
        final ZonedDateTime expirationThreshold = ZonedDateTime.now().minusMinutes(BUILD_JOB_EXPIRATION_IN_MINUTES);

        var queuedJobs = distributedDataAccessService.getQueuedJobs();
        var processingJobs = distributedDataAccessService.getProcessingJobIds();

        for (BuildJob buildJob : pendingBuildJobs) {
            ZonedDateTime submissionDate = buildJob.getBuildSubmissionDate();
            if (submissionDate == null) {
                log.debug("Build job with id {} has no submission date and cannot be checked", buildJob.getBuildJobId());
                continue;
            }
            if (submissionDate.isAfter(expirationThreshold)) {
                log.debug("Build job with id {} is too recent to check", buildJob.getBuildJobId());
                continue;
            }
            if (buildJob.getBuildStatus() == BuildStatus.QUEUED && checkIfBuildJobIsStillQueued(queuedJobs, buildJob.getBuildJobId())) {
                log.debug("Build job with id {} is still queued", buildJob.getBuildJobId());
                continue;
            }
            if (checkIfBuildJobIsStillBuilding(processingJobs, buildJob.getBuildJobId())) {
                log.debug("Build job with id {} is still building", buildJob.getBuildJobId());
                continue;
            }
            log.error("Build job with id {} is in an unknown state", buildJob.getBuildJobId());
            // The job is neither queued nor being processed, so its state is unknown: mark it as missing.
            buildJobRepository.updateBuildJobStatus(buildJob.getBuildJobId(), BuildStatus.MISSING);
        }
    }

    /**
     * Periodically retries missing build jobs.
     * <p>
     * Retrieves a slice of missing build jobs from the last hour and attempts to retry them.
     * If a build job has reached the maximum number of retries, it will not be retried again.
     * A failure to retry one build job is logged and does not prevent the remaining jobs of the slice from being retried.
     * </p>
     */
    @Scheduled(fixedRateString = "${artemis.continuous-integration.retry-missing-jobs-interval-seconds:300}", initialDelayString = "${artemis.continuous-integration.retry-missing-jobs-delay-seconds:120}", timeUnit = TimeUnit.SECONDS)
    public void retryMissingJobs() {
        log.debug("Checking for missing build jobs to retry");

        long start = System.currentTimeMillis();
        Slice<BuildJob> missingJobsSlice = getMissingJobsToRetrySliceOfLastHour(MAX_MISSING_JOBS_PER_RUN);
        log.debug("Retrieving missing jobs took {} ms", System.currentTimeMillis() - start);

        List<BuildJob> missingJobs = missingJobsSlice.getContent();
        log.debug("Processing {} missing build jobs to retry", missingJobs.size());

        for (BuildJob buildJob : missingJobs) {
            if (buildJob.getRetryCount() >= maxMissingJobRetries) {
                log.warn("Build job with id {} for participation {} has reached the maximum number of {} retries and will not be retried.", buildJob.getBuildJobId(),
                        buildJob.getParticipationId(), maxMissingJobRetries);
                continue;
            }
            try {
                var participation = (ProgrammingExerciseParticipation) participationRepository.findByIdElseThrow(buildJob.getParticipationId());
                localCITriggerService.retryBuildJob(buildJob, participation);
                buildJobRepository.incrementRetryCount(buildJob.getBuildJobId());
            }
            catch (Exception e) {
                // Deliberately broad: a single failing job (lookup, cast or trigger) must not abort the retries of the other jobs.
                log.error("Failed to retry build job with id {} for participation {}", buildJob.getBuildJobId(), buildJob.getParticipationId(), e);
            }
        }

        if (missingJobsSlice.hasNext()) {
            log.debug("There are more missing jobs to process in the next scheduled run.");
        }
    }

    /**
     * Checks whether the given build job id is contained in the ids of the jobs that are currently being processed.
     *
     * @param processingJobIds the ids of the jobs that are currently being processed
     * @param buildJobId       the id of the build job to look for
     * @return true if the id is contained in the list, false otherwise
     */
    private boolean checkIfBuildJobIsStillBuilding(List<String> processingJobIds, String buildJobId) {
        return processingJobIds.contains(buildJobId);
    }

    /**
     * Checks whether one of the given queued jobs has the given build job id.
     *
     * @param queuedJobs the queue items to search
     * @param buildJobId the id of the build job to look for
     * @return true if a queue item with this id exists, false otherwise
     */
    private boolean checkIfBuildJobIsStillQueued(List<BuildJobQueueItem> queuedJobs, String buildJobId) {
        return queuedJobs.stream().anyMatch(job -> job.id().equals(buildJobId));
    }

    /**
     * Retrieves a slice of missing build jobs submitted within the last hour that do not have a newer job for the same participation.
     *
     * @param maxResults the maximum number of results to retrieve
     * @return a slice of missing build jobs
     */
    private Slice<BuildJob> getMissingJobsToRetrySliceOfLastHour(int maxResults) {
        Pageable pageable = PageRequest.of(0, maxResults);
        ZonedDateTime now = ZonedDateTime.now();
        ZonedDateTime oneHourAgo = now.minusHours(1);
        return buildJobRepository.findMissingJobsToRetryInTimeRange(oneHourAgo, now, pageable);
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ public LocalCITriggerService(DistributedDataAccessService distributedDataAccessS
*/
@Override
public void triggerBuild(ProgrammingExerciseParticipation participation, boolean triggerAll) throws LocalCIException {
triggerBuild(participation, null, null, triggerAll);
triggerBuild(participation, null, null, triggerAll, 0);
}

/**
Expand All @@ -150,10 +150,28 @@ public void triggerBuild(ProgrammingExerciseParticipation participation, boolean
*/
@Override
public void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo) throws LocalCIException {
triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false);
triggerBuild(participation, commitHashToBuild, triggeredByPushTo, false, 0);
}

private void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, boolean triggerAll)
/**
 * Triggers a new build for a build job that went missing.
 * <p>
 * The build uses the commit hash and the push trigger of the given build job, and is triggered with the job's retry count increased by one.
 *
 * @param buildJob      the missing build job that should be retried
 * @param participation the participation of the repository which should be built and tested
 * @throws LocalCIException if the build job could not be added to the queue.
 */
public void retryBuildJob(BuildJob buildJob, ProgrammingExerciseParticipation participation) throws LocalCIException {
    final int nextRetryCount = buildJob.getRetryCount() + 1;
    log.info("Retrying build for missing build job with id {} (retry count: {})", buildJob.getBuildJobId(), nextRetryCount);
    triggerBuild(participation, buildJob.getCommitHash(), buildJob.getTriggeredByPushTo(), nextRetryCount);
}

/**
 * Adds a new build job item with all information required for its execution to the distributed build job queue,
 * recording how often the build has already been retried. The build is never triggered for all participations.
 *
 * @param participation     the participation of the repository which should be built and tested
 * @param commitHashToBuild the commit hash of the commit that triggers the build. If it is null, the latest commit of the default branch will be built.
 * @param triggeredByPushTo type of the repository that was pushed to and triggered the build job
 * @param retryCount        how often the build has been retried after it went missing
 * @throws LocalCIException if the build job could not be added to the queue.
 */
public void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, int retryCount) throws LocalCIException {
    final boolean triggerAll = false;
    triggerBuild(participation, commitHashToBuild, triggeredByPushTo, triggerAll, retryCount);
}

private void triggerBuild(ProgrammingExerciseParticipation participation, String commitHashToBuild, RepositoryType triggeredByPushTo, boolean triggerAll, int retryCount)
throws LocalCIException {

log.info("Triggering build for participation {} and commit hash {}", participation.getId(), commitHashToBuild);
Expand Down Expand Up @@ -207,7 +225,7 @@ else if (triggeredByPushTo.equals(RepositoryType.TESTS)) {
BuildAgentDTO buildAgent = new BuildAgentDTO(null, null, null);

BuildJobQueueItem buildJobQueueItem = new BuildJobQueueItem(buildJobId, participation.getBuildPlanId(), buildAgent, participation.getId(), courseId,
programmingExercise.getId(), 0, priority, null, repositoryInfo, jobTimingInfo, buildConfig, null);
programmingExercise.getId(), retryCount, priority, null, repositoryInfo, jobTimingInfo, buildConfig, null);

// Save the build job before adding it to the queue to ensure it exists in the database.
// This prevents potential race conditions where a build agent pulls the job from the queue very quickly before it is persisted,
Expand Down
1 change: 1 addition & 0 deletions src/main/resources/config/application-localci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,4 @@ artemis:
default: 120 # The default number of seconds that will be used if the instructor does not specify a value. This is the default value that will be shown in the slider when creating or editing a programming exercise.
# Max value also defines the max timeout for the build. Meaning that builds that exceed this time will be automatically interrupted.
max: 240 # The maximum number of seconds that the instructor can set for the build timeout. This is the maximum value that will be shown in the slider when creating or editing a programming exercise.
max-missing-job-retries: 3 # The maximum number of retries for jobs that are missing (jobs that were persisted but are neither queued, running, nor completed)
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import de.tum.cit.aet.artemis.programming.repository.VcsAccessLogRepository;
import de.tum.cit.aet.artemis.programming.service.BuildLogEntryService;
import de.tum.cit.aet.artemis.programming.service.ParticipationVcsAccessTokenService;
import de.tum.cit.aet.artemis.programming.service.localci.LocalCIMissingJobService;
import de.tum.cit.aet.artemis.programming.service.localci.LocalCIResultService;
import de.tum.cit.aet.artemis.programming.service.localci.LocalCITriggerService;
import de.tum.cit.aet.artemis.programming.service.localvc.LocalVCServletService;
Expand Down Expand Up @@ -112,6 +113,9 @@ protected void mockBuildAgentServices() {
@Autowired
protected LocalCITriggerService localCITriggerService;

@Autowired
protected LocalCIMissingJobService localCIMissingJobService;

@Autowired
protected ParticipationVcsAccessTokenService participationVcsAccessTokenService;

Expand Down
Loading
Loading