@@ -17,7 +17,7 @@ limitations under the License.
17
17
package trial
18
18
19
19
import (
20
- "sync "
20
+ "context "
21
21
"testing"
22
22
"time"
23
23
@@ -48,14 +48,47 @@ import (
48
48
49
49
const (
50
50
namespace = "default"
51
- trialName = "test-trial"
52
51
batchJobName = "test-job"
53
52
objectiveMetric = "accuracy"
54
- timeout = time .Second * 80
53
+ timeout = time .Second * 10
55
54
)
56
55
57
- var trialKey = types.NamespacedName {Name : trialName , Namespace : namespace }
58
- var batchJobKey = types.NamespacedName {Name : batchJobName , Namespace : namespace }
56
+ var (
57
+ batchJobKey = types.NamespacedName {Name : batchJobName , Namespace : namespace }
58
+ observationLogAvailable = & api_pb.GetObservationLogReply {
59
+ ObservationLog : & api_pb.ObservationLog {
60
+ MetricLogs : []* api_pb.MetricLog {
61
+ {
62
+ TimeStamp : "2020-08-10T14:47:38+08:00" ,
63
+ Metric : & api_pb.Metric {
64
+ Name : objectiveMetric ,
65
+ Value : "0.99" ,
66
+ },
67
+ },
68
+ {
69
+ TimeStamp : "2020-08-10T14:50:38+08:00" ,
70
+ Metric : & api_pb.Metric {
71
+ Name : objectiveMetric ,
72
+ Value : "0.11" ,
73
+ },
74
+ },
75
+ },
76
+ },
77
+ }
78
+ observationLogUnavailable = & api_pb.GetObservationLogReply {
79
+ ObservationLog : & api_pb.ObservationLog {
80
+ MetricLogs : []* api_pb.MetricLog {
81
+ {
82
+ Metric : & api_pb.Metric {
83
+ Name : objectiveMetric ,
84
+ Value : consts .UnavailableMetricValue ,
85
+ },
86
+ TimeStamp : time.Time {}.UTC ().Format (time .RFC3339 ),
87
+ },
88
+ },
89
+ },
90
+ }
91
+ )
59
92
60
93
func init () {
61
94
logf .SetLogger (zap .New (zap .UseDevMode (true )))
@@ -112,6 +145,7 @@ func TestReconcileBatchJob(t *testing.T) {
112
145
// Retry the status update until it succeeds
113
146
for err != nil {
114
147
updatedInstance := & trialsv1beta1.Trial {}
148
+ trialKey := types.NamespacedName {Name : instance .Name , Namespace : namespace }
115
149
if err = c .Get (ctx , trialKey , updatedInstance ); err != nil {
116
150
continue
117
151
}
@@ -134,59 +168,22 @@ func TestReconcileBatchJob(t *testing.T) {
134
168
viper .Set (consts .ConfigTrialResources , trialResources )
135
169
g .Expect (add (mgr , recFn )).NotTo (gomega .HaveOccurred ())
136
170
137
- // Start test manager.
138
- wg := & sync. WaitGroup {}
139
- wg . Add ( 1 )
171
+ // Start test manager
172
+ mgrCtx , cancel := context . WithCancel ( context . TODO ())
173
+ t . Cleanup ( cancel )
140
174
go func () {
141
- defer wg .Done ()
142
- g .Expect (mgr .Start (ctx )).NotTo (gomega .HaveOccurred ())
175
+ g .Expect (mgr .Start (mgrCtx )).NotTo (gomega .HaveOccurred ())
143
176
}()
144
177
145
- // Result for GetTrialObservationLog with some metrics.
146
- observationLogAvailable := & api_pb.GetObservationLogReply {
147
- ObservationLog : & api_pb.ObservationLog {
148
- MetricLogs : []* api_pb.MetricLog {
149
- {
150
- TimeStamp : "2020-08-10T14:47:38+08:00" ,
151
- Metric : & api_pb.Metric {
152
- Name : objectiveMetric ,
153
- Value : "0.99" ,
154
- },
155
- },
156
- {
157
- TimeStamp : "2020-08-10T14:50:38+08:00" ,
158
- Metric : & api_pb.Metric {
159
- Name : objectiveMetric ,
160
- Value : "0.11" ,
161
- },
162
- },
163
- },
164
- },
165
- }
166
- // Empty result for GetTrialObservationLog.
167
- // If objective metrics are not parsed, metrics collector reports "unavailable" value to DB.
168
- observationLogUnavailable := & api_pb.GetObservationLogReply {
169
- ObservationLog : & api_pb.ObservationLog {
170
- MetricLogs : []* api_pb.MetricLog {
171
- {
172
- Metric : & api_pb.Metric {
173
- Name : objectiveMetric ,
174
- Value : consts .UnavailableMetricValue ,
175
- },
176
- TimeStamp : time.Time {}.UTC ().Format (time .RFC3339 ),
177
- },
178
- },
179
- },
180
- }
181
-
182
178
t .Run (`Trial run with "Failed" BatchJob.` , func (t * testing.T ) {
183
179
g := gomega .NewGomegaWithT (t )
184
180
mockManagerClient .EXPECT ().DeleteTrialObservationLog (gomock .Any ()).Return (nil , nil )
185
181
186
- trial := newFakeTrialBatchJob ()
182
+ trial := newFakeTrialBatchJob (commonv1beta1 .StdOutCollector , "test-failed-batch-job" )
183
+ trialKey := types.NamespacedName {Name : "test-failed-batch-job" , Namespace : namespace }
187
184
batchJob := & batchv1.Job {}
188
185
189
- // Create the Trial
186
+ // Create the Trial with StdOut MC
190
187
g .Expect (c .Create (ctx , trial )).NotTo (gomega .HaveOccurred ())
191
188
192
189
// Expect that BatchJob with appropriate name is created
@@ -239,7 +236,7 @@ func TestReconcileBatchJob(t *testing.T) {
239
236
}, timeout ).Should (gomega .BeTrue ())
240
237
})
241
238
242
- t .Run (`Trail with "Complete" BatchJob and Available metrics.` , func (t * testing.T ) {
239
+ t .Run (`Trial with "Complete" BatchJob and Available metrics.` , func (t * testing.T ) {
243
240
g := gomega .NewGomegaWithT (t )
244
241
gomock .InOrder (
245
242
mockManagerClient .EXPECT ().GetTrialObservationLog (gomock .Any ()).Return (observationLogAvailable , nil ).MinTimes (1 ),
@@ -262,8 +259,9 @@ func TestReconcileBatchJob(t *testing.T) {
262
259
}
263
260
g .Expect (c .Status ().Update (ctx , batchJob )).NotTo (gomega .HaveOccurred ())
264
261
265
- // Create the Trial
266
- trial := newFakeTrialBatchJob ()
262
+ // Create the Trial with StdOut MC
263
+ trial := newFakeTrialBatchJob (commonv1beta1 .StdOutCollector , "test-available-stdout" )
264
+ trialKey := types.NamespacedName {Name : "test-available-stdout" , Namespace : namespace }
267
265
g .Expect (c .Create (ctx , trial )).NotTo (gomega .HaveOccurred ())
268
266
269
267
// Expect that Trial status is succeeded and metrics are properly populated
@@ -290,28 +288,71 @@ func TestReconcileBatchJob(t *testing.T) {
290
288
}, timeout ).Should (gomega .BeTrue ())
291
289
})
292
290
293
- t .Run (`Trail with "Complete" BatchJob and Unavailable metrics.` , func (t * testing.T ) {
291
+ t .Run (`Trial with "Complete" BatchJob and Unavailable metrics(StdOut MC) .` , func (t * testing.T ) {
294
292
g := gomega .NewGomegaWithT (t )
295
293
gomock .InOrder (
296
294
mockManagerClient .EXPECT ().GetTrialObservationLog (gomock .Any ()).Return (observationLogUnavailable , nil ).MinTimes (1 ),
297
295
mockManagerClient .EXPECT ().DeleteTrialObservationLog (gomock .Any ()).Return (nil , nil ),
298
296
)
299
- // Create the Trial
300
- trial := newFakeTrialBatchJob ()
297
+ // Create the Trial with StdOut MC
298
+ trial := newFakeTrialBatchJob (commonv1beta1 .StdOutCollector , "test-unavailable-stdout" )
299
+ trialKey := types.NamespacedName {Name : "test-unavailable-stdout" , Namespace : namespace }
301
300
g .Expect (c .Create (ctx , trial )).NotTo (gomega .HaveOccurred ())
302
301
303
302
// Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason.
304
303
// Metrics are unavailable because GetTrialObservationLog returns "unavailable".
304
+ g .Eventually (func (g gomega.Gomega ) {
305
+ g .Expect (c .Get (ctx , trialKey , trial )).Should (gomega .Succeed ())
306
+ g .Expect (trial .IsMetricsUnavailable ()).Should (gomega .BeTrue ())
307
+ g .Expect (trial .Status .Observation .Metrics ).ShouldNot (gomega .HaveLen (0 ))
308
+ g .Expect (trial .Status .Observation .Metrics [0 ]).Should (gomega .BeComparableTo (commonv1beta1.Metric {
309
+ Name : objectiveMetric ,
310
+ Min : consts .UnavailableMetricValue ,
311
+ Max : consts .UnavailableMetricValue ,
312
+ Latest : consts .UnavailableMetricValue ,
313
+ }))
314
+ }, timeout ).Should (gomega .Succeed ())
315
+
316
+ // Delete the Trial
317
+ g .Expect (c .Delete (ctx , trial )).NotTo (gomega .HaveOccurred ())
318
+
319
+ // Expect that Trial is deleted
305
320
g .Eventually (func () bool {
306
- if err = c .Get (ctx , trialKey , trial ); err != nil {
307
- return false
308
- }
309
- return trial .IsMetricsUnavailable () &&
310
- len (trial .Status .Observation .Metrics ) > 0 &&
311
- trial .Status .Observation .Metrics [0 ].Min == consts .UnavailableMetricValue &&
312
- trial .Status .Observation .Metrics [0 ].Max == consts .UnavailableMetricValue &&
313
- trial .Status .Observation .Metrics [0 ].Latest == consts .UnavailableMetricValue
321
+ return errors .IsNotFound (c .Get (ctx , trialKey , & trialsv1beta1.Trial {}))
314
322
}, timeout ).Should (gomega .BeTrue ())
323
+ })
324
+
325
+ t .Run (`Trial with "Complete" BatchJob and Unavailable metrics(Push MC, failed once).` , func (t * testing.T ) {
326
+ mockCtrl .Finish ()
327
+ g := gomega .NewGomegaWithT (t )
328
+ gomock .InOrder (
329
+ mockManagerClient .EXPECT ().GetTrialObservationLog (gomock .Any ()).Return (observationLogUnavailable , nil ),
330
+ mockManagerClient .EXPECT ().ReportTrialObservationLog (gomock .Any (), gomock .Any ()).Return (nil , errReportMetricsFailed ),
331
+ mockManagerClient .EXPECT ().GetTrialObservationLog (gomock .Any ()).Return (observationLogUnavailable , nil ),
332
+ mockManagerClient .EXPECT ().ReportTrialObservationLog (gomock .Any (), gomock .Any ()).Return (nil , nil ),
333
+ mockManagerClient .EXPECT ().DeleteTrialObservationLog (gomock .Any ()).Return (nil , nil ),
334
+ )
335
+ mockManagerClient .EXPECT ().GetTrialObservationLog (gomock .Any ()).Return (observationLogUnavailable , nil ).AnyTimes ()
336
+ mockManagerClient .EXPECT ().ReportTrialObservationLog (gomock .Any (), gomock .Any ()).Return (nil , nil ).AnyTimes ()
337
+
338
+ // Create the Trial with Push MC
339
+ trial := newFakeTrialBatchJob (commonv1beta1 .PushCollector , "test-unavailable-push-failed-once" )
340
+ trialKey := types.NamespacedName {Name : "test-unavailable-push-failed-once" , Namespace : namespace }
341
+ g .Expect (c .Create (ctx , trial )).NotTo (gomega .HaveOccurred ())
342
+
343
+ // Expect that Trial status is succeeded with "false" status and "metrics unavailable" reason.
344
+ // Metrics are unavailable because GetTrialObservationLog returns "unavailable".
345
+ g .Eventually (func (g gomega.Gomega ) {
346
+ g .Expect (c .Get (ctx , trialKey , trial )).Should (gomega .Succeed ())
347
+ g .Expect (trial .IsMetricsUnavailable ()).Should (gomega .BeTrue ())
348
+ g .Expect (trial .Status .Observation .Metrics ).ShouldNot (gomega .HaveLen (0 ))
349
+ g .Expect (trial .Status .Observation .Metrics [0 ]).Should (gomega .BeComparableTo (commonv1beta1.Metric {
350
+ Name : objectiveMetric ,
351
+ Min : consts .UnavailableMetricValue ,
352
+ Max : consts .UnavailableMetricValue ,
353
+ Latest : consts .UnavailableMetricValue ,
354
+ }))
355
+ }, timeout ).Should (gomega .Succeed ())
315
356
316
357
// Delete the Trial
317
358
g .Expect (c .Delete (ctx , trial )).NotTo (gomega .HaveOccurred ())
@@ -386,7 +427,7 @@ func TestGetObjectiveMetricValue(t *testing.T) {
386
427
g .Expect (err ).To (gomega .HaveOccurred ())
387
428
}
388
429
389
- func newFakeTrialBatchJob () * trialsv1beta1.Trial {
430
+ func newFakeTrialBatchJob (mcType commonv1beta1. CollectorKind , trialName string ) * trialsv1beta1.Trial {
390
431
primaryContainer := "training-container"
391
432
392
433
job := & batchv1.Job {
@@ -429,8 +470,13 @@ func newFakeTrialBatchJob() *trialsv1beta1.Trial {
429
470
},
430
471
Spec : trialsv1beta1.TrialSpec {
431
472
PrimaryContainerName : primaryContainer ,
432
- SuccessCondition : experimentsv1beta1 .DefaultJobSuccessCondition ,
433
- FailureCondition : experimentsv1beta1 .DefaultJobFailureCondition ,
473
+ MetricsCollector : commonv1beta1.MetricsCollectorSpec {
474
+ Collector : & commonv1beta1.CollectorSpec {
475
+ Kind : mcType ,
476
+ },
477
+ },
478
+ SuccessCondition : experimentsv1beta1 .DefaultJobSuccessCondition ,
479
+ FailureCondition : experimentsv1beta1 .DefaultJobFailureCondition ,
434
480
Objective : & commonv1beta1.ObjectiveSpec {
435
481
ObjectiveMetricName : objectiveMetric ,
436
482
MetricStrategies : []commonv1beta1.MetricStrategy {
0 commit comments