Skip to content

Commit

Permalink
LabelBot NeedSync needs to check if model is being trained.
Browse files Browse the repository at this point in the history
* NeedsSync needs to check whether there is a model being trained or
  if there is a dataset being imported. Otherwise we end up launching
  multiple overlapping jobs because it takes a long time for the model
  to train. During which time the Tekton job will have finished.

* Related to kubeflow#178
  • Loading branch information
Jeremy Lewi committed Oct 4, 2020
1 parent 3697618 commit a1dacb5
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 20 deletions.
4 changes: 2 additions & 2 deletions Label_Microservice/auto-update/base/kustomization.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,6 @@ resources:
- deployment.yaml
- service-account.yaml
images:
- digest: sha256:32ecb5f829796b37fb5b5ddc2c3d98c693a81728b285fa90756f2b4b8f1075ae
- digest: sha256:2b7e759153baacf59514c98e50cb01eec0fc7aa2d7e2b271ac0a3c5cc516a74d
name: gcr.io/issue-label-bot-dev/labelbot-diff
newName: gcr.io/issue-label-bot-dev/labelbot-diff:874d2f5-dirty
newName: gcr.io/issue-label-bot-dev/labelbot-diff:3697618-dirty
14 changes: 8 additions & 6 deletions Label_Microservice/deployment/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,20 +22,22 @@ Deploying it
skaffold build
```

1. Edit the image
1. Update the image

```
cd deployment/overlays/prod
kustomize edit set image gcr.io/issue-label-bot-dev/bot-worker=gcr.io/issue-label-bot-dev/bot-worker:${TAG}@${SHA}
cd ..
make update-diff-image
```

1. Create the deployment
1. Hydrate the GitOps manifests

```
cd Label_Microservice/deployment/overlays/prod
kustomize build | kubectl apply -f -
cd ..
make hydrate-prod
```

1. Commit and push the manifests

## Staging/Dev

There is a staging/dev instance running in a different namespace
32 changes: 30 additions & 2 deletions Label_Microservice/go/cmd/automl/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ func init() {
rootCmd.AddCommand(serveCmd)
rootCmd.AddCommand(getCmd)
rootCmd.AddCommand(labelCmd)
rootCmd.AddCommand(isTrainCmd)

rootCmd.PersistentFlags().IntVarP(&Verbose, "verbose", "v", int(log.InfoLevel), "verbose output")

labelCmd.AddCommand(labelModelCmd)

Expand All @@ -46,6 +49,11 @@ func init() {
getCmd.Flags().StringVarP(&getOptions.location, "location", "", "us-central1", "Location to search for models")
getCmd.Flags().StringVarP(&getOptions.outputFile, "output", "", "", "(Optional) If supplied write the evaluation scores to this file in csv format.")
getCmd.MarkFlagRequired("model")


isTrainCmd.Flags().StringVarP(&getOptions.name, "name", "", "", "The model to get.")
isTrainCmd.Flags().StringVarP(&getOptions.project, "project", "", "issue-label-bot-dev", "Project to check")
isTrainCmd.Flags().StringVarP(&getOptions.location, "location", "", "us-central1", "Location to search for models")
}

type cliOptions struct {
Expand All @@ -66,6 +74,8 @@ type getCmdOptions struct {
}

var (
Verbose int

options = cliOptions{}
getOptions = getCmdOptions{}
rootCmd = &cobra.Command{
Expand All @@ -78,6 +88,7 @@ var (
Short: "Start webserver.",
Long: `starts the controller`,
Run: func(cmd *cobra.Command, args []string) {
log.SetLevel(log.Level(Verbose))
router := mux.NewRouter().StrictSlash(true)

interval, err := time.ParseDuration(options.retrainInterval)
Expand Down Expand Up @@ -110,7 +121,7 @@ var (
Short: "Get the specified model.",
Long: `Get the specified model`,
Run: func(cmd *cobra.Command, args []string) {

log.SetLevel(log.Level(Verbose))
name := fmt.Sprintf("projects/%v/locations/%v/models/%v", getOptions.project, getOptions.location, getOptions.name)
model, err := automl.GetModel(name)

Expand All @@ -131,7 +142,7 @@ var (
e, err := yaml.Marshal(evaluation)

if err != nil {
log.Fatalf("Error marshiling the evaluation to yaml %v; error: %v", getOptions.name, err)
log.Fatalf("Error marshaling the evaluation to yaml %v; error: %v", getOptions.name, err)
}

fmt.Printf(string(e) + "\n")
Expand All @@ -148,6 +159,7 @@ var (
Short: "Label the specified model.",
Long: `Label the specified model`,
Run: func(cmd *cobra.Command, args []string) {
log.SetLevel(log.Level(Verbose))
if len(args) < 2 {
log.Fatalf("Error usage is label models <model> label1=value1 label2=value2")
}
Expand All @@ -174,6 +186,22 @@ var (

},
}

isTrainCmd = &cobra.Command{
Use: "isTraining",
Short: "Check if a model is being trained.",
Run: func(cmd *cobra.Command, args []string) {
log.SetLevel(log.Level(Verbose))
isTraining, err := automl.IsTraining(getOptions.project, getOptions.location)

if err != nil {
log.Fatalf("Error checking if model %v is being trained; error: %v", getOptions.name, err)
}

fmt.Printf("is training: %v", isTraining)
},
}

)

func main() {
Expand Down
99 changes: 91 additions & 8 deletions Label_Microservice/go/cmd/automl/pkg/automl/automl.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ import (
"context"
"encoding/csv"
"fmt"
"github.com/go-yaml/yaml"
"github.com/golang/protobuf/proto"
"github.com/pkg/errors"
log "github.com/sirupsen/logrus"
"google.golang.org/api/iterator"
automlpb "google.golang.org/genproto/googleapis/cloud/automl/v1"
lropb "google.golang.org/genproto/googleapis/longrunning"
"google.golang.org/genproto/protobuf/field_mask"
"os"
)
Expand Down Expand Up @@ -102,9 +104,6 @@ func GetLatestDeployed(projectID string, location string, modelName string) (*au
// TODO(jlewi): We should really filter on labels; they don't appear to show up in the UI but they
// are in the API: https://cloud.google.com/automl/docs/reference/rest/v1/projects.locations.models#Model
func GetLatestTrained(projectID string, location string, modelName string) (*automlpb.Model, error) {
// projectID := "my-project-id"
// location := "us-central1"

ctx := context.Background()
client, err := automl.NewClient(ctx)
if err != nil {
Expand Down Expand Up @@ -193,11 +192,6 @@ func GetModelEvaluation(name string, outputFile string) (*automlpb.ModelEvaluati
//log.Infof("Evaluation: %+v, %+v", e, m.GetAuRoc())
}

//
//if err != nil {
// return nil, err
//}

return nil, nil
}

Expand Down Expand Up @@ -240,3 +234,92 @@ func LabelModel(name string, labels map[string]string) error {

return err
}

func toString(v interface{}) string {
output, err := yaml.Marshal(v)

if err != nil {
return err.Error()
}

return string(output)

}
// IsTraining determines if there is an import job or model training currently in progress
//
// TODO(jlewi): We should really only look at models and datasets matching some labels or other criterion.
// Its not clear how to link LRO's for import jobs to the corresponding dataset. So right now
// we just check if there are any operations still in progress.
func IsTraining(projectID string, location string) (bool, error) {
ctx := context.Background()
client, err := automl.NewClient(ctx)
if err != nil {
return false, fmt.Errorf("NewClient: %v", err)
}
defer client.Close()

req := &lropb.ListOperationsRequest{
Name: fmt.Sprintf("projects/%s/locations/%s", projectID, location),
}

it := client.LROClient.ListOperations(ctx, req)

for {
op, err := it.Next()
if err == iterator.Done {
break
}
if err != nil {
return false, fmt.Errorf("ListOperations.Next: %v", err)
}

if op.GetDone() {
continue
}

// Since the OP is still running check if its an ImportJob or model training.
opMeta := &automlpb.OperationMetadata{}

err = op.GetMetadata().UnmarshalTo(opMeta)
if err != nil {
log.Errorf("Could not unmarshal metadata for: %v; error: %v", op.GetName(), err)
}

// Creating a dataset is not actually a long running operation; when we create a dataset we are just
// defining the name essentionally.
if opMeta.GetCreateDatasetDetails() != nil {
details := opMeta.GetCreateDatasetDetails()

if details != nil {
log.Infof("Found running op dataset pp: %v\n%v",op.Name, toString(details))
}
return true, nil
}

// ImportData is a long running operation can take hours
// TODO(jlewi): How do we associate a running ImportData with its dataset? Neither Metadata, nor details appears
// to contain the dataset id.
if opMeta.GetImportDataDetails() != nil {
log.Infof("ImportData Op: %v; metadata:\n%v", op.Name, toString(opMeta))
details := opMeta.GetImportDataDetails()

if details != nil {
log.Infof("ImportDataDetails Op: %v; Details:\n%v",op.Name, toString(details))
}

return true, nil
}

if opMeta.GetCreateModelDetails() != nil {
details := opMeta.GetCreateModelDetails()

if details != nil {
log.Infof("Create Model Op: %v; Details:\n%v",op.Name, toString(details))
}

return true, nil
}
}

return false, nil
}
12 changes: 12 additions & 0 deletions Label_Microservice/go/cmd/automl/pkg/server/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,18 @@ func (s *Server) NeedsSync(w http.ResponseWriter, r *http.Request) {
}

getErr := func() error {
isTraining, err := automl.IsTraining(s.Project, s.Location)

if err != nil {
appendError(response, fmt.Sprintf("Error checking if model is being trained; %v", err))
return err
}

if isTraining {
log.Infof("There is model currently being trained; no sync needed.")
return nil
}

latest, err := automl.GetLatestDeployed(s.Project, s.Location, s.Name)

if err != nil {
Expand Down
2 changes: 1 addition & 1 deletion Label_Microservice/labelbot-diff.image.json
Original file line number Diff line number Diff line change
@@ -1 +1 @@
{"builds":[{"imageName":"gcr.io/issue-label-bot-dev/labelbot-diff","tag":"gcr.io/issue-label-bot-dev/labelbot-diff:874d2f5-dirty@sha256:32ecb5f829796b37fb5b5ddc2c3d98c693a81728b285fa90756f2b4b8f1075ae"}]}
{"builds":[{"imageName":"gcr.io/issue-label-bot-dev/labelbot-diff","tag":"gcr.io/issue-label-bot-dev/labelbot-diff:3697618-dirty@sha256:2b7e759153baacf59514c98e50cb01eec0fc7aa2d7e2b271ac0a3c5cc516a74d"}]}
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
- --kptFile=/src/code-intelligence.git/Label_Microservice/deployment/Kptfile
- --port=8080
- --retrainInterval=24h
image: gcr.io/issue-label-bot-dev/labelbot-diff:874d2f5-dirty@sha256:32ecb5f829796b37fb5b5ddc2c3d98c693a81728b285fa90756f2b4b8f1075ae
image: gcr.io/issue-label-bot-dev/labelbot-diff:3697618-dirty@sha256:2b7e759153baacf59514c98e50cb01eec0fc7aa2d7e2b271ac0a3c5cc516a74d
name: diff
ports:
- containerPort: 8080
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
apiVersion: automl.cloudai.kubeflow.org/v1alpha1
kind: ModelSync
metadata:
name: labelbot-train
namespace: label-bot-prod
spec:
failedPipelineRunsHistoryLimit: 10
needsSyncUrl: http://labelbot-diff.label-bot-prod/needsTrain
pipelineRunTemplate:
spec:
params:
- name: notebook-path
value: Label_Microservice/notebooks/automl.ipynb
- name: requirements
value: Label_Microservice/requirements.train.txt
- name: output
value: gs://issue-label-bot-dev_public/label-bot/training/runs
pipelineRef:
name: run-notebook
resources:
- name: notebook-repo
resourceSpec:
params:
- name: url
value: https://github.com/kubeflow/code-intelligence.git
- name: revision
value: master
type: git
serviceAccountName: default-editor
successfulPipelineRunsHistoryLimit: 10

0 comments on commit a1dacb5

Please sign in to comment.