Skip to content

Commit

Permalink
Improved status updates from trainer (#44)
Browse files Browse the repository at this point in the history
Communication changes:
- No longer send redundant fields (like the hyperparameters, ...) with
the trainer updates (they are sent with the training updates)
- Change type of hyperparameters to a dict
- Add fields `trainer_name` and `best_epoch` to DC TrainingOut - used to
sync the training

Internal changes:
- Remove the internal DC `TrainingData` and move its fields
(image_data, categories, skipped_image_count, hyperparameter) to the
`Training` DC. The class added unnecessary complexity!

---------

Co-authored-by: Niklas Neugebauer <[email protected]>
  • Loading branch information
denniswittich and NiklasNeugebauer authored Nov 25, 2024
1 parent a0c6326 commit b382199
Show file tree
Hide file tree
Showing 29 changed files with 315 additions and 277 deletions.
36 changes: 30 additions & 6 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,26 @@ jobs:
poetry config virtualenvs.create false --local
poetry install
- name: test_general
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/general -v
pytest "learning_loop_node/tests/general" -v
- name: test_detector
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/detector -v
pytest learning_loop_node/tests/detector -v
- name: test_mock_detector
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest mock_detector -v
pytest mock_detector -v
pytest_3_11:
needs:
Expand Down Expand Up @@ -62,14 +74,26 @@ jobs:
poetry config virtualenvs.create false --local
poetry install
- name: test_annotator
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/annotator -v
pytest learning_loop_node/tests/annotator -v
- name: test_trainer
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest learning_loop_node/tests/trainer -v
pytest learning_loop_node/tests/trainer -v
- name: test_mock_trainer
env:
LOOP_HOST: "preview.learning-loop.ai"
LOOP_USERNAME: "admin"
LOOP_PASSWORD: ${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }}
run: |
LOOP_HOST=preview.learning-loop.ai LOOP_USERNAME=admin LOOP_PASSWORD=${{ secrets.LEARNING_LOOP_ADMIN_PASSWORD }} pytest mock_trainer -v
pytest mock_trainer -v
slack:
needs:
Expand Down
6 changes: 3 additions & 3 deletions learning_loop_node/data_classes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
ModelInformation, NodeState, NodeStatus)
from .image_metadata import ImageMetadata
from .socket_response import SocketResponse
from .training import (Errors, Hyperparameter, Model, PretrainedModel, TrainerState, Training, TrainingData,
TrainingError, TrainingOut, TrainingStateData, TrainingStatus)
from .training import (Errors, PretrainedModel, TrainerState, Training, TrainingError, TrainingOut, TrainingStateData,
TrainingStatus)

__all__ = [
'AnnotationData', 'AnnotationEventType', 'SegmentationAnnotation', 'ToolOutput', 'UserInput',
Expand All @@ -15,6 +15,6 @@
'AnnotationNodeStatus', 'Category', 'CategoryType', 'Context', 'DetectionStatus', 'ErrorConfiguration',
'ModelInformation', 'NodeState', 'NodeStatus',
'SocketResponse',
'Errors', 'Hyperparameter', 'Model', 'PretrainedModel', 'TrainerState', 'Training', 'TrainingData',
'Errors', 'PretrainedModel', 'TrainerState', 'Training',
'TrainingError', 'TrainingOut', 'TrainingStateData', 'TrainingStatus',
]
2 changes: 1 addition & 1 deletion learning_loop_node/data_classes/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def load_from_disk(model_root_path: str) -> Optional['ModelInformation']:
"""
model_info_file_path = f'{model_root_path}/model.json'
if not os.path.exists(model_info_file_path):
logging.warning(f"could not find model information file '{model_info_file_path}'")
logging.warning('could not find model information file %s', model_info_file_path)
return None
with open(model_info_file_path, 'r') as f:
try:
Expand Down
129 changes: 62 additions & 67 deletions learning_loop_node/data_classes/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,46 +4,16 @@
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional
from typing import Any, Dict, List, Optional
from uuid import uuid4

from ..helpers.misc import create_image_folder, create_training_folder
# pylint: disable=no-name-in-module
from .general import Category, Context

KWONLY_SLOTS = {'kw_only': True, 'slots': True} if sys.version_info >= (3, 10) else {}


@dataclass(**KWONLY_SLOTS)
class Hyperparameter():
resolution: int
flip_rl: bool
flip_ud: bool

@staticmethod
def from_data(data: Dict):
return Hyperparameter(
resolution=data['resolution'],
flip_rl=data.get('flip_rl', False),
flip_ud=data.get('flip_ud', False)
)


@dataclass(**KWONLY_SLOTS)
class TrainingData():
image_data: List[Dict] = field(default_factory=list)
skipped_image_count: Optional[int] = 0
categories: List[Category] = field(default_factory=list)
hyperparameter: Optional[Hyperparameter] = None

def image_ids(self):
return [image['id'] for image in self.image_data]

def train_image_count(self):
return len([image for image in self.image_data if image['set'] == 'train'])

def test_image_count(self):
return len([image for image in self.image_data if image['set'] == 'test'])


@dataclass(**KWONLY_SLOTS)
class PretrainedModel():
name: str
Expand Down Expand Up @@ -75,26 +45,21 @@ class TrainerState(str, Enum):
class TrainingStatus():
id: str # NOTE this must not be changed, but tests wont detect a change -> update tests!
name: str

state: Optional[str]
errors: Optional[Dict]
uptime: Optional[float]
errors: Optional[Dict[str, Any]]
progress: Optional[float]

train_image_count: Optional[int] = None
test_image_count: Optional[int] = None
skipped_image_count: Optional[int] = None
pretrained_models: List[PretrainedModel] = field(default_factory=list)
hyperparameters: Optional[Dict] = None
architecture: Optional[str] = None
context: Optional[Context] = None

def short_str(self) -> str:
prgr = f'{self.progress * 100:.0f}%' if self.progress else ''
trtesk = f'{self.train_image_count}/{self.test_image_count}/{self.skipped_image_count}' if self.train_image_count else 'n.a.'
cntxt = f'{self.context.organization}/{self.context.project}' if self.context else ''
hyps = f'({self.hyperparameters})' if self.hyperparameters else ''
arch = f'.{self.architecture} - ' if self.architecture else ''
return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). Tr/Ts/Tsk: {trtesk} {cntxt}{arch}{hyps}]'
return f'[{str(self.state).rsplit(".", maxsplit=1)[-1]} {prgr}. {self.name}({self.id}). {cntxt}{arch}]'


@dataclass(**KWONLY_SLOTS)
Expand All @@ -105,53 +70,83 @@ class Training():
project_folder: str # f'{GLOBALS.data_folder}/{context.organization}/{context.project}'
images_folder: str # f'{project_folder}/images'
training_folder: str # f'{project_folder}/trainings/{trainings_id}'

categories: List[Category]
hyperparameters: dict

training_number: int
training_state: str
model_variant: str # from `provided_pretrained_models->name`

start_time: float = field(default_factory=time.time)

# model uuid to download (to continue training) | is not a uuid when training from scratch (blank or pt-name from provided_pretrained_models->name)
base_model_uuid_or_name: Optional[str] = None
base_model_uuid: Optional[str] = None # model uuid to continue training (is loaded from loop)

data: Optional[TrainingData] = None
training_number: Optional[int] = None
training_state: Optional[str] = None
model_uuid_for_detecting: Optional[str] = None
hyperparameters: Optional[Dict] = None
# NOTE: these are set later after the model has been uploaded
image_data: Optional[List[dict]] = None
skipped_image_count: Optional[int] = None
model_uuid_for_detecting: Optional[str] = None # Model uuid to load from the loop after training and upload

@property
def training_folder_path(self) -> Path:
return Path(self.training_folder)

def set_values_from_data(self, data: Dict) -> None:
self.data = TrainingData(categories=Category.from_list(data['categories']))
self.data.hyperparameter = Hyperparameter.from_data(data=data)
self.training_number = data['training_number']
self.base_model_uuid_or_name = data['id']
self.training_state = TrainerState.Initialized
@classmethod
def generate_training(cls, project_folder: str, context: Context, data: Dict[str, Any]) -> 'Training':
if 'hyperparameters' not in data or not isinstance(data['hyperparameters'], dict):
raise ValueError('hyperparameters missing or not a dict')
if 'categories' not in data or not isinstance(data['categories'], list):
raise ValueError('categories missing or not a list')
if 'training_number' not in data or not isinstance(data['training_number'], int):
raise ValueError('training_number missing or not an int')
if 'model_variant' not in data or not isinstance(data['model_variant'], str):
raise ValueError('model_variant missing or not a str')

training_uuid = str(uuid4())

return Training(
id=training_uuid,
context=context,
project_folder=project_folder,
images_folder=create_image_folder(project_folder),
training_folder=create_training_folder(project_folder, training_uuid),
categories=Category.from_list(data['categories']),
hyperparameters=data['hyperparameters'],
training_number=data['training_number'],
base_model_uuid=data.get('base_model_uuid', None),
model_variant=data['model_variant'],
training_state=TrainerState.Initialized.value
)

def image_ids(self) -> List[str]:
assert self.image_data is not None, 'Image data not set'
return [image['id'] for image in self.image_data]

def train_image_count(self) -> int:
assert self.image_data is not None, 'Image data not set'
return len([image for image in self.image_data if image['set'] == 'train'])

def test_image_count(self) -> int:
assert self.image_data is not None, 'Image data not set'
return len([image for image in self.image_data if image['set'] == 'test'])


@dataclass(**KWONLY_SLOTS)
class TrainingOut():
trainer_id: str
trainer_name: Optional[str] = None
confusion_matrix: Optional[Dict] = None # This is actually just class-wise metrics
train_image_count: Optional[int] = None
test_image_count: Optional[int] = None
trainer_id: Optional[str] = None
hyperparameters: Optional[Dict] = None
hyperparameters: Optional[Dict[str, Any]] = None
best_epoch: Optional[int] = None


@dataclass(**KWONLY_SLOTS)
class TrainingStateData():
confusion_matrix: Dict = field(default_factory=dict)
meta_information: Dict = field(default_factory=dict)


@dataclass(**KWONLY_SLOTS)
class Model():
uuid: str
confusion_matrix: Optional[Dict] = None
parent_id: Optional[str] = None
train_image_count: Optional[int] = None
test_image_count: Optional[int] = None
trainer_id: Optional[str] = None
hyperparameters: Optional[str] = None
epoch: Optional[int] = None


class Errors():
Expand Down
20 changes: 11 additions & 9 deletions learning_loop_node/data_exchanger.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,15 @@ def context(self) -> Context:

async def fetch_image_uuids(self, query_params: Optional[str] = '') -> List[str]:
"""Fetch image uuids from the learning loop data endpoint."""
logging.info(f'Fetching image uuids for {self.context.organization}/{self.context.project}..')
logging.info('Fetching image uuids for %s/%s..', self.context.organization, self.context.project)

response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/data?{query_params}')
assert response.status_code == 200, response
return (response.json())['image_ids']

async def download_images_data(self, image_uuids: List[str], chunk_size: int = 100) -> List[Dict]:
"""Download image annotations, tags, set and other information for the given image uuids."""
logging.info(f'Fetching annotations, tags, sets, etc. for {len(image_uuids)} images..')
logging.info('Fetching annotations, tags, sets, etc. for %s images..', len(image_uuids))

num_image_ids = len(image_uuids)
if num_image_ids == 0:
Expand All @@ -84,15 +84,15 @@ async def download_images_data(self, image_uuids: List[str], chunk_size: int = 1
chunk_ids = image_uuids[i:i+chunk_size]
response = await self.loop_communicator.get(f'/{self.context.organization}/projects/{self.context.project}/images?ids={",".join(chunk_ids)}')
if response.status_code != 200:
logging.error(f'Error {response.status_code} during downloading image data. Continue with next batch..')
logging.error('Error %s during downloading image data. Continue with next batch..', response.status_code)
continue
images_data += response.json()['images']

return images_data

async def download_images(self, image_uuids: List[str], image_folder: str, chunk_size: int = 10) -> None:
"""Downloads images (actual image data). Will skip existing images"""
logging.info(f'Downloading {len(image_uuids)} images (actual image data).. skipping existing images.')
logging.info('Downloading %s images (actual image data).. skipping existing images.', len(image_uuids))
if not image_uuids:
return

Expand All @@ -106,7 +106,7 @@ async def download_images(self, image_uuids: List[str], image_folder: str, chunk
self.progress = 1.0
return

logging.info(f'Downloading {num_new_image_ids} new images to {image_folder}..')
logging.info('Downloading %s new images to %s..', num_new_image_ids, image_folder)
os.makedirs(image_folder, exist_ok=True)

progress_factor = 0.5 / num_new_image_ids # second 50% of progress is for downloading images
Expand All @@ -128,7 +128,7 @@ async def _download_one_image(self, path: str, image_id: str, image_folder: str)
await asyncio.sleep(1)
response = await self.loop_communicator.get(path)
if response.status_code != HTTPStatus.OK:
logging.error(f'bad status code {response.status_code} for {path}. Details: {response.text}')
logging.error('bad status code %s for %s. Details: %s', response.status_code, path, response.text)
return
filename = f'{image_folder}/{image_id}.jpg'
async with aiofiles.open(filename, 'wb') as f:
Expand Down Expand Up @@ -171,7 +171,7 @@ async def download_model(self, target_folder: str, context: Context, model_uuid:
created_files.append(new_file)

shutil.rmtree(tmp_path, ignore_errors=True)
logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.')
logging.info('Downloaded model %s(%s) to %s.', model_uuid, model_format, target_folder)
return created_files

async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> str:
Expand All @@ -182,10 +182,12 @@ async def upload_model_get_uuid(self, context: Context, files: List[str], traini
"""
response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
if response.status_code != 200:
logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}')
logging.error('Could not upload model for training %s, format %s: %s',
training_number, mformat, response.text)
raise CriticalError(
f'Could not upload model for training {training_number}, format {mformat}: {response.text}')

uploaded_model = response.json()
logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}')
logging.info('Uploaded model for training %s, format %s. Response is: %s',
training_number, mformat, uploaded_model)
return uploaded_model['id']
4 changes: 2 additions & 2 deletions learning_loop_node/helpers/environment_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,14 @@ def read_from_env(possible_names: List[str], ignore_errors: bool = True) -> Opti
# Possible error: no values are set
if not values:
if ignore_errors:
logging.warning(f'no environment variable set for {possible_names}')
logging.warning('no environment variable set for %s', possible_names)
return None
raise ValueError(f'no environment variable set for {possible_names}')

# Possible error: multiple values are not None and not equal
if len(values) > 1 and len(set(values)) > 1:
if ignore_errors:
logging.warning(f'different environment variables set for {possible_names}: {values}')
logging.warning('different environment variables set for %s: %s', possible_names, values)
return None
raise ValueError(f'different environment variables set for {possible_names}: {values}')

Expand Down
5 changes: 4 additions & 1 deletion learning_loop_node/helpers/log_conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

LOGGING_CONF = {
'version': 1,
'disable_existing_loggers': True, # to make sure this config is used
'disable_existing_loggers': False, # to make sure this config is used
'formatters': {
'default': {
'format': '%(asctime)s,%(msecs)01d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s',
Expand Down Expand Up @@ -34,3 +34,6 @@

def init():
logging.config.dictConfig(LOGGING_CONF)


init()
Loading

0 comments on commit b382199

Please sign in to comment.