From b80eb33d4fb6d394de173d7c2fdf1d2a294f4e40 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Wed, 30 Oct 2019 10:46:34 -0500 Subject: [PATCH 01/32] improve error handling, make message and type separate --- pyclowder/collections.py | 4 +- pyclowder/connectors.py | 111 ++++++++++++----------- pyclowder/datasets.py | 6 +- pyclowder/files.py | 9 +- pyclowder/sections.py | 6 +- pyclowder/utils.py | 6 +- sample-extractors/wordcount/wordcount.py | 14 ++- 7 files changed, 80 insertions(+), 76 deletions(-) diff --git a/pyclowder/collections.py b/pyclowder/collections.py index c1c34e7..8795e57 100644 --- a/pyclowder/collections.py +++ b/pyclowder/collections.py @@ -8,7 +8,6 @@ import requests from pyclowder.client import ClowderClient -from pyclowder.utils import StatusMessage def create_empty(connector, host, key, collectionname, description, parentid=None, spaceid=None): @@ -121,8 +120,7 @@ def upload_preview(connector, host, key, collectionid, previewfile, previewmetad section this preview should be associated with. """ - connector.status_update(StatusMessage.processing, {"type": "collection", "id": collectionid}, - "Uploading collection preview.") + connector.message_process({"type": "collection", "id": collectionid}, "Uploading collection preview.") logger = logging.getLogger(__name__) headers = {'Content-Type': 'application/json'} diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index e0c3af7..80df4c2 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -230,7 +230,6 @@ def _build_resource(self, body, host, secret_key): "type": "dataset", "id": datasetid } - self.status_update(pyclowder.utils.StatusMessage.error, resource, msg) self.message_error(resource) return None @@ -392,7 +391,7 @@ def _process_message(self, body): self.register_extractor("%s?key=%s" % (url, secret_key)) # tell everybody we are starting to process the file - self.status_update(pyclowder.utils.StatusMessage.start, resource, "Started processing") + self.status_update(pyclowder.utils.StatusMessage.start.value, resource, "Started processing.") # checks whether to process the file in this message or not # pylint: disable=too-many-nested-blocks @@ -456,41 +455,37 @@ def _process_message(self, body): logger.exception("Error removing temporary dataset directory") else: - self.status_update(pyclowder.utils.StatusMessage.processing, resource, "Skipped in check_message") + self.status_update(pyclowder.utils.StatusMessage.skip.value, resource, "Skipped in check_message") self.message_ok(resource) except SystemExit as exc: - status = "sys.exit : " + str(exc) - logger.exception("[%s] %s", resource['id'], status) - self.status_update(pyclowder.utils.StatusMessage.error, resource, status) - self.message_resubmit(resource, retry_count) + message = str.format("sys.exit: {}", str(exc)) + logger.exception("[%s] %s", resource['id'], message) + self.message_resubmit(resource, retry_count, message) raise except KeyboardInterrupt: - status = "keyboard interrupt" - logger.exception("[%s] %s", resource['id'], status) - self.status_update(pyclowder.utils.StatusMessage.error, resource, status) - self.message_resubmit(resource, retry_count) + message = "keyboard interrupt" + logger.exception("[%s] %s", resource['id'], message) + self.message_resubmit(resource, retry_count, message) raise except GeneratorExit: - status = "generator exit" - logger.exception("[%s] %s", resource['id'], status) - self.status_update(pyclowder.utils.StatusMessage.error, resource, status) - self.message_resubmit(resource, retry_count) + 
message = "generator exit" + logger.exception("[%s] %s", resource['id'], message) + self.message_resubmit(resource, retry_count, message) raise except subprocess.CalledProcessError as exc: - status = str.format("Error processing [exit code={}]\n{}", exc.returncode, exc.output) - logger.exception("[%s] %s", resource['id'], status) - self.status_update(pyclowder.utils.StatusMessage.error, resource, status) - self.message_error(resource) + message = str.format("Error in subprocess [exit code={}]:\n{}", exc.returncode, exc.output) + logger.exception("[%s] %s", resource['id'], message) + self.message_error(resource, message) except Exception as exc: # pylint: disable=broad-except - status = "Error processing : " + str(exc) - logger.exception("[%s] %s", resource['id'], status) - self.status_update(pyclowder.utils.StatusMessage.error, resource, status) + message = str(exc) + logger.exception("[%s] %s", resource['id'], message) if retry_count < 10: - self.message_resubmit(resource, retry_count + 1) + message = "(#%s) %s" % (retry_count+1, message) + self.message_resubmit(resource, retry_count+1, message) else: - self.message_error(resource) + self.message_error(resource, message) def register_extractor(self, endpoints): """Register extractor info with Clowder. @@ -528,21 +523,23 @@ def status_update(self, status, resource, message): the instance know the progress of the extractor. Keyword arguments: - status - START | PROCESSING | DONE | ERROR + status - pyclowder.utils.StatusMessage value resource - descriptor object with {"type", "id"} fields message - contents of the status update """ logging.getLogger(__name__).info("[%s] : %s: %s", resource["id"], status, message) - def message_ok(self, resource): - self.status_update(pyclowder.utils.StatusMessage.done, resource, "Done processing") + def message_ok(self, resource, message="Done processing."): + self.status_update(pyclowder.utils.StatusMessage.done.value, resource, message) + + def message_error(self, resource, message="Error processing message."): + self.status_update(pyclowder.utils.StatusMessage.error.value, resource, message) - def message_error(self, resource): - self.status_update(pyclowder.utils.StatusMessage.error, resource, "Error processing message") + def message_resubmit(self, resource, retry_count, message="Resubmitting message."): + self.status_update(pyclowder.utils.StatusMessage.retry.value, resource, message) - def message_resubmit(self, resource, retry_count): - self.status_update(pyclowder.utils.StatusMessage.processing, resource, "Resubmitting message (attempt #%s)" - % retry_count) + def message_process(self, resource, message): + self.status_update(pyclowder.utils.StatusMessage.processing.value, resource, message) def get(self, url, params=None, raise_status=True, **kwargs): """ @@ -871,19 +868,22 @@ def process_messages(self, channel, rabbitmq_queue): with self.lock: msg = self.messages.pop(0) + # PROCESSING - Standard update message during extractor processing if msg["type"] == 'status': if self.header.reply_to: properties = pika.BasicProperties(delivery_mode=2, correlation_id=self.header.correlation_id) channel.basic_publish(exchange='', routing_key=self.header.reply_to, properties=properties, - body=json.dumps(msg['status'])) + body=json.dumps(msg['payload'])) + # DONE - Extractor finished without error elif msg["type"] == 'ok': channel.basic_ack(self.method.delivery_tag) with self.lock: self.finished = True + # ERROR - Extractor encountered error and message goes to error queue elif msg["type"] == 'error': properties 
= pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to) channel.basic_publish(exchange='', @@ -894,18 +894,18 @@ def process_messages(self, channel, rabbitmq_queue): with self.lock: self.finished = True + # RESUBMITTING - Extractor encountered error and message is resubmitted to same queue elif msg["type"] == 'resubmit': - retry_count = msg['retry_count'] - queue = rabbitmq_queue - properties = pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to) jbody = json.loads(self.body) - jbody['retry_count'] = retry_count + jbody['retry_count'] = msg['retry_count'] if 'exchange' not in jbody and self.method.exchange: jbody['exchange'] = self.method.exchange - if 'routing_key' not in jbody and self.method.routing_key and self.method.routing_key != queue: + if 'routing_key' not in jbody and self.method.routing_key and self.method.routing_key != rabbitmq_queue: jbody['routing_key'] = self.method.routing_key + + properties = pika.BasicProperties(delivery_mode=2, reply_to=self.header.reply_to) channel.basic_publish(exchange='', - routing_key=queue, + routing_key=rabbitmq_queue, properties=properties, body=json.dumps(jbody)) channel.basic_ack(self.method.delivery_tag) @@ -917,30 +917,33 @@ def process_messages(self, channel, rabbitmq_queue): def status_update(self, status, resource, message): super(RabbitMQHandler, self).status_update(status, resource, message) - status_report = dict() - # TODO: Update this to check resource["type"] once Clowder better supports dataset events - status_report['file_id'] = resource["id"] - status_report['extractor_id'] = self.extractor_info['name'] - status_report['status'] = "%s: %s" % (status, message) - status_report['start'] = pyclowder.utils.iso8601time() with self.lock: + # TODO: Remove 'status' from payload later and read from message_type and message in Clowder 2.0 self.messages.append({"type": "status", - "status": status_report, "resource": resource, - "message": message}) - - def message_ok(self, resource): - super(RabbitMQHandler, self).message_ok(resource) + "payload": { + "file_id": resource["id"], + "extractor_id": self.extractor_info['name'], + "status": "%s: %s" % (status, message), + "start": pyclowder.utils.iso8601time(), + "message_type": status, + "message": message + }}) + + def message_ok(self, resource, message="Done processing."): + super(RabbitMQHandler, self).message_ok(resource, message) with self.lock: self.messages.append({"type": "ok"}) - def message_error(self, resource): - super(RabbitMQHandler, self).message_error(resource) + def message_error(self, resource, message="Error processing message."): + super(RabbitMQHandler, self).message_error(resource, message) with self.lock: self.messages.append({"type": "error"}) - def message_resubmit(self, resource, retry_count): - super(RabbitMQHandler, self).message_resubmit(resource, retry_count) + def message_resubmit(self, resource, retry_count, message=None): + if message is None: + message = "(#%s)" % retry_count + super(RabbitMQHandler, self).message_resubmit(resource, retry_count, message) with self.lock: self.messages.append({"type": "resubmit", "retry_count": retry_count}) diff --git a/pyclowder/datasets.py b/pyclowder/datasets.py index cb04f93..c609c58 100644 --- a/pyclowder/datasets.py +++ b/pyclowder/datasets.py @@ -12,7 +12,6 @@ from pyclowder.client import ClowderClient from pyclowder.collections import get_datasets, get_child_collections, delete as delete_collection -from pyclowder.utils import StatusMessage def create_empty(connector, host, key, 
datasetname, description, parentid=None, spaceid=None): @@ -113,7 +112,7 @@ def download(connector, host, key, datasetid): datasetid -- the file that is currently being processed """ - connector.status_update(StatusMessage.processing, {"type": "dataset", "id": datasetid}, "Downloading dataset.") + connector.message_process({"type": "dataset", "id": datasetid}, "Downloading dataset.") # fetch dataset zipfile url = '%sapi/datasets/%s/download?key=%s' % (host, datasetid, key) @@ -268,8 +267,7 @@ def upload_metadata(connector, host, key, datasetid, metadata): metadata -- the metadata to be uploaded """ - connector.status_update(StatusMessage.processing, {"type": "dataset", "id": datasetid}, - "Uploading dataset metadata.") + connector.message_process({"type": "dataset", "id": datasetid}, "Uploading dataset metadata.") headers = {'Content-Type': 'application/json'} url = '%sapi/datasets/%s/metadata.jsonld?key=%s' % (host, datasetid, key) diff --git a/pyclowder/files.py b/pyclowder/files.py index 15ef634..1b5a9a5 100644 --- a/pyclowder/files.py +++ b/pyclowder/files.py @@ -14,7 +14,6 @@ from pyclowder.datasets import get_file_list from pyclowder.collections import get_datasets, get_child_collections -from pyclowder.utils import StatusMessage # Some sources of urllib3 support warning suppression, but not all try: @@ -38,7 +37,7 @@ def download(connector, host, key, fileid, intermediatefileid=None, ext=""): ext -- the file extension, the downloaded file will end with this extension """ - connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Downloading file.") + connector.message_process({"type": "file", "id": fileid}, "Downloading file.") # TODO: intermediateid doesn't really seem to be used here, can we remove entirely? if not intermediatefileid: @@ -180,7 +179,7 @@ def upload_metadata(connector, host, key, fileid, metadata): metadata -- the metadata to be uploaded """ - connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file metadata.") + connector.message_process({"type": "file", "id": fileid}, "Uploading file metadata.") headers = {'Content-Type': 'application/json'} url = '%sapi/files/%s/metadata.jsonld?key=%s' % (host, fileid, key) @@ -204,7 +203,7 @@ def upload_preview(connector, host, key, fileid, previewfile, previewmetadata=No file itself and this parameter can be ignored. E.g. 'application/vnd.clowder+custom+xml' """ - connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file preview.") + connector.message_process({"type": "file", "id": fileid}, "Uploading file preview.") logger = logging.getLogger(__name__) headers = {'Content-Type': 'application/json'} @@ -248,7 +247,7 @@ def upload_tags(connector, host, key, fileid, tags): tags -- the tags to be uploaded """ - connector.status_update(StatusMessage.processing, {"type": "file", "id": fileid}, "Uploading file tags.") + connector.message_process({"type": "file", "id": fileid}, "Uploading file tags.") headers = {'Content-Type': 'application/json'} url = '%sapi/files/%s/tags?key=%s' % (host, fileid, key) diff --git a/pyclowder/sections.py b/pyclowder/sections.py index ef6800a..737b5fa 100644 --- a/pyclowder/sections.py +++ b/pyclowder/sections.py @@ -8,8 +8,6 @@ import requests -from pyclowder.utils import StatusMessage - def upload(connector, host, key, sectiondata): """Upload section to Clowder. 
@@ -47,7 +45,7 @@ def upload_tags(connector, host, key, sectionid, tags): tags -- the tags to be uploaded """ - connector.status_update(StatusMessage.processing, {"type": "section", "id": sectionid}, "Uploading section tags.") + connector.message_process({"type": "section", "id": sectionid}, "Uploading section tags.") headers = {'Content-Type': 'application/json'} url = '%sapi/sections/%s/tags?key=%s' % (host, sectionid, key) @@ -67,7 +65,7 @@ def upload_description(connector, host, key, sectionid, description): description -- the description to be uploaded """ - connector.status_update(StatusMessage.processing, {"type": "section", "id": sectionid}, + connector.message_process({"type": "section", "id": sectionid}, "Uploading section description.") headers = {'Content-Type': 'application/json'} diff --git a/pyclowder/utils.py b/pyclowder/utils.py index f71ecb8..8839f84 100644 --- a/pyclowder/utils.py +++ b/pyclowder/utils.py @@ -44,10 +44,12 @@ class StatusMessage(Enum): full string will be STATUS: MESSAGE. """ - start = "START" + start = "STARTED" processing = "PROCESSING" - done = "DONE" + done = "SUCCEEDED" + skip = "SKIPPED" error = "ERROR" + retry = "RESUBMITTED" def iso8601time(): diff --git a/sample-extractors/wordcount/wordcount.py b/sample-extractors/wordcount/wordcount.py index 1f5055f..eb4693d 100755 --- a/sample-extractors/wordcount/wordcount.py +++ b/sample-extractors/wordcount/wordcount.py @@ -32,24 +32,30 @@ def process_message(self, connector, host, secret_key, resource, parameters): inputfile = resource["local_paths"][0] file_id = resource['id'] - # call actual program + # These process messages will appear in the Clowder UI under Extractions. + connector.message_process(resource, "Loading contents of file...") + + # Call actual program result = subprocess.check_output(['wc', inputfile], stderr=subprocess.STDOUT) result = result.decode('utf-8') (lines, words, characters, _) = result.split() - # store results as metadata + connector.message_process(resource, "Found %s lines and %s words..." % (lines, words)) + + # Store results as metadata result = { 'lines': lines, 'words': words, 'characters': characters } metadata = self.get_metadata(result, 'file', file_id, host) + + # Normal logs will appear in the extractor log, but NOT in the Clowder UI. logger.debug(metadata) - # upload metadata + # Upload metadata to original file pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata) - if __name__ == "__main__": extractor = WordCount() extractor.start() From b36580a7a1947ac0a873967c31cd9f613d6e275e Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Tue, 12 Nov 2019 09:38:10 -0600 Subject: [PATCH 02/32] begin adding support for datasets in simple extractor --- pyclowder/extractors.py | 115 +++++++++++++++--- sample-extractors/simple-extractor/Dockerfile | 4 +- .../simple-extractor/simple_extractor.py | 3 + 3 files changed, 103 insertions(+), 19 deletions(-) diff --git a/pyclowder/extractors.py b/pyclowder/extractors.py index 7d9bf8e..55f61b8 100644 --- a/pyclowder/extractors.py +++ b/pyclowder/extractors.py @@ -308,36 +308,99 @@ def __init__(self): self.logger = logging.getLogger('__main__') self.logger.setLevel(logging.INFO) + # TODO: Support check_message() in simple extractors + def process_message(self, connector, host, secret_key, resource, parameters): """ - Process a clowder message. This will download the file to local disk and call the - process_file to do the actual processing of the file. The resulting dict is then + Process a clowder message. 
This will download the file(s) to local disk and call + process_file or process_dataset to do the actual processing. The resulting dict is then parsed and based on the keys in the dict it will upload the results to the right location in clowder. """ - input_file = resource["local_paths"][0] - file_id = resource['id'] + if 'files' in resource: + type = 'dataset' + input_files = resource['local_paths'] + dataset_id = resource['id'] + + elif 'local_paths' in resource: + type = 'file' + input_file = resource['local_paths'][0] + file_id = resource['id'] + dataset_id = resource['parent']['id'] + else: + # TODO: Eventually support other messages such as metadata.added + type = 'unknown' - # call the actual function that processes the file - if file_id and input_file: + # call the actual function that processes the message + if type == 'file' and file_id and input_file: result = self.process_file(input_file) + elif type == 'dataset' and dataset_id and input_files: + result = self.process_dataset(input_files) else: result = dict() - # return information to clowder try: + # upload metadata to the processed file or dataset if 'metadata' in result.keys(): - metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host) self.logger.info("upload metadata") - self.logger.debug(metadata) - pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata) + if type == 'file': + metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host) + self.logger.debug(metadata) + pyclowder.files.upload_metadata(connector, host, secret_key, file_id, metadata) + elif type == 'dataset': + metadata = self.get_metadata(result.get('metadata'), 'dataset', dataset_id, host) + self.logger.debug(metadata) + pyclowder.datasets.upload_metadata(connector, host, secret_key, dataset_id, metadata) + else: + self.logger.error("unable to attach metadata to resource type: %s" % type) + + # upload previews to the processed file if 'previews' in result.keys(): self.logger.info("upload previews") - for preview in result['previews']: - if os.path.exists(str(preview)): - preview = {'file': preview} - self.logger.info("upload preview") - pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview)) + if type == 'file': + for preview in result['previews']: + if os.path.exists(str(preview)): + preview = {'file': preview} + self.logger.info("upload preview") + pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview)) + else: + # TODO: Add Clowder endpoint & pyclowder method to attach previews to datasets + self.logger.error("previews not currently supported for resource type: %s" % type) + + # upload output files to the processed file's parent dataset or processed dataset + if 'outputs' in result.keys(): + self.logger.info("upload output files") + if type == 'file' or type == 'dataset': + for output in result['outputs']: + if os.path.exists(str(output)): + pyclowder.files.upload_to_dataset(connector, host, secret_key, dataset_id, str(output)) + else: + self.logger.error("unable to upload outputs to resource type: %s" % type) + + if 'new_dataset' in result.keys(): + if type == 'dataset': + nds = result['new_dataset'] + if 'name' not in nds.keys(): + self.logger.error("new datasets require a name") + else: + description = nds['description'] if 'description' in nds.keys() else "" + new_dataset_id = pyclowder.datasets.create_empty(connector, host, secret_key, nds['name'], + description) + self.logger.info("created new dataset: %s" % new_dataset_id) + + if 
'metadata' in nds.keys(): + self.logger.info("upload metadata to new dataset") + metadata = self.get_metadata(nds.get('metadata'), 'dataset', new_dataset_id, host) + self.logger.debug(metadata) + pyclowder.datasets.upload_metadata(connector, host, secret_key, new_dataset_id, metadata) + + if 'outputs' in nds.keys(): + self.logger.info("upload output files to new dataset") + for output in nds['outputs']: + if os.path.exists(str(output)): + pyclowder.files.upload_to_dataset(connector, host, secret_key, new_dataset_id, + str(output)) + finally: self.cleanup_data(result) @@ -345,13 +408,31 @@ def process_file(self, input_file): """ This function will process the file and return a dict that contains the result. This dict can have the following keys: - - metadata: the metadata to be associated with the file - - previews: files on disk with the preview to be uploaded + - metadata: the metadata to be associated with the processed file + - previews: files on disk with the preview to be uploaded to the processed file + - outputs: files on disk to be added to processed file's parent :param input_file: the file to be processed. :return: the specially formatted dict. """ return dict() + def process_dataset(self, input_files): + """ + This function will process the file list and return a dict that contains the result. This + dict can have the following keys: + - metadata: the metadata to be associated with the processed dataset + - outputs: files on disk to be added to the dataset + - new_dataset: a dict describing a new dataset to be created for the outputs, with the following keys: + - name: the name of the new dataset to be created (including adding the outputs, + metadata and previews contained in new_dataset) + - description: description for the new dataset to be created + - metadata: (see above) + - outputs: (see above) + :param input_files: the files to be processed. + :return: the specially formatted dict. + """ + return dict() + def cleanup_data(self, result): """ Once the information is uploaded to clowder this function is called for cleanup. This diff --git a/sample-extractors/simple-extractor/Dockerfile b/sample-extractors/simple-extractor/Dockerfile index 83ab295..50c2107 100644 --- a/sample-extractors/simple-extractor/Dockerfile +++ b/sample-extractors/simple-extractor/Dockerfile @@ -1,11 +1,11 @@ -ARG PYCLOWDER_PYTHON="" -FROM clowder/pyclowder${PYCLOWDER_PYTHON}:latest ENV EXTRACTION_FUNC="" ENV EXTRACTION_MODULE="" COPY simple_extractor.py . 
+RUN apt-get update && apt-get install -y build-essential python3-dev + # install any packages ONBUILD COPY packages.* Dockerfile /home/clowder/ ONBUILD RUN if [ -e packages.apt ]; then \ diff --git a/sample-extractors/simple-extractor/simple_extractor.py b/sample-extractors/simple-extractor/simple_extractor.py index c923361..5146906 100644 --- a/sample-extractors/simple-extractor/simple_extractor.py +++ b/sample-extractors/simple-extractor/simple_extractor.py @@ -10,3 +10,6 @@ def __init__(self, extraction): def process_file(self, input_file): return self.extraction(input_file) + + def process_dataset(self, input_files): + return self.extraction(input_files) From 4146413fd277300324e11dfcc88b34a67ff709b5 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Tue, 26 Nov 2019 10:11:31 -0600 Subject: [PATCH 03/32] remove python3 install --- sample-extractors/simple-extractor/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sample-extractors/simple-extractor/Dockerfile b/sample-extractors/simple-extractor/Dockerfile index 50c2107..83ab295 100644 --- a/sample-extractors/simple-extractor/Dockerfile +++ b/sample-extractors/simple-extractor/Dockerfile @@ -1,11 +1,11 @@ +ARG PYCLOWDER_PYTHON="" +FROM clowder/pyclowder${PYCLOWDER_PYTHON}:latest ENV EXTRACTION_FUNC="" ENV EXTRACTION_MODULE="" COPY simple_extractor.py . -RUN apt-get update && apt-get install -y build-essential python3-dev - # install any packages ONBUILD COPY packages.* Dockerfile /home/clowder/ ONBUILD RUN if [ -e packages.apt ]; then \ From 355bd671b5aa8eb1f9698a274d80ea6ad9412563 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Wed, 22 Jan 2020 14:55:57 -0600 Subject: [PATCH 04/32] fix test formatting --- pyclowder/extractors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyclowder/extractors.py b/pyclowder/extractors.py index 55f61b8..6fa5cb2 100644 --- a/pyclowder/extractors.py +++ b/pyclowder/extractors.py @@ -320,7 +320,7 @@ def process_message(self, connector, host, secret_key, resource, parameters): if 'files' in resource: type = 'dataset' input_files = resource['local_paths'] - dataset_id = resource['id'] + dataset_id = resource['id'] elif 'local_paths' in resource: type = 'file' From 24fc65a9a4917ffc234adc4f118912d662705666 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Wed, 22 Jan 2020 14:58:29 -0600 Subject: [PATCH 05/32] fix build typo --- pyclowder/connectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index 80df4c2..b27f1d9 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -897,7 +897,7 @@ def process_messages(self, channel, rabbitmq_queue): # RESUBMITTING - Extractor encountered error and message is resubmitted to same queue elif msg["type"] == 'resubmit': jbody = json.loads(self.body) - jbody['retry_count'] = msg['retry_count'] + jbody['retry_count'] = msg['retry_count'] if 'exchange' not in jbody and self.method.exchange: jbody['exchange'] = self.method.exchange if 'routing_key' not in jbody and self.method.routing_key and self.method.routing_key != rabbitmq_queue: From 4268d7460829d7c25ea71a53fd5742686f1005e8 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Wed, 22 Jan 2020 14:59:11 -0600 Subject: [PATCH 06/32] fix other build typo --- pyclowder/sections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyclowder/sections.py b/pyclowder/sections.py index 737b5fa..fab0bd4 100644 --- a/pyclowder/sections.py +++ b/pyclowder/sections.py @@ -66,7 +66,7 
@@ def upload_description(connector, host, key, sectionid, description): """ connector.message_process({"type": "section", "id": sectionid}, - "Uploading section description.") + "Uploading section description.") headers = {'Content-Type': 'application/json'} url = '%sapi/sections/%s/description?key=%s' % (host, sectionid, key) From df453426d68cb1580fa48f25f99d94c433260289 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Mon, 27 Jan 2020 09:16:43 -0600 Subject: [PATCH 07/32] improved comments --- pyclowder/extractors.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pyclowder/extractors.py b/pyclowder/extractors.py index 6fa5cb2..a062947 100644 --- a/pyclowder/extractors.py +++ b/pyclowder/extractors.py @@ -364,7 +364,7 @@ def process_message(self, connector, host, secret_key, resource, parameters): self.logger.info("upload preview") pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview)) else: - # TODO: Add Clowder endpoint & pyclowder method to attach previews to datasets + # TODO: Add Clowder endpoint (& pyclowder method) to attach previews to datasets self.logger.error("previews not currently supported for resource type: %s" % type) # upload output files to the processed file's parent dataset or processed dataset @@ -401,6 +401,10 @@ def process_message(self, connector, host, secret_key, resource, parameters): pyclowder.files.upload_to_dataset(connector, host, secret_key, new_dataset_id, str(output)) + if 'previews' in nds.keys(): + # TODO: Add Clowder endpoint (& pyclowder method) to attach previews to datasets + self.logger.error("previews not currently supported for resource type: %s" % type) + finally: self.cleanup_data(result) @@ -409,7 +413,7 @@ def process_file(self, input_file): This function will process the file and return a dict that contains the result. This dict can have the following keys: - metadata: the metadata to be associated with the processed file - - previews: files on disk with the preview to be uploaded to the processed file + - previews: images on disk with the preview to be uploaded to the processed file - outputs: files on disk to be added to processed file's parent :param input_file: the file to be processed. :return: the specially formatted dict. @@ -422,10 +426,12 @@ def process_dataset(self, input_files): dict can have the following keys: - metadata: the metadata to be associated with the processed dataset - outputs: files on disk to be added to the dataset + - previews: images to be associated with the dataset - new_dataset: a dict describing a new dataset to be created for the outputs, with the following keys: - name: the name of the new dataset to be created (including adding the outputs, metadata and previews contained in new_dataset) - description: description for the new dataset to be created + - previews: (see above) - metadata: (see above) - outputs: (see above) :param input_files: the files to be processed. 
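The docstrings above spell out the full result-dict contract for the Simple Extractor. As a minimal sketch of that contract (not part of any patch in this series; the base-class name `SimpleExtractor`, the manifest path, and the metadata keys are assumptions made for illustration), a `process_dataset` override could look like this:

```python
import os

from pyclowder.extractors import SimpleExtractor  # base class name assumed


class ManifestExtractor(SimpleExtractor):
    def process_dataset(self, input_files):
        # Write a small manifest listing every file in the dataset.
        manifest = os.path.join("/tmp", "manifest.txt")  # hypothetical output path
        with open(manifest, "w") as out:
            for path in input_files:
                out.write("%s %d\n" % (os.path.basename(path), os.path.getsize(path)))

        # Keys follow the contract documented in process_dataset() above:
        # "metadata" is attached to the processed dataset, and "outputs"
        # are uploaded into it as new files.
        return {
            "metadata": {"file_count": len(input_files)},
            "outputs": [manifest],
        }
```

Moving the same keys under a `new_dataset` entry (with at least a `name`) would instead collect the outputs and metadata in a freshly created dataset.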
From fa7fb11a2ca3cc674dadbda7a47407d2f268f05c Mon Sep 17 00:00:00 2001
From: Max Burnette
Date: Mon, 27 Jan 2020 09:23:50 -0600
Subject: [PATCH 08/32] add PyClowderExtractionAbort exception class

---
 pyclowder/connectors.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py
index b27f1d9..4269ccd 100644
--- a/pyclowder/connectors.py
+++ b/pyclowder/connectors.py
@@ -478,6 +478,10 @@ def _process_message(self, body):
             message = str.format("Error in subprocess [exit code={}]:\n{}", exc.returncode, exc.output)
             logger.exception("[%s] %s", resource['id'], message)
             self.message_error(resource, message)
+        except PyClowderExtractionAbort as exc:
+            message = str.format("Aborting message: {}", exc.message)
+            logger.exception("[%s] %s", resource['id'], message)
+            self.message_error(resource, message)
         except Exception as exc:  # pylint: disable=broad-except
             message = str(exc)
             logger.exception("[%s] %s", resource['id'], message)
@@ -1099,3 +1103,14 @@ def put(self, url, data=None, raise_status=True, **kwargs):
     def delete(self, url, raise_status=True, **kwargs):
         logging.getLogger(__name__).debug("DELETE: " + url)
         return None
+
+
+class PyClowderExtractionAbort(Exception):
+    """Raise exception that will not be subject to retry attempts (i.e. errors that are expected to fail again).
+
+    Attributes:
+        message -- explanation of the error
+    """
+
+    def __init__(self, message):
+        self.message = message

From a43fc1f93f417e49af4ff6fd04ded2b5ebda0d72 Mon Sep 17 00:00:00 2001
From: toddn
Date: Fri, 20 Mar 2020 11:53:34 -0500
Subject: [PATCH 09/32] tags added to simple extractor

---
 pyclowder/extractors.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyclowder/extractors.py b/pyclowder/extractors.py
index 7d9bf8e..b475aa2 100644
--- a/pyclowder/extractors.py
+++ b/pyclowder/extractors.py
@@ -338,6 +338,11 @@ def process_message(self, connector, host, secret_key, resource, parameters):
                         preview = {'file': preview}
                         self.logger.info("upload preview")
                         pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview))
+            if 'tags' in result.keys():
+                tags = {"tags": result["tags"]}
+                self.logger.info("upload tags")
+                self.logger.debug(tags)
+                pyclowder.files.upload_tags(connector, host, secret_key, file_id, tags)
         finally:
             self.cleanup_data(result)

From a04717fdeec24917475bd489b9533950807c078a Mon Sep 17 00:00:00 2001
From: toddn
Date: Tue, 24 Mar 2020 07:39:36 -0500
Subject: [PATCH 10/32] update readme

---
 sample-extractors/simple-extractor/README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/sample-extractors/simple-extractor/README.md b/sample-extractors/simple-extractor/README.md
index 4d9ee42..5cfc304 100644
--- a/sample-extractors/simple-extractor/README.md
+++ b/sample-extractors/simple-extractor/README.md
@@ -18,12 +18,13 @@ you have to write your extractor the normal way using [PyClowder](https://openso
 To write an extractor using the Simple Extractor, you need to have your Python program available. The main function
 of this Python program is supposed to take an input file path as its parameter. It needs to return a Python dictionary that
-can contain either metadata information ("metadata"), details about file previews ("previews") or both. For example:
+can contain either metadata information ("metadata"), details about file previews ("previews"), tags for the file ("tags"), or all three.
For example: ``` json { "metadata": dict(), - "previews": array() + "previews": array(), + "tags": array() } ``` From 4ab0dcb470cedcec29162cb2c1780b414fe2259f Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Wed, 6 May 2020 13:43:06 -0500 Subject: [PATCH 11/32] update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 937ca7f..33d6dd5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). +## Unreleased + +### Added +- Simple extractors now support datasets, can also create new datasets. + ## 2.2.3 - 2019-10-14 ### Fixed From 5ffd63cd4ee654637157196bc64b104df4484ba5 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Jun 2020 18:09:42 +0000 Subject: [PATCH 12/32] Bump urllib3 from 1.24.1 to 1.24.2 Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.24.1 to 1.24.2. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/master/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.24.1...1.24.2) Signed-off-by: dependabot[bot] --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index b03e32c..ac56165 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ pika==1.0.0 PyYAML==5.1 requests==2.21.0 wheel==0.33.1 -urllib3==1.24.1 +urllib3==1.24.2 pytest==4.3.1 pytest-pep8==1.0.6 requests-toolbelt==0.9.1 \ No newline at end of file From 80e3e13b549714fe4bf88076ef970b14c8ba44c2 Mon Sep 17 00:00:00 2001 From: Mike Lambert Date: Mon, 29 Jun 2020 17:51:54 -0500 Subject: [PATCH 13/32] Proof-of-concept for tracking extractions using new job_id --- pyclowder/connectors.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index e0c3af7..ec4b711 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -760,7 +760,9 @@ def on_message(self, channel, method, header, body): if 'routing_key' not in json_body and method.routing_key: json_body['routing_key'] = method.routing_key - self.worker = RabbitMQHandler(self.extractor_name, self.extractor_info, self.check_message, + job_id = json_body['jobid'] + + self.worker = RabbitMQHandler(self.extractor_name, self.extractor_info, job_id, self.check_message, self.process_message, self.ssl_verify, self.mounted_paths, method, header, body) self.worker.start_thread(json_body) @@ -835,13 +837,14 @@ class RabbitMQHandler(Connector): a queue of messages that the super- loop can access and send later. 
""" - def __init__(self, extractor_name, extractor_info, check_message=None, process_message=None, ssl_verify=True, + def __init__(self, extractor_name, extractor_info, job_id, check_message=None, process_message=None, ssl_verify=True, mounted_paths=None, method=None, header=None, body=None): super(RabbitMQHandler, self).__init__(extractor_name, extractor_info, check_message, process_message, ssl_verify, mounted_paths) self.method = method self.header = header self.body = body + self.job_id = job_id self.messages = [] self.thread = None self.finished = False @@ -920,6 +923,7 @@ def status_update(self, status, resource, message): status_report = dict() # TODO: Update this to check resource["type"] once Clowder better supports dataset events status_report['file_id'] = resource["id"] + status_report['job_id'] = self.job_id status_report['extractor_id'] = self.extractor_info['name'] status_report['status'] = "%s: %s" % (status, message) status_report['start'] = pyclowder.utils.iso8601time() @@ -952,7 +956,8 @@ class HPCConnector(Connector): def __init__(self, extractor_name, extractor_info, picklefile, check_message=None, process_message=None, ssl_verify=True, mounted_paths=None): super(HPCConnector, self).__init__(extractor_name, extractor_info, check_message, process_message, - ssl_verify, mounted_paths) + ssl_verify, job_id, mounted_paths) + self.job_id = job_id self.picklefile = picklefile self.logfile = None @@ -991,6 +996,7 @@ def status_update(self, status, resource, message): statusreport = dict() statusreport['file_id'] = resource["id"] statusreport['extractor_id'] = self.extractor_info['name'] + statusreport['job_id'] = self.job_id statusreport['status'] = "%s: %s" % (status, message) statusreport['start'] = time.strftime('%Y-%m-%dT%H:%M:%S') log.write(json.dumps(statusreport) + '\n') From 6e762adfca8cfd8e3c412b083ab579e88e2c838a Mon Sep 17 00:00:00 2001 From: Mike Lambert Date: Fri, 7 Aug 2020 12:54:28 -0500 Subject: [PATCH 14/32] Allow older versions of Clowder to use newer pyClowder --- pyclowder/connectors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index ec4b711..cd24fd4 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -760,7 +760,10 @@ def on_message(self, channel, method, header, body): if 'routing_key' not in json_body and method.routing_key: json_body['routing_key'] = method.routing_key - job_id = json_body['jobid'] + if 'jobid' not in json_body: + job_id = None + else: + job_id = json_body['jobid'] self.worker = RabbitMQHandler(self.extractor_name, self.extractor_info, job_id, self.check_message, self.process_message, self.ssl_verify, self.mounted_paths, From ed0348ee12721b4d6d117e0ec1196b61659cf37a Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Mon, 24 Aug 2020 10:20:08 -0500 Subject: [PATCH 15/32] change log message level --- pyclowder/extractors.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pyclowder/extractors.py b/pyclowder/extractors.py index 4147f72..7a31bb1 100644 --- a/pyclowder/extractors.py +++ b/pyclowder/extractors.py @@ -343,7 +343,7 @@ def process_message(self, connector, host, secret_key, resource, parameters): try: # upload metadata to the processed file or dataset if 'metadata' in result.keys(): - self.logger.info("upload metadata") + self.logger.debug("upload metadata") if type == 'file': metadata = self.get_metadata(result.get('metadata'), 'file', file_id, host) self.logger.debug(metadata) @@ -357,19 +357,18 @@ def 
process_message(self, connector, host, secret_key, resource, parameters): # upload previews to the processed file if 'previews' in result.keys(): - self.logger.info("upload previews") if type == 'file': for preview in result['previews']: if os.path.exists(str(preview)): preview = {'file': preview} - self.logger.info("upload preview") + self.logger.debug("upload preview") pyclowder.files.upload_preview(connector, host, secret_key, file_id, str(preview)) else: # TODO: Add Clowder endpoint (& pyclowder method) to attach previews to datasets self.logger.error("previews not currently supported for resource type: %s" % type) if 'tags' in result.keys(): - self.logger.info("upload tags") + self.logger.debug("upload tags") tags = {"tags": result["tags"]} if type == 'file': pyclowder.files.upload_tags(connector, host, secret_key, file_id, tags) @@ -378,7 +377,7 @@ def process_message(self, connector, host, secret_key, resource, parameters): # upload output files to the processed file's parent dataset or processed dataset if 'outputs' in result.keys(): - self.logger.info("upload output files") + self.logger.debug("upload output files") if type == 'file' or type == 'dataset': for output in result['outputs']: if os.path.exists(str(output)): @@ -395,16 +394,16 @@ def process_message(self, connector, host, secret_key, resource, parameters): description = nds['description'] if 'description' in nds.keys() else "" new_dataset_id = pyclowder.datasets.create_empty(connector, host, secret_key, nds['name'], description) - self.logger.info("created new dataset: %s" % new_dataset_id) + self.logger.debug("created new dataset: %s" % new_dataset_id) if 'metadata' in nds.keys(): - self.logger.info("upload metadata to new dataset") + self.logger.debug("upload metadata to new dataset") metadata = self.get_metadata(nds.get('metadata'), 'dataset', new_dataset_id, host) self.logger.debug(metadata) pyclowder.datasets.upload_metadata(connector, host, secret_key, new_dataset_id, metadata) if 'outputs' in nds.keys(): - self.logger.info("upload output files to new dataset") + self.logger.debug("upload output files to new dataset") for output in nds['outputs']: if os.path.exists(str(output)): pyclowder.files.upload_to_dataset(connector, host, secret_key, new_dataset_id, From 2cf2d1a491531640cfe3502ba9d5eddaed4361b2 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Mon, 24 Aug 2020 11:31:41 -0500 Subject: [PATCH 16/32] add job_id to payload for RMQ --- pyclowder/connectors.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index 728e82b..3fe0ea7 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -928,14 +928,6 @@ def process_messages(self, channel, rabbitmq_queue): def status_update(self, status, resource, message): super(RabbitMQHandler, self).status_update(status, resource, message) - status_report = dict() - # TODO: Update this to check resource["type"] once Clowder better supports dataset events - status_report['file_id'] = resource["id"] - status_report['job_id'] = self.job_id - status_report['extractor_id'] = self.extractor_info['name'] - status_report['status'] = "%s: %s" % (status, message) - status_report['start'] = pyclowder.utils.iso8601time() - with self.lock: # TODO: Remove 'status' from payload later and read from message_type and message in Clowder 2.0 self.messages.append({"type": "status", @@ -943,6 +935,7 @@ def status_update(self, status, resource, message): "payload": { "file_id": resource["id"], "extractor_id": 
self.extractor_info['name'], + "job_id": self.job_id, "status": "%s: %s" % (status, message), "start": pyclowder.utils.iso8601time(), "message_type": status, From c99faa88ce65342b1310ba4d08bbe94d578f0265 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 14:34:08 -0500 Subject: [PATCH 17/32] release 2.3.0 --- CHANGELOG.md | 2 +- Pipfile | 14 +++++++++ Pipfile.lock | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 12 ++------ setup.py | 8 ++---- 5 files changed, 96 insertions(+), 15 deletions(-) create mode 100644 Pipfile create mode 100644 Pipfile.lock diff --git a/CHANGELOG.md b/CHANGELOG.md index c4b1d86..e739a3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). -## Unreleased +## 2.3.0 - 2020-09-15 ### Added - Simple extractors now support datasets, can also create new datasets. diff --git a/Pipfile b/Pipfile new file mode 100644 index 0000000..8a259a4 --- /dev/null +++ b/Pipfile @@ -0,0 +1,14 @@ +[[source]] +name = "pypi" +url = "https://pypi.org/simple" +verify_ssl = true + +[dev-packages] + +[packages] +pika = "*" +requests = "*" +requests-toolbelt = "*" + +[requires] +python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock new file mode 100644 index 0000000..5b8dcce --- /dev/null +++ b/Pipfile.lock @@ -0,0 +1,75 @@ +{ + "_meta": { + "hash": { + "sha256": "d6c9b6d71b5b799a85a1ba3947336c8e9a7095ba216219722a8d928d89f302d4" + }, + "pipfile-spec": 6, + "requires": { + "python_version": "3.7" + }, + "sources": [ + { + "name": "pypi", + "url": "https://pypi.org/simple", + "verify_ssl": true + } + ] + }, + "default": { + "certifi": { + "hashes": [ + "sha256:5930595817496dd21bb8dc35dad090f1c2cd0adfaf21204bf6732ca5d8ee34d3", + "sha256:8fc0819f1f30ba15bdb34cceffb9ef04d99f420f68eb75d901e9560b8749fc41" + ], + "version": "==2020.6.20" + }, + "chardet": { + "hashes": [ + "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", + "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" + ], + "version": "==3.0.4" + }, + "idna": { + "hashes": [ + "sha256:b307872f855b18632ce0c21c5e45be78c0ea7ae4c15c828c20788b26921eb3f6", + "sha256:b97d804b1e9b523befed77c48dacec60e6dcb0b5391d57af6a65a312a90648c0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.10" + }, + "pika": { + "hashes": [ + "sha256:4e1a1a6585a41b2341992ec32aadb7a919d649eb82904fd8e4a4e0871c8cf3af", + "sha256:9fa76ba4b65034b878b2b8de90ff8660a59d925b087c5bb88f8fdbb4b64a1dbf" + ], + "index": "pypi", + "version": "==1.1.0" + }, + "requests": { + "hashes": [ + "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b", + "sha256:fe75cc94a9443b9246fc7049224f75604b113c36acb93f87b80ed42c44cbb898" + ], + "index": "pypi", + "version": "==2.24.0" + }, + "requests-toolbelt": { + "hashes": [ + "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", + "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" + ], + "index": "pypi", + "version": "==0.9.1" + }, + "urllib3": { + "hashes": [ + "sha256:91056c15fa70756691db97756772bb1eb9678fa585d9184f24534b100dc60f4a", + "sha256:e7983572181f5e1522d9c98453462384ee92a0be7fac5f1413a1e35c56cc0461" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < 
'4'", + "version": "==1.25.10" + } + }, + "develop": {} +} diff --git a/requirements.txt b/requirements.txt index ac56165..2240490 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,3 @@ -enum34==1.1.6 -pika==1.0.0 -PyYAML==5.1 -requests==2.21.0 -wheel==0.33.1 -urllib3==1.24.2 -pytest==4.3.1 -pytest-pep8==1.0.6 -requests-toolbelt==0.9.1 \ No newline at end of file +pika==1.1.0 +requests==2.24.0 +requests-toolbelt==0.9.1 diff --git a/setup.py b/setup.py index aadf4cb..3bc41e3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def description(): setup(name='pyclowder', - version='2.2.3', + version='2.3.0', packages=find_packages(), description='Python SDK for the Clowder Data Management System', long_description=description(), @@ -32,10 +32,8 @@ def description(): keywords=['clowder', 'data management system'], install_requires=[ - 'enum34==1.1.6', - 'pika==1.0.0', - 'PyYAML==5.1', - 'requests==2.21.0', + 'pika==1.1.0', + 'requests==2.24.0', 'requests-toolbelt==0.9.1', ], From 4bb75807b1b939e024777ea72d077e8d1962990a Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 14:45:17 -0500 Subject: [PATCH 18/32] update CHANGELOG --- CHANGELOG.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e739a3b..6d09788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,9 +6,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/). ## 2.3.0 - 2020-09-15 +Removed develop branch, all pull requests will need to be against master from now +forward. Please update version number in setup.py in each PR. + ### Added - Simple extractors now support datasets, can also create new datasets. - Ability to add tags from simple extractor to files and datasets. +- Ability to add additional files (outputs) to dataset in simple extractor. +- Use pipenv to manage dependencies. +- Add job_id to each status message returned by pyclowder. +- PyClowderExtractionAbort to indicate the message shoudl not be retried. + +### Changed +- Better handling of status messages ## 2.2.3 - 2019-10-14 From 705b6f55b12587bdf88c405e710cd4850bedac65 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 15:00:48 -0500 Subject: [PATCH 19/32] simple ci github action --- .github/workflows/ci.yaml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) create mode 100644 .github/workflows/ci.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 0000000..22dafa4 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,36 @@ +name: Python package + +on: [push] + +jobs: + build: + + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [2.7, 3.5, 3.6, 3.7, 3.8] + + steps: + - uses: actions/checkout@v2 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=120 --statistics + + - name: Test with pytest + run: | + pytest From c75d2538884bf9a176cbf385a27b71642a6b7c7f Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 15:03:12 -0500 Subject: [PATCH 20/32] use pipenv --- .github/workflows/ci.yaml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 22dafa4..7839467 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -21,8 +21,9 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install flake8 pytest pipenv + pipenv run pip freeze > requirements.txt + pip install -r requirements.txt - name: Lint with flake8 run: | From 687200ed0bbf22ff66859f1db3d9d377a776d00d Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 21:51:17 -0500 Subject: [PATCH 21/32] fix build errors --- .github/workflows/ci.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 7839467..59a0536 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -22,14 +22,13 @@ jobs: run: | python -m pip install --upgrade pip pip install flake8 pytest pipenv - pipenv run pip freeze > requirements.txt pip install -r requirements.txt - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + # exit-zero treats all errors as warnings. flake8 . --count --exit-zero --max-complexity=10 --max-line-length=120 --statistics - name: Test with pytest From 3d978c95973cc5db222669592fd9582485a0c1a6 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 21:58:32 -0500 Subject: [PATCH 22/32] fix compile errors --- pyclowder/connectors.py | 2 +- pyclowder/datasets.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index 3fe0ea7..5a84af2 100644 --- a/pyclowder/connectors.py +++ b/pyclowder/connectors.py @@ -967,7 +967,7 @@ class HPCConnector(Connector): def __init__(self, extractor_name, extractor_info, picklefile, check_message=None, process_message=None, ssl_verify=True, mounted_paths=None): super(HPCConnector, self).__init__(extractor_name, extractor_info, check_message, process_message, - ssl_verify, job_id, mounted_paths) + ssl_verify, mounted_paths) self.job_id = job_id self.picklefile = picklefile self.logfile = None diff --git a/pyclowder/datasets.py b/pyclowder/datasets.py index 075eacc..2566d91 100644 --- a/pyclowder/datasets.py +++ b/pyclowder/datasets.py @@ -12,6 +12,7 @@ from pyclowder.client import ClowderClient from pyclowder.collections import get_datasets, get_child_collections, delete as delete_collection +from pyclowder.utils import StatusMessage def create_empty(connector, host, key, datasetname, description, parentid=None, spaceid=None): From ec461ee9cce4755f175985f3c57e5f715b865ff5 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:02:29 -0500 Subject: [PATCH 23/32] missing job_id --- pyclowder/connectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyclowder/connectors.py b/pyclowder/connectors.py index 5a84af2..d39e477 100644 --- a/pyclowder/connectors.py +++ 
b/pyclowder/connectors.py @@ -964,7 +964,7 @@ class HPCConnector(Connector): """Takes pickle files and processes them.""" # pylint: disable=too-many-arguments - def __init__(self, extractor_name, extractor_info, picklefile, + def __init__(self, extractor_name, extractor_info, picklefile, job_id=None, check_message=None, process_message=None, ssl_verify=True, mounted_paths=None): super(HPCConnector, self).__init__(extractor_name, extractor_info, check_message, process_message, ssl_verify, mounted_paths) From a220061800474d2ef5753d23b0e7eb39361500ee Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:09:50 -0500 Subject: [PATCH 24/32] use flake instead of pep8 --- setup.cfg | 2 -- 1 file changed, 2 deletions(-) delete mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 1531e1a..0000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[tool:pytest] -pep8maxlinelength = 120 From 931710615effb5775f75a31ab77d1bc93a94ee73 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:24:30 -0500 Subject: [PATCH 25/32] build updates --- .github/workflows/ci.yaml | 12 ++++++++- .github/workflows/pypi.yaml | 50 +++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/pypi.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 59a0536..ebf0d9c 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -4,7 +4,6 @@ on: [push] jobs: build: - runs-on: ubuntu-latest strategy: matrix: @@ -18,6 +17,17 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Cache pip + uses: actions/cache@v2 + with: + # This path is specific to Ubuntu + path: ~/.cache/pip + # Look to see if there is a cache hit for the corresponding requirements file + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml new file mode 100644 index 0000000..63f30fc --- /dev/null +++ b/.github/workflows/pypi.yaml @@ -0,0 +1,50 @@ +on: + push: + tags: + +jobs: + publish: + name: Build and publish python packages + runs-on: ubuntu-18.04 + steps: + - uses: actions/checkout@v2 + + - name: Set up Python 3.7 + uses: actions/setup-python@v2 + with: + python-version: 3.7 + + - name: Install dependencies + run: python -m pip install --upgrade pip setuptools wheel + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements.txt + + - name: Cache pip + uses: actions/cache@v2 + with: + # This path is specific to Ubuntu + path: ~/.cache/pip + # Look to see if there is a cache hit for the corresponding requirements file + key: ${{ runner.os }}-pip-${{ hashFiles('requirements.txt') }} + restore-keys: | + ${{ runner.os }}-pip- + ${{ runner.os }}- + + - name: Build dist file + run: | + python setup.py sdist bdist_wheel + + - name: Publish distribution to Test PyPI + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.test_pypi_password }} + repository_url: https://test.pypi.org/legacy/ + + - name: Publish distribution to PyPI + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{ secrets.pypi_password }} From 56ba3ef29bf7cc8f52ee8bec2503779acfe643d4 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:25:31 -0500 Subject: [PATCH 26/32] 
there are no tests :( --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ebf0d9c..fd4646b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -41,6 +41,6 @@ jobs: # exit-zero treats all errors as warnings. flake8 . --count --exit-zero --max-complexity=10 --max-line-length=120 --statistics - - name: Test with pytest - run: | - pytest +# - name: Test with pytest +# run: | +# pytest From fe8165c28c6161a0c085ef569e2d9965f529c423 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:26:24 -0500 Subject: [PATCH 27/32] give action a name --- .github/workflows/pypi.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 63f30fc..2aece61 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -1,3 +1,5 @@ +name: Publish Python package + on: push: tags: From 92b70567791dd4bd45de0b2bbc6a89881a45f627 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:33:09 -0500 Subject: [PATCH 28/32] fix urls --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 3bc41e3..b641e5a 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,9 @@ def description(): author='Rob Kooper', author_email='kooper@illinois.edu', - url='https://clowder.ncsa.illinois.edu', + url='https://clowderframework.org', project_urls={ - 'Source': 'https://opensource.ncsa.illinois.edu/bitbucket/scm/cats/pyclowder.git', + 'Source': 'https://github.com/clowder-framework/pyclowder', }, license='BSD', From 359bcf9ffc317b0c7e67e5b2d5157323f13fe34e Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:34:36 -0500 Subject: [PATCH 29/32] add warning about docker images --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d09788..dfef20f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/). Removed develop branch, all pull requests will need to be against master from now forward. Please update version number in setup.py in each PR. +From this version no more docker images are build, please use pip install to +install pyclowder. + ### Added - Simple extractors now support datasets, can also create new datasets. - Ability to add tags from simple extractor to files and datasets. From ce15f98fcbe9c9709c8c231b6d88b39db19c51d2 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Tue, 15 Sep 2020 22:42:12 -0500 Subject: [PATCH 30/32] can only push once so no use for test pypi --- .github/workflows/pypi.yaml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/pypi.yaml b/.github/workflows/pypi.yaml index 2aece61..bb3faa2 100644 --- a/.github/workflows/pypi.yaml +++ b/.github/workflows/pypi.yaml @@ -39,12 +39,6 @@ jobs: run: | python setup.py sdist bdist_wheel - - name: Publish distribution to Test PyPI - uses: pypa/gh-action-pypi-publish@master - with: - password: ${{ secrets.test_pypi_password }} - repository_url: https://test.pypi.org/legacy/ - - name: Publish distribution to PyPI if: startsWith(github.ref, 'refs/tags') uses: pypa/gh-action-pypi-publish@master From f8cfc7161f8ac7aac3e8a25673271ce36fc711c6 Mon Sep 17 00:00:00 2001 From: Max Burnette Date: Wed, 16 Sep 2020 08:08:22 -0500 Subject: [PATCH 31/32] Add PyYAML back to requirements. 
--- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index 2240490..27a4ff8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ pika==1.1.0 +PyYAML==5.1 requests==2.24.0 requests-toolbelt==0.9.1 diff --git a/setup.py b/setup.py index b641e5a..91c17b3 100644 --- a/setup.py +++ b/setup.py @@ -33,6 +33,7 @@ def description(): install_requires=[ 'pika==1.1.0', + 'PyYAML==5.1', 'requests==2.24.0', 'requests-toolbelt==0.9.1', ], From 39a6e61e70992d310383513b89142e9ab9266448 Mon Sep 17 00:00:00 2001 From: Rob Kooper Date: Wed, 16 Sep 2020 08:14:24 -0500 Subject: [PATCH 32/32] add PyYAML to pipenv --- Pipfile | 1 + Pipfile.lock | 19 ++++++++++++++++++- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Pipfile b/Pipfile index 8a259a4..5441353 100644 --- a/Pipfile +++ b/Pipfile @@ -9,6 +9,7 @@ verify_ssl = true pika = "*" requests = "*" requests-toolbelt = "*" +pyyaml = "==5.1" [requires] python_version = "3.7" diff --git a/Pipfile.lock b/Pipfile.lock index 5b8dcce..bac9acb 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "d6c9b6d71b5b799a85a1ba3947336c8e9a7095ba216219722a8d928d89f302d4" + "sha256": "d33480b241f0335d12224043e531f6982e24a7e3593cfdbb64bb93dbed8325e1" }, "pipfile-spec": 6, "requires": { @@ -46,6 +46,23 @@ "index": "pypi", "version": "==1.1.0" }, + "pyyaml": { + "hashes": [ + "sha256:1adecc22f88d38052fb787d959f003811ca858b799590a5eaa70e63dca50308c", + "sha256:436bc774ecf7c103814098159fbb84c2715d25980175292c648f2da143909f95", + "sha256:460a5a4248763f6f37ea225d19d5c205677d8d525f6a83357ca622ed541830c2", + "sha256:5a22a9c84653debfbf198d02fe592c176ea548cccce47553f35f466e15cf2fd4", + "sha256:7a5d3f26b89d688db27822343dfa25c599627bc92093e788956372285c6298ad", + "sha256:9372b04a02080752d9e6f990179a4ab840227c6e2ce15b95e1278456664cf2ba", + "sha256:a5dcbebee834eaddf3fa7366316b880ff4062e4bcc9787b78c7fbb4a26ff2dd1", + "sha256:aee5bab92a176e7cd034e57f46e9df9a9862a71f8f37cad167c6fc74c65f5b4e", + "sha256:c51f642898c0bacd335fc119da60baae0824f2cde95b0330b56c0553439f0673", + "sha256:c68ea4d3ba1705da1e0d85da6684ac657912679a649e8868bd850d2c299cce13", + "sha256:e23d0cc5299223dcc37885dae624f382297717e459ea24053709675a976a3e19" + ], + "index": "pypi", + "version": "==5.1" + }, "requests": { "hashes": [ "sha256:b3559a131db72c33ee969480840fff4bb6dd111de7dd27c8ee1f820f4f00231b",
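The series also changes how extractors can signal fatal errors. A minimal sketch of raising the `PyClowderExtractionAbort` exception added in PATCH 08 (the emptiness check and message text are invented for illustration):

```python
import os

from pyclowder.connectors import PyClowderExtractionAbort


def process_message(self, connector, host, secret_key, resource, parameters):
    input_file = resource["local_paths"][0]

    # An empty input would fail identically on every retry, so abort instead
    # of letting the generic handler resubmit the message.
    if os.path.getsize(input_file) == 0:
        raise PyClowderExtractionAbort("input file is empty; retrying will not help")

    connector.message_process(resource, "Processing file...")
```

Because `_process_message()` catches `PyClowderExtractionAbort` ahead of the generic `Exception` handler, the message is reported through `message_error()` immediately rather than being resubmitted up to ten times.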