From 6d148561bd0a171b851245cb59bdbf8db2a533a8 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Wed, 2 Aug 2023 20:04:11 -0700
Subject: [PATCH 01/55] prep imports and module-level variables

---
 pittgoogle/pubsub.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 28b1ffc..7b32ed4 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -76,12 +76,17 @@ def my_batch_callback(results):
 ----
 """
+import importlib.resources
+import io
+import json
 import logging
 import queue
 from concurrent.futures import ThreadPoolExecutor
 from time import sleep
-from typing import Any, ByteString, Callable, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
 
+import fastavro
+import yaml
 from attrs import converters, define, field
 from attrs.validators import gt, instance_of, is_callable, optional
 from google.api_core.exceptions import NotFound
 from google.cloud import pubsub_v1
@@ -91,7 +96,14 @@ def my_batch_callback(results):
 from .exceptions import OpenAlertError
 from .utils import Cast
 
+if TYPE_CHECKING:
+    import google.protobuf.timestamp_pb2
+    import google._upb._message
+    import pandas as pd
+
+
 LOGGER = logging.getLogger(__name__)
+PACKAGE_DIR = importlib.resources.files(__package__)
 
 
 def msg_callback_example(alert: "Alert") -> "Response":

From 48f67ef7731238759a5ed466f7eca2a8d7470286 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Wed, 2 Aug 2023 20:06:05 -0700
Subject: [PATCH 02/55] add methods to create, delete, publish to topic

---
 pittgoogle/pubsub.py | 95 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 9 deletions(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 7b32ed4..650b056 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -158,18 +158,24 @@ class Topic:
     ------------
     name : `str`
         Name of the Pub/Sub topic.
-    projectid : `str`
-        The topic owner's Google Cloud project ID. Note: :attr:`pittgoogle.utils.ProjectIds`
-        is a registry containing Pitt-Google's project IDs.
+    projectid : `str`, optional
+        The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this
+        if the topic is owned by a different project than the one in your credentials. Note:
+        :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs.
+    auth : :class:`pittgoogle.auth.Auth`, optional
+        Credentials for the Google Cloud project that owns this topic. If not provided,
+        it will be created from environment variables when needed.
+    client : `pubsub_v1.PublisherClient`, optional
+        Pub/Sub client that will be used to access the topic. If not provided, a new client will
+        be created (using `auth`) the first time it is requested.
     """
 
     name: str = field()
-    projectid: str = field()
-
-    @property
-    def path(self) -> str:
-        """Fully qualified path to the topic."""
-        return f"projects/{self.projectid}/topics/{self.name}"
+    projectid: str = field(default=None)
+    _auth: Auth = field(default=None, validator=optional(instance_of(Auth)))
+    _client: Optional[pubsub_v1.PublisherClient] = field(
+        default=None, validator=optional(instance_of(pubsub_v1.PublisherClient))
+    )
 
     @classmethod
     def from_path(cls, path) -> "Topic":
@@ -177,6 +183,77 @@ def from_path(cls, path) -> "Topic":
         _, projectid, _, name = path.split("/")
         return cls(name, projectid)
 
+    @property
+    def auth(self) -> Auth:
+        """Credentials for the Google Cloud project that owns this topic.
+
+        This will be created from environment variables if `self._auth` is None.
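+
+        A minimal sketch of the lazy behavior (assumes Google Cloud credentials are
+        configured in your environment, e.g., via GOOGLE_CLOUD_PROJECT and
+        GOOGLE_APPLICATION_CREDENTIALS)::
+
+            topic = Topic(name="my-topic")
+            topic.auth  # first access creates an Auth and syncs self.projectid to it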
+ """ + if self._auth is None: + self._auth = Auth() + + if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") + self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + + return self._auth + + @property + def path(self) -> str: + """Fully qualified path to the topic.""" + # make sure we have a projectid. if it needs to be set, call auth + if self.projectid is None: + self.auth + return f"projects/{self.projectid}/topics/{self.name}" + + @property + def client(self) -> pubsub_v1.PublisherClient: + """Pub/Sub client for topic access. + + Will be created using `self.auth.credentials` if necessary. + """ + if self._client is None: + self._client = pubsub_v1.PublisherClient(credentials=self.auth.credentials) + return self._client + + def touch(self) -> None: + """Test the connection to the topic, creating it if necessary.""" + try: + self.client.get_topic(topic=self.path) + LOGGER.info(f"topic exists: {self.path}") + + except NotFound: + self.client.create_topic(name=self.path) + LOGGER.info(f"topic created: {self.path}") + + def delete(self) -> None: + """Delete the topic.""" + try: + self.client.delete_topic(topic=self.path) + except NotFound: + LOGGER.info(f"nothing to delete. topic not found: {self.path}") + else: + LOGGER.info(f"deleted topic: {self.path}") + + def publish(self, alert: "Alert", format="json") -> int: + """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`.""" + if format == "json": + message = json.dumps(alert.dict).encode("utf-8") + + elif format.startswith("elasticc"): + # load the avro schema and use it to serialize alert.dict + schema = fastavro.schema.load_schema(PACKAGE_DIR / f"schemas/elasticc/{format}.avsc") + fout = io.BytesIO() + fastavro.schemaless_writer(fout, schema, alert.dict) + fout.seek(0) + message = fout.getvalue() + + # attribute keys and values must be strings + attributes = {str(key): str(val) for key, val in alert.attributes.items()} + + future = self.client.publish(self.path, data=message, **attributes) + return future.result() + @define class Subscription: From cf2398e58660e4b98e389856df1c9196d3192d20 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:00:50 -0700 Subject: [PATCH 03/55] revamp Alert class add: attributes, dataframe, schema_name, schema_map remove: bytes (available as alert.msg.data), metadata (available as alert.msg.*) --- pittgoogle/pubsub.py | 165 ++++++++++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 40 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 650b056..2f187d0 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -120,6 +120,7 @@ def batch_callback_example(batch: list) -> None: def pull_batch( subscription: Union[str, "Subscription"], max_messages: int = 1, + schema_name: str = str(), **subscription_kwargs, ) -> List["Alert"]: """Pull a single batch of messages from the `subscription`. @@ -130,6 +131,10 @@ def pull_batch( Subscription to be pulled. If `str`, the name of the subscription. max_messages : `int` Maximum number of messages to be pulled. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alerts in the subscription. Passed to :class:`pittgoogle.pubsub.Alert` + for unpacking. If not provided, some properties of the `Alert` may not be available. 
subscription_kwargs Keyword arguments sent to :class:`pittgoogle.pubsub.Subscription`. Ignored if `subscription` is a :class:`pittgoogle.pubsub.Subscription`. @@ -272,6 +277,10 @@ class Subscription: client : `pubsub_v1.SubscriberClient`, optional Pub/Sub client that will be used to access the subscription. This kwarg is useful if you want to reuse a client. If None, a new client will be created. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alerts in the subscription. Passed to :class:`pittgoogle.pubsub.Alert` + for unpacking. If not provided, some properties of the `Alert` may not be available. """ name: str = field() @@ -280,6 +289,7 @@ class Subscription: _client: Optional[pubsub_v1.SubscriberClient] = field( default=None, validator=optional(instance_of(pubsub_v1.SubscriberClient)) ) + schema_name: str = field(factory=str) @property def projectid(self) -> str: @@ -544,63 +554,138 @@ class Alert: The message metadata. msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional The Pub/Sub message object, documented at - ``__. + ``__. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alert. Used for unpacking. If not provided, some properties of the + `Alert` may not be available. """ - _bytes: Optional[ByteString] = field(default=None) + # _bytes: Optional[ByteString] = field(default=None) _dict: Optional[dict] = field(default=None) - _metadata: Optional[dict] = field(default=None) - msg: Optional["pubsub_v1.types.PubsubMessage"] = field(default=None) - """Original Pub/Sub message object.""" - - @property - def bytes(self) -> bytes: - """Message payload in original format (Avro or JSON serialized bytes).""" - if self._bytes is None: - # add try-except when we know what we're looking for - self._bytes = self.msg.data - if self._bytes is None: - # if we add a "path" attribute for the path to an avro file on disk - # we can load it like this: - # with open(self.path, "rb") as f: - # self._bytes = f.read() - pass - return self._bytes + _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + default=None + ) + # _metadata: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", "_PubsubMessageLike"]] = field( + default=None + ) + """Incoming Pub/Sub message object.""" + _dataframe: Optional["pd.DataFrame"] = field(default=None) + schema_name: str = field(factory=str) + _schema_map: Optional[dict] = field(default=None) + # _metadata: Optional[dict] = field(default=None) + + + # @property + # def bytes(self) -> bytes: + # """Message payload in original format (Avro or JSON serialized bytes).""" + # if self._bytes is None: + # # add try-except when we know what we're looking for + # self._bytes = self.msg.data + # if self._bytes is None: + # # if we add a "path" attribute for the path to an avro file on disk + # # we can load it like this: + # # with open(self.path, "rb") as f: + # # self._bytes = f.read() + # pass + # return self._bytes @property def dict(self) -> dict: - """Message payload as a dictionary. + """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. Raises ------ :class:`pittgoogle.exceptions.OpenAlertError` if unable to deserialize the alert bytes. 
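+
+        A usage sketch (assumes `msg` is an incoming Pub/Sub message carrying a ZTF alert)::
+
+            alert = Alert(msg=msg, schema_name="ztf")
+            alert.dict["objectId"]  # deserialized on first access, then cached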
""" - if self._dict is None: - # this should be rewritten to catch specific errors - # for now, just try avro then json, catching basically all errors in the process + if self._dict is not None: + return self._dict + + if self.schema_name.startswith("elasticc"): + # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict + schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" + schema = fastavro.schema.load_schema(schemapath) + with io.BytesIO(self.msg.data) as fin: + self._dict = fastavro.schemaless_reader(fin, schema) + return self._dict + + if self.schema_name == "": + LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") + + # assume this is a ztf or ztf-lite alert + # this should be rewritten to catch specific errors + # for now, just try avro then json, catching basically all errors in the process + try: + self._dict = Cast.avro_to_dict(self.msg.data) + except Exception: try: - self._dict = Cast.avro_to_dict(self.bytes) + self._dict = Cast.json_to_dict(self.msg.data) except Exception: - try: - self._dict = Cast.json_to_dict(self.bytes) - except Exception: - raise OpenAlertError("failed to deserialize the alert bytes") + raise OpenAlertError("failed to deserialize the alert bytes") return self._dict @property - def metadata(self) -> dict: - """Message metadata as a flat dictionary.""" - if self._metadata is None: - self._metadata = { - "message_id": self.msg.message_id, - "publish_time": self.msg.publish_time, - # ordering must be enabled on the subscription for this to be useful - "ordering_key": self.msg.ordering_key, - # flatten the dict containing our custom attributes - **self.msg.attributes, - } - return self._metadata + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". + + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes + + @property + def dataframe(self) -> "pd.DataFrame": + if self._dataframe is None: + import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + + if self.schema_name.endswith(".lite"): + src_df = pd.DataFrame(self.dict["source"], index=[0]) + prvs_df = pd.DataFrame(self.dict["prvSources"]) + else: + src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + + return self._dataframe + + @property + def schema_map(self) -> dict: + if self._schema_map is None: + if self.schema_name == str(): + raise TypeError("no alert schema_name provided. unable to load schema map.") + survey = self.schema_name.split(".")[0] + path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + self._schema_map = yaml.safe_load(path.read_text()) + return self._schema_map + + # @property + # def metadata(self) -> dict: + # """Pub/Sub message metadata. + + # Includes + + # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message + # - attributes, which is a dict that typically includes the attributes of the + # incoming message and possibly additional entries added by the user in the meantime. + + # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
+ # """ + # if self._metadata is None: + # self._metadata = { + # "message_id": self.msg.message_id, + # "publish_time": self.msg.publish_time, + # # ordering must be enabled on the subscription for this to be useful + # "ordering_key": self.msg.ordering_key, + # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc + # # typically includes self.msg.attributes plus additional items added by the user + # "attributes": self.attributes, + # } + # return self._metadata @define(kw_only=True, frozen=True) From dafec2d917d9abd68c52924032f0d4b142884ed9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:01:41 -0700 Subject: [PATCH 04/55] add schemas and schema maps --- pittgoogle/schema_maps/decat.yml | 17 ++++ pittgoogle/schema_maps/elasticc.yml | 18 +++++ pittgoogle/schema_maps/ztf.yml | 18 +++++ .../elasticc/elasticc.v0_9_1.alert.avsc | 17 ++++ .../elasticc.v0_9_1.brokerClassification.avsc | 35 ++++++++ .../elasticc.v0_9_1.diaForcedSource.avsc | 13 +++ .../elasticc.v0_9_1.diaNondetectionLimit.avsc | 11 +++ .../elasticc/elasticc.v0_9_1.diaObject.avsc | 79 +++++++++++++++++++ .../elasticc/elasticc.v0_9_1.diaSource.avsc | 16 ++++ 9 files changed, 224 insertions(+) create mode 100644 pittgoogle/schema_maps/decat.yml create mode 100644 pittgoogle/schema_maps/elasticc.yml create mode 100644 pittgoogle/schema_maps/ztf.yml create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc diff --git a/pittgoogle/schema_maps/decat.yml b/pittgoogle/schema_maps/decat.yml new file mode 100644 index 0000000..068839c --- /dev/null +++ b/pittgoogle/schema_maps/decat.yml @@ -0,0 +1,17 @@ +SURVEY: decat +SURVEY_SCHEMA: https://github.com/rknop/decat_schema +TOPIC_SYNTAX: decat_yyyymmdd_2021A-0113 # replace yyyymmdd with the date +FILTER_MAP: + g DECam SDSS c0001 4720.0 1520.0: g + r DECam SDSS c0002 6415.0 1480.0: r +objectId: objectid +prvSources: sources +source: triggersource +sourceId: sourceid +cutoutDifference: diffcutout +cutoutScience: scicutout +cutoutTemplate: refcutout +filter: filter +mag: mag +magerr: magerr +magzp: magzp diff --git a/pittgoogle/schema_maps/elasticc.yml b/pittgoogle/schema_maps/elasticc.yml new file mode 100644 index 0000000..a0f953f --- /dev/null +++ b/pittgoogle/schema_maps/elasticc.yml @@ -0,0 +1,18 @@ +SURVEY: elasticc +SURVEY_SCHEMA: https://github.com/LSSTDESC/elasticc/tree/main/alert_schema +SCHEMA_VERSION: v0_9_1 +TOPIC_SYNTAX: +FILTER_MAP: +objectId: [diaObject, diaObjectId] +prvSources: prvDiaSources +source: diaSource +sourceId: [diaSource, diaSourceId] +cutoutScience: none +filter: filterName +mag: magpsf +magerr: sigmapsf +magzp: magzpsci +psFlux: psFlux +psFluxErr: psFluxErr +dec: decl +ra: ra diff --git a/pittgoogle/schema_maps/ztf.yml b/pittgoogle/schema_maps/ztf.yml new file mode 100644 index 0000000..5f80e1e --- /dev/null +++ b/pittgoogle/schema_maps/ztf.yml @@ -0,0 +1,18 @@ +SURVEY: ztf +SURVEY_SCHEMA: https://zwickytransientfacility.github.io/ztf-avro-alert/schema.html +TOPIC_SYNTAX: ztf_yyyymmdd_programid1 # replace yyyymmdd with the date +FILTER_MAP: + 1: g + 2: r + 3: i +objectId: objectId +prvSources: 
prv_candidates +source: candidate +sourceId: candid +cutoutDifference: cutoutDifference +cutoutScience: cutoutScience +cutoutTemplate: cutoutTemplate +filter: fid +mag: magpsf +magerr: sigmapsf +magzp: magzpsci diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc new file mode 100644 index 0000000..d5b89ea --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc @@ -0,0 +1,17 @@ +{ + "namespace": "elasticc.v0_9_1", + "type": "record", + "name": "alert", + "doc": "sample avro alert schema v4.1", + "fields": [ + {"name": "alertId", "type": "long", "doc": "unique alert identifer"}, + {"name": "diaSource", "type": "elasticc.v0_9_1.diaSource"}, + {"name": "prvDiaSources", "type": ["null", { + "type": "array", + "items": "elasticc.v0_9_1.diaSource"}], "default": null}, + {"name": "prvDiaForcedSources", "type": ["null", { + "type": "array", + "items": "elasticc.v0_9_1.diaForcedSource"}], "default": null}, + {"name": "diaObject", "type": ["null", "elasticc.v0_9_1.diaObject"], "default": null} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc new file mode 100644 index 0000000..f975f9a --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc @@ -0,0 +1,35 @@ +{ + "namespace": "elasticc.v0_9_1", + "type": "record", + "name": "brokerClassfication", + "fields": [ + {"name": "alertId", "type": "long", "doc": "unique alert identifer"}, + {"name": "diaSourceId", "type": "long", "doc": "id of source that triggered this classification"}, + {"name": "elasticcPublishTimestamp", + "type": {"type": "long", "logicalType": "timestamp-millis"}, + "doc": "timestamp from originating ELAsTiCC alert" + }, + {"name": "brokerIngestTimestamp", + "type": ["null", {"type": "long", "logicalType": "timestamp-millis"}], + "doc": "timestamp of broker ingestion of ELAsTiCC alert" + }, + {"name": "brokerName", "type": "string", "doc": "Name of broker (never changes)" }, + {"name": "brokerVersion", "type": "string", "doc": "Version/Release of broker's software" }, + {"name": "classifierName", "type": "string", + "doc": "Name of classifier broker is using, including software version" }, + {"name": "classifierParams", "type": "string", + "doc": "Any classifier parameter information worth noting for this classification" }, + {"name": "classifications", "type": { + "type": "array", + "items": { + "type": "record", + "name": "classificationDict", + "fields": [ + {"name": "classId", "type": "int", "doc": "See https://github.com/LSSTDESC/elasticc/tree/main/taxonomy/taxonomy.ipynb for specification" }, + {"name": "probability", "type": "float", "doc": "0-1" } + ] + } + } + } + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc new file mode 100644 index 0000000..d5d180f --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc @@ -0,0 +1,13 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaForcedSource", + "type": "record", + "fields": [ + {"name": "diaForcedSourceId", "type": "long"}, + {"name": "diaObjectId", "type": "long"}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "psFlux", "type": "float"}, + {"name": "psFluxErr", "type": "float"} + ] +} diff --git 
a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc new file mode 100644 index 0000000..2cffef3 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc @@ -0,0 +1,11 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaNondetectionLimit", + "type": "record", + "fields": [ + {"name": "ccdVisitId", "type": "long"}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "diaNoise", "type": "float"} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc new file mode 100644 index 0000000..5b65699 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc @@ -0,0 +1,79 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaObject", + "type": "record", + "fields": [ + {"name": "diaObjectId", "type": "long"}, + {"name": "simVersion", "type": ["null", "string"], "doc": "diaObject provenance"}, + {"name": "ra", "type": "double"}, + {"name": "decl", "type": "double"}, + {"name": "mwebv", "type": ["null", "float"], "default": null}, + {"name": "mwebv_err", "type": ["null", "float"], "default": null}, + {"name": "z_final", "type": ["null", "float"], "default": null}, + {"name": "z_final_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_ellipticity", "type": ["null", "float"], "default": null}, + {"name": "hostgal_sqradius", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zspec", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zspec_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q000", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q010", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q020", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q030", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q040", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q050", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q060", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q070", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q080", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q090", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q100", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal_ra", "type": ["null", "float"], "default": null}, + {"name": "hostgal_dec", "type": ["null", "float"], "default": null}, + {"name": "hostgal_snsep", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_g", "type": ["null", "float"], "default": null}, + {"name": 
"hostgal_magerr_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_ellipticity", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_sqradius", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zspec", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zspec_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q000", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q010", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q020", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q030", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q040", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q050", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q060", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q070", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q080", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q090", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q100", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_ra", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_dec", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_snsep", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_Y", "type": ["null", "float"], "default": null} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc new file mode 100644 index 0000000..4906aa7 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc @@ -0,0 +1,16 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaSource", + "type": "record", + "fields": [ + {"name": "diaSourceId", "type": "long"}, + {"name": "diaObjectId", "type": ["null", "long"], "default": null}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "ra", "type": "double"}, + {"name": "decl", "type": "double"}, + {"name": "psFlux", "type": "float"}, + {"name": "psFluxErr", "type": "float"}, + {"name": "snr", "type": "float"} + ] +} From 
16719c614d5eebf65a5fa8b4e62586f2ea660453 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:05:26 -0700 Subject: [PATCH 05/55] add Alert class methods and _PubsubMessageLike --- pittgoogle/pubsub.py | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 2f187d0..82994fd 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -146,13 +146,15 @@ def pull_batch( {"subscription": subscription.path, "max_messages": max_messages} ) - message_list = [Alert(msg=msg.message) for msg in response.received_messages] - ack_ids = [msg.ack_id for msg in response.received_messages] + alerts = [ + Alert.from_msg(msg.message, schema_name=schema_name) for msg in response.received_messages + ] + ack_ids = [msg.ack_id for msg in response.received_messages] if len(ack_ids) > 0: subscription.client.acknowledge({"subscription": subscription.path, "ack_ids": ack_ids}) - return message_list + return alerts @define @@ -576,6 +578,23 @@ class Alert: _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) + @classmethod + def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" + return cls(msg=msg, schema_name=schema_name) + + @classmethod + def from_cloud_run(cls, envelope, schema_name=str()): + return cls( + msg=_PubsubMessageLike( + data=envelope["message"]["data"], + attributes=envelope["message"]["attributes"], + message_id=envelope["message"]["message_id"], + publish_time=envelope["message"]["publish_time"], + ordering_key=envelope["message"]["ordering_key"], + ), + schema_name=schema_name, + ) # @property # def bytes(self) -> bytes: @@ -708,3 +727,22 @@ class Response: ack: bool = field(default=True, converter=converters.to_bool) result: Any = field(default=None) + + +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. + """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) From d4fdba874a7c867c4259481b3e1beefa564553ee Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:10:32 -0700 Subject: [PATCH 06/55] add Subscription.pull_batch --- pittgoogle/pubsub.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 82994fd..498c187 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -367,6 +367,19 @@ def delete(self) -> None: else: LOGGER.info(f"deleted subscription: {self.path}") + def pull_batch(self, max_messages: int = 1) -> List["Alert"]: + """Pull a single batch of messages. + + Recommended for testing. 
Not recommended for long-running listeners (use the + :meth:`~Consumer.stream` method instead). + + Parameters + ---------- + max_messages : `int` + Maximum number of messages to be pulled. + """ + return pull_batch(self, max_messages=max_messages, schema_name=self.schema_name) + @define() class Consumer: @@ -533,7 +546,7 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: max_messages : `int` Maximum number of messages to be pulled. """ - return pull_batch(self.subscription, max_messages) + return self.subscription.pull_batch(max_messages=max_messages) @define(kw_only=True) From 426281e4009ec3103e73bd207691e9b12a5ec2b8 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:24:31 -0700 Subject: [PATCH 07/55] reorder classes to avoid forward reference --- pittgoogle/pubsub.py | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 498c187..ac8a541 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -549,6 +549,25 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: return self.subscription.pull_batch(max_messages=max_messages) +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. + """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) + + @define(kw_only=True) class Alert: """Pitt-Google container for a Pub/Sub message. @@ -582,9 +601,7 @@ class Alert: default=None ) # _metadata: Optional[dict] = field(default=None) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", "_PubsubMessageLike"]] = field( - default=None - ) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) """Incoming Pub/Sub message object.""" _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str) @@ -740,22 +757,3 @@ class Response: ack: bool = field(default=True, converter=converters.to_bool) result: Any = field(default=None) - - -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. 
- """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) From 793612b5e97f543e8276b5f899f09ddecaf7d2df Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:25:52 -0700 Subject: [PATCH 08/55] add Topic.from_cloud for client w/o explicit Auth --- pittgoogle/pubsub.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index ac8a541..ee8dc90 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -184,6 +184,14 @@ class Topic: default=None, validator=optional(instance_of(pubsub_v1.PublisherClient)) ) + @classmethod + def from_cloud(cls, name, *, projectid, testid=False): + """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`).""" + # if testid is not False, "False", or None, append the testid to the name + if testid and testid != "False": + name = f"{name}-{testid}" + return cls(name, projectid=projectid, client=pubsub_v1.PublisherClient()) + @classmethod def from_path(cls, path) -> "Topic": """Parse the `path` and return a new `Topic`.""" From d751e037ac7110f3c3a0fe10023ea27b11c838ea Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:26:50 -0700 Subject: [PATCH 09/55] check envelope for bad request --- pittgoogle/pubsub.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index ee8dc90..bf47f9b 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -615,6 +615,7 @@ class Alert: schema_name: str = field(factory=str) _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) + bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this @@ -622,7 +623,13 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this return cls(msg=msg, schema_name=schema_name) @classmethod - def from_cloud_run(cls, envelope, schema_name=str()): + def from_cloud_run(cls, envelope: dict, schema_name: str = str()): + # check whether received message is valid + if not envelope: + return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + if not isinstance(envelope, dict) or "message" not in envelope: + return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + return cls( msg=_PubsubMessageLike( data=envelope["message"]["data"], From e5503b3423aede91280bb3db9a719c7fb1643d63 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:27:16 -0700 Subject: [PATCH 10/55] add Alert.get --- pittgoogle/pubsub.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index bf47f9b..c2f0844 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -655,6 +655,33 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()): # pass # return self._bytes + def get(self, schema_key: str, return_key_name=False): + # fullkey = self.get(key, return_key=True) + survey_key = self.schema_map.get(schema_key) + + if return_key_name: + if isinstance(survey_key, list): + return survey_key[-1] + return survey_key + + if schema_key in self.dict: + return self.dict.get(schema_key) + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not 
isinstance(survey_key, list): + return + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + @property def dict(self) -> dict: """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. From 878a281f0a6e7f3ca94fed9930ab6420cedf7f8f Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:28:00 -0700 Subject: [PATCH 11/55] clarify docstring --- pittgoogle/pubsub.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index c2f0844..16ab71c 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -251,7 +251,10 @@ def delete(self) -> None: LOGGER.info(f"deleted topic: {self.path}") def publish(self, alert: "Alert", format="json") -> int: - """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`.""" + """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`. + + `format` can be "json" or a schema name. + """ if format == "json": message = json.dumps(alert.dict).encode("utf-8") From 60cbc8d3a22122fa6cee1f541736535dc38753ac Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 15:38:12 -0700 Subject: [PATCH 12/55] move schema maps under the schemas directory --- .../{schema_maps => schemas/maps}/decat.yml | 12 ++++++------ .../{schema_maps => schemas/maps}/elasticc.yml | 18 +++++++++++------- .../{schema_maps => schemas/maps}/ztf.yml | 12 ++++++------ 3 files changed, 23 insertions(+), 19 deletions(-) rename pittgoogle/{schema_maps => schemas/maps}/decat.yml (68%) rename pittgoogle/{schema_maps => schemas/maps}/elasticc.yml (52%) rename pittgoogle/{schema_maps => schemas/maps}/ztf.yml (63%) diff --git a/pittgoogle/schema_maps/decat.yml b/pittgoogle/schemas/maps/decat.yml similarity index 68% rename from pittgoogle/schema_maps/decat.yml rename to pittgoogle/schemas/maps/decat.yml index 068839c..c150e38 100644 --- a/pittgoogle/schema_maps/decat.yml +++ b/pittgoogle/schemas/maps/decat.yml @@ -4,13 +4,13 @@ TOPIC_SYNTAX: decat_yyyymmdd_2021A-0113 # replace yyyymmdd with the date FILTER_MAP: g DECam SDSS c0001 4720.0 1520.0: g r DECam SDSS c0002 6415.0 1480.0: r -objectId: objectid -prvSources: sources +objectid: objectid +prv_sources: sources source: triggersource -sourceId: sourceid -cutoutDifference: diffcutout -cutoutScience: scicutout -cutoutTemplate: refcutout +sourceid: sourceid +cutout_difference: diffcutout +cutout_science: scicutout +cutout_template: refcutout filter: filter mag: mag magerr: magerr diff --git a/pittgoogle/schema_maps/elasticc.yml b/pittgoogle/schemas/maps/elasticc.yml similarity index 52% rename from pittgoogle/schema_maps/elasticc.yml rename to pittgoogle/schemas/maps/elasticc.yml index a0f953f..50852c1 100644 --- a/pittgoogle/schema_maps/elasticc.yml +++ b/pittgoogle/schemas/maps/elasticc.yml @@ -3,16 +3,20 @@ SURVEY_SCHEMA: https://github.com/LSSTDESC/elasticc/tree/main/alert_schema SCHEMA_VERSION: v0_9_1 TOPIC_SYNTAX: FILTER_MAP: -objectId: [diaObject, diaObjectId] -prvSources: prvDiaSources +alertid: alertId +objectid: [diaObject, diaObjectId] source: diaSource -sourceId: [diaSource, diaSourceId] -cutoutScience: none +sourceid: [diaSource, diaSourceId] +prv_sources: prvDiaSources +mjd: midPointTai filter: filterName mag: magpsf magerr: sigmapsf magzp: magzpsci -psFlux: psFlux 
-psFluxErr: psFluxErr -dec: decl +flux: psFlux +fluxerr: psFluxErr ra: ra +dec: decl +cutout_science: +cutout_template: +cutout_difference: diff --git a/pittgoogle/schema_maps/ztf.yml b/pittgoogle/schemas/maps/ztf.yml similarity index 63% rename from pittgoogle/schema_maps/ztf.yml rename to pittgoogle/schemas/maps/ztf.yml index 5f80e1e..4aaf800 100644 --- a/pittgoogle/schema_maps/ztf.yml +++ b/pittgoogle/schemas/maps/ztf.yml @@ -5,13 +5,13 @@ FILTER_MAP: 1: g 2: r 3: i -objectId: objectId -prvSources: prv_candidates +objectid: objectId +prv_sources: prv_candidates source: candidate -sourceId: candid -cutoutDifference: cutoutDifference -cutoutScience: cutoutScience -cutoutTemplate: cutoutTemplate +sourceid: candid +cutout_difference: cutoutDifference +cutout_science: cutoutScience +cutout_template: cutoutTemplate filter: fid mag: magpsf magerr: sigmapsf From 80d74b812ca4d17443c463d407da6a3ebb08dcf2 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 16:20:00 -0700 Subject: [PATCH 13/55] move Alert class and related to independent module --- pittgoogle/alert.py | 281 +++++++++++++++++++++++++++++++++++++++++++ pittgoogle/pubsub.py | 226 +--------------------------------- 2 files changed, 283 insertions(+), 224 deletions(-) create mode 100644 pittgoogle/alert.py diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py new file mode 100644 index 0000000..8b934bf --- /dev/null +++ b/pittgoogle/alert.py @@ -0,0 +1,281 @@ +# -*- coding: UTF-8 -*- +"""Classes to facilitate working with astronomical alerts. + +.. contents:: + :local: + :depth: 2 + +Usage Examples +--------------- + +.. code-block:: python + + import pittgoogle + +Load an alert from disk: + +.. code-block:: python + + [TODO] + +Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: + +.. code-block:: python + + # flask is used to work with HTTP requests, which trigger Cloud Run modules + # the request contains the Pub/Sub message, which contains the alert packet + from flask import request + + alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + +API +---- + +""" +import importlib.resources +import io +import logging +from typing import TYPE_CHECKING, Optional, Union + +import fastavro +import yaml +from attrs import define, field +from google.cloud import pubsub_v1 + +from .exceptions import OpenAlertError +from .utils import Cast + +if TYPE_CHECKING: + import google.protobuf.timestamp_pb2 + import google._upb._message + import pandas as pd + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. 
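+
+    A hypothetical sketch of wrapping an alert packet loaded from disk (the file name is
+    illustrative)::
+
+        with open("ztf_alert.avro", "rb") as f:
+            msg_like = _PubsubMessageLike(data=f.read())
+        alert = Alert(msg=msg_like, schema_name="ztf")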
+ """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) + + +@define(kw_only=True) +class Alert: + """Pitt-Google container for an astronomical alert. + + Alerts are typically loaded from a Pub/Sub message but may also be loaded from a file. + It is recommended to instantiate an `Alert` using one of the `from_*` methods. + + All parameters are keyword only. + + Parameters + ------------ + bytes : `bytes`, optional + The message payload, as returned by Pub/Sub. It may be Avro or JSON serialized depending + on the topic. + dict : `dict`, optional + The message payload as a dictionary. + metadata : `dict`, optional + The message metadata. + msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional + The Pub/Sub message object, documented at + ``__. + schema_name : `str` + One of (case insensitive): + - ztf + - ztf.lite + - elasticc.v0_9_1.alert + - elasticc.v0_9_1.brokerClassification + Schema name of the alert. Used for unpacking. If not provided, some properties of the + `Alert` may not be available. + """ + + # _bytes: Optional[ByteString] = field(default=None) + _dict: Optional[dict] = field(default=None) + _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + default=None + ) + # _metadata: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + """Incoming Pub/Sub message object.""" + _dataframe: Optional["pd.DataFrame"] = field(default=None) + schema_name: str = field(factory=str, converter=str.lower) + _schema_map: Optional[dict] = field(default=None) + # _metadata: Optional[dict] = field(default=None) + bad_request: Union[bool, tuple[str, int]] = field(default=False) + + @classmethod + def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" + return cls(msg=msg, schema_name=schema_name) + + @classmethod + def from_cloud_run(cls, envelope: dict, schema_name: str = str()): + # check whether received message is valid + if not envelope: + return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + if not isinstance(envelope, dict) or "message" not in envelope: + return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + + return cls( + msg=_PubsubMessageLike( + data=envelope["message"]["data"], + attributes=envelope["message"]["attributes"], + message_id=envelope["message"]["message_id"], + publish_time=envelope["message"]["publish_time"], + ordering_key=envelope["message"]["ordering_key"], + ), + schema_name=schema_name, + ) + + # @property + # def bytes(self) -> bytes: + # """Message payload in original format (Avro or JSON serialized bytes).""" + # if self._bytes is None: + # # add try-except when we know what we're looking for + # self._bytes = self.msg.data + # if self._bytes is None: + # # if we add a "path" attribute for the path to an avro file on disk + # # we can load it like this: + # # with open(self.path, "rb") as f: + # # self._bytes = f.read() + # pass + # return self._bytes + + def get(self, schema_key: str, return_key_name=False): + # fullkey = self.get(key, return_key=True) + survey_key = self.schema_map.get(schema_key) + + if return_key_name: + if isinstance(survey_key, list): + return survey_key[-1] + return survey_key + + 
if schema_key in self.dict: + return self.dict.get(schema_key) + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not isinstance(survey_key, list): + return + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + + @property + def dict(self) -> dict: + """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. + + Raises + ------ + :class:`pittgoogle.exceptions.OpenAlertError` + if unable to deserialize the alert bytes. + """ + if self._dict is not None: + return self._dict + + if self.schema_name.startswith("elasticc"): + # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict + schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" + schema = fastavro.schema.load_schema(schemapath) + with io.BytesIO(self.msg.data) as fin: + self._dict = fastavro.schemaless_reader(fin, schema) + return self._dict + + if self.schema_name == "": + LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") + + # assume this is a ztf or ztf-lite alert + # this should be rewritten to catch specific errors + # for now, just try avro then json, catching basically all errors in the process + try: + self._dict = Cast.avro_to_dict(self.msg.data) + except Exception: + try: + self._dict = Cast.json_to_dict(self.msg.data) + except Exception: + raise OpenAlertError("failed to deserialize the alert bytes") + return self._dict + + @property + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". + + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes + + @property + def dataframe(self) -> "pd.DataFrame": + if self._dataframe is None: + import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + + if self.schema_name.endswith(".lite"): + src_df = pd.DataFrame(self.dict["source"], index=[0]) + prvs_df = pd.DataFrame(self.dict["prvSources"]) + else: + src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + + return self._dataframe + + @property + def schema_map(self) -> dict: + if self._schema_map is None: + if self.schema_name == str(): + raise TypeError("no alert schema_name provided. unable to load schema map.") + survey = self.schema_name.split(".")[0] + path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + self._schema_map = yaml.safe_load(path.read_text()) + return self._schema_map + + # @property + # def metadata(self) -> dict: + # """Pub/Sub message metadata. + + # Includes + + # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message + # - attributes, which is a dict that typically includes the attributes of the + # incoming message and possibly additional entries added by the user in the meantime. + + # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
+ # """ + # if self._metadata is None: + # self._metadata = { + # "message_id": self.msg.message_id, + # "publish_time": self.msg.publish_time, + # # ordering must be enabled on the subscription for this to be useful + # "ordering_key": self.msg.ordering_key, + # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc + # # typically includes self.msg.attributes plus additional items added by the user + # "attributes": self.attributes, + # } + # return self._metadata diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 16ab71c..f0430c0 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -92,7 +92,7 @@ def my_batch_callback(results): from google.api_core.exceptions import NotFound from google.cloud import pubsub_v1 -from .auth import Auth +from . import Auth, Alert from .exceptions import OpenAlertError from .utils import Cast @@ -106,7 +106,7 @@ def my_batch_callback(results): PACKAGE_DIR = importlib.resources.files(__package__) -def msg_callback_example(alert: "Alert") -> "Response": +def msg_callback_example(alert: Alert) -> "Response": print(f"processing message: {alert.metadata['message_id']}") return Response(ack=True, result=alert.dict) @@ -560,228 +560,6 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: return self.subscription.pull_batch(max_messages=max_messages) -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. - """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) - - -@define(kw_only=True) -class Alert: - """Pitt-Google container for a Pub/Sub message. - - Typical usage is to instantiate an `Alert` using only a `msg`, and then the other attributes - will be automatically extracted and returned (lazily). - - All parameters are keyword only. - - Parameters - ------------ - bytes : `bytes`, optional - The message payload, as returned by Pub/Sub. It may be Avro or JSON serialized depending - on the topic. - dict : `dict`, optional - The message payload as a dictionary. - metadata : `dict`, optional - The message metadata. - msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional - The Pub/Sub message object, documented at - ``__. - schema_name : `str` - One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". - Schema name of the alert. Used for unpacking. If not provided, some properties of the - `Alert` may not be available. 
- """ - - # _bytes: Optional[ByteString] = field(default=None) - _dict: Optional[dict] = field(default=None) - _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( - default=None - ) - # _metadata: Optional[dict] = field(default=None) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) - """Incoming Pub/Sub message object.""" - _dataframe: Optional["pd.DataFrame"] = field(default=None) - schema_name: str = field(factory=str) - _schema_map: Optional[dict] = field(default=None) - # _metadata: Optional[dict] = field(default=None) - bad_request: Union[bool, tuple[str, int]] = field(default=False) - - @classmethod - def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this - """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" - return cls(msg=msg, schema_name=schema_name) - - @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()): - # check whether received message is valid - if not envelope: - return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) - if not isinstance(envelope, dict) or "message" not in envelope: - return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) - - return cls( - msg=_PubsubMessageLike( - data=envelope["message"]["data"], - attributes=envelope["message"]["attributes"], - message_id=envelope["message"]["message_id"], - publish_time=envelope["message"]["publish_time"], - ordering_key=envelope["message"]["ordering_key"], - ), - schema_name=schema_name, - ) - - # @property - # def bytes(self) -> bytes: - # """Message payload in original format (Avro or JSON serialized bytes).""" - # if self._bytes is None: - # # add try-except when we know what we're looking for - # self._bytes = self.msg.data - # if self._bytes is None: - # # if we add a "path" attribute for the path to an avro file on disk - # # we can load it like this: - # # with open(self.path, "rb") as f: - # # self._bytes = f.read() - # pass - # return self._bytes - - def get(self, schema_key: str, return_key_name=False): - # fullkey = self.get(key, return_key=True) - survey_key = self.schema_map.get(schema_key) - - if return_key_name: - if isinstance(survey_key, list): - return survey_key[-1] - return survey_key - - if schema_key in self.dict: - return self.dict.get(schema_key) - - if isinstance(survey_key, str): - return self.dict.get(survey_key) - - if not isinstance(survey_key, list): - return - - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) - - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) - - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) - - @property - def dict(self) -> dict: - """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. - - Raises - ------ - :class:`pittgoogle.exceptions.OpenAlertError` - if unable to deserialize the alert bytes. - """ - if self._dict is not None: - return self._dict - - if self.schema_name.startswith("elasticc"): - # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict - schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" - schema = fastavro.schema.load_schema(schemapath) - with io.BytesIO(self.msg.data) as fin: - self._dict = fastavro.schemaless_reader(fin, schema) - return self._dict - - if self.schema_name == "": - LOGGER.warning("no alert schema_name provided. 
attempting to deserialize without it.") - - # assume this is a ztf or ztf-lite alert - # this should be rewritten to catch specific errors - # for now, just try avro then json, catching basically all errors in the process - try: - self._dict = Cast.avro_to_dict(self.msg.data) - except Exception: - try: - self._dict = Cast.json_to_dict(self.msg.data) - except Exception: - raise OpenAlertError("failed to deserialize the alert bytes") - return self._dict - - @property - def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: - """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - - If None, this will be set to `self.msg.attributes`. - Update as desired. - When publishing, this will be sent as the message attributes. - """ - if self._attributes is None: - self._attributes = self.msg.attributes - return self._attributes - - @property - def dataframe(self) -> "pd.DataFrame": - if self._dataframe is None: - import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run - - if self.schema_name.endswith(".lite"): - src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prvSources"]) - else: - src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) - - return self._dataframe - - @property - def schema_map(self) -> dict: - if self._schema_map is None: - if self.schema_name == str(): - raise TypeError("no alert schema_name provided. unable to load schema map.") - survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schema_maps/{survey}.yml" - self._schema_map = yaml.safe_load(path.read_text()) - return self._schema_map - - # @property - # def metadata(self) -> dict: - # """Pub/Sub message metadata. - - # Includes - - # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message - # - attributes, which is a dict that typically includes the attributes of the - # incoming message and possibly additional entries added by the user in the meantime. - - # *To be useful, ordering_key requires that ordering is enabled on the subscription. - # """ - # if self._metadata is None: - # self._metadata = { - # "message_id": self.msg.message_id, - # "publish_time": self.msg.publish_time, - # # ordering must be enabled on the subscription for this to be useful - # "ordering_key": self.msg.ordering_key, - # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc - # # typically includes self.msg.attributes plus additional items added by the user - # "attributes": self.attributes, - # } - # return self._metadata - - @define(kw_only=True, frozen=True) class Response: """Container for a response, to be returned by a :meth:`pittgoogle.pubsub.Consumer.msg_callback`. 
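The `Alert.get` logic carried by the patch above resolves Pitt-Google's generic field names through a survey-specific schema map, walking up to three levels of nesting in the alert dictionary. Below is a minimal, self-contained sketch of that lookup; the map entries and the alert payload are hypothetical ZTF-like values, not taken from the package's actual schema maps.

.. code-block:: python

    # hedged sketch of the schema-map lookup in `Alert.get`; all names here are hypothetical
    schema_map = {"objectid": "objectId", "mjd": ["candidate", "jd"]}
    alert_dict = {"objectId": "ZTF23abcdefg", "candidate": {"jd": 2460000.5}}

    def get(key, default=None):
        survey_key = schema_map.get(key)  # str, list[str], or None
        if isinstance(survey_key, str):
            return alert_dict.get(survey_key, default)
        if isinstance(survey_key, list):  # walk one level of nesting per list element
            value = alert_dict
            for name in survey_key:
                if not isinstance(value, dict):
                    return default
                value = value.get(name, default)
            return value
        return default

    assert get("objectid") == "ZTF23abcdefg"
    assert get("mjd") == 2460000.5
    assert get("unknown", "n/a") == "n/a"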
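Likewise, the `dataframe` property concatenates the alert's triggering source with its previous sources into a single light-curve table. A rough equivalent, under the same assumption of made-up ZTF-like field names:

.. code-block:: python

    import pandas as pd

    # hypothetical ZTF-like payload; the real keys come from the survey's schema map
    alert_dict = {
        "candidate": {"jd": 2460000.5, "magpsf": 18.3},  # the source that triggered the alert
        "prv_candidates": [
            {"jd": 2459999.5, "magpsf": 18.5},
            {"jd": 2459998.5, "magpsf": 18.6},
        ],  # previous detections of the same object
    }

    src_df = pd.DataFrame(alert_dict["candidate"], index=[0])  # single-row frame
    prvs_df = pd.DataFrame(alert_dict["prv_candidates"])  # one row per previous detection
    lightcurve = pd.concat([src_df, prvs_df], ignore_index=True)  # three rows total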
From 515f8083842209c6a31b895c4aa318c9d19bfc23 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:20:42 -0700 Subject: [PATCH 14/55] implement BadRequest exception --- pittgoogle/alert.py | 14 ++++++++------ pittgoogle/exceptions.py | 4 ++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 8b934bf..2103e9e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -42,7 +42,7 @@ from attrs import define, field from google.cloud import pubsub_v1 -from .exceptions import OpenAlertError +from .exceptions import BadRequest, OpenAlertError from .utils import Cast if TYPE_CHECKING: @@ -117,7 +117,7 @@ class Alert: schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) - bad_request: Union[bool, tuple[str, int]] = field(default=False) + # bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this @@ -125,12 +125,14 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this return cls(msg=msg, schema_name=schema_name) @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()): - # check whether received message is valid + def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": + # check whether received message is valid, as suggested by Cloud Run docs if not envelope: - return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + # return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + raise BadRequest("Bad Request: no Pub/Sub message received") if not isinstance(envelope, dict) or "message" not in envelope: - return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + # return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( msg=_PubsubMessageLike( diff --git a/pittgoogle/exceptions.py b/pittgoogle/exceptions.py index 1c2f58f..a28eef6 100644 --- a/pittgoogle/exceptions.py +++ b/pittgoogle/exceptions.py @@ -1,3 +1,7 @@ # -*- coding: UTF-8 -*- +class BadRequest(Exception): + """Raised when a Flask request json envelope (e.g., from Cloud Run) is invalid.""" + + class OpenAlertError(Exception): """Raised when unable to deserialize a Pub/Sub message payload.""" From 09e2738d870266756500ad9049124ec1b5cd4d25 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:26:24 -0700 Subject: [PATCH 15/55] separate get_key from get and support defaults --- pittgoogle/alert.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 2103e9e..dc38520 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -159,23 +159,19 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # pass # return self._bytes - def get(self, schema_key: str, return_key_name=False): - # fullkey = self.get(key, return_key=True) - survey_key = self.schema_map.get(schema_key) + def get(self, key: str, default: Optional[str] = None): + # if key is found in self.dict, just return the corresponding value + if key in self.dict: + return self.dict.get(key) - if return_key_name: - if isinstance(survey_key, list): - return survey_key[-1] - return survey_key - - if schema_key in self.dict: - return 
self.dict.get(schema_key) + # lookup the key in the schema map + survey_key = self.schema_map.get(key) # str or list[str] if isinstance(survey_key, str): return self.dict.get(survey_key) if not isinstance(survey_key, list): - return + return default if len(survey_key) == 1: return self.dict.get(survey_key[0]) @@ -186,6 +182,23 @@ def get(self, schema_key: str, return_key_name=False): if len(survey_key) == 3: return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + def get_key(self, key, name_only: bool = True): + if key in self.dict: + return key + + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return survey_key + + if not isinstance(survey_key, list): + return + + if name_only: + return survey_key[-1] + + return survey_key + @property def dict(self) -> dict: """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. From b30ec398afcd89594a44b2eba1c3d7b42f1b95a9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:35:46 -0700 Subject: [PATCH 16/55] change schema_map keys to snake_case yaml files were updated in 9c9a140a2a439016cdb97914dafc97984690cadf --- pittgoogle/alert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index dc38520..8a3e383 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -253,10 +253,10 @@ def dataframe(self) -> "pd.DataFrame": if self.schema_name.endswith(".lite"): src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prvSources"]) + prvs_df = pd.DataFrame(self.dict["prv_sources"]) else: src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prv_sources"]]) self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) return self._dataframe From 5d24cbbbe1d10347acaaa9899f2c990ddef87034 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:50:24 -0700 Subject: [PATCH 17/55] add properties for objectid, sourceid, alertid --- pittgoogle/alert.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 8a3e383..5d63011 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -159,6 +159,21 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # pass # return self._bytes + @property + def alertid(self) -> Union[str, int]: + """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" + return self.get("alertid", self.sourceid) + + @property + def sourceid(self) -> Union[str, int]: + """Convenience property for the source ID. The "source" is the detection that triggered the alert.""" + return self.get("sourceid") + + @property + def objectid(self) -> Union[str, int]: + """Convenience property for the object ID. 
The "object" represents a collection of sources, as determined by the survey.""" + return self.get("objectid") + def get(self, key: str, default: Optional[str] = None): # if key is found in self.dict, just return the corresponding value if key in self.dict: From 9f803fc9257b060068997481af9386fd1b25b8be Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:49:20 -0700 Subject: [PATCH 18/55] update cloud run example, be lenient with metadata --- pittgoogle/alert.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 5d63011..91e7bd3 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -22,11 +22,29 @@ .. code-block:: python + import pittgoogle # flask is used to work with HTTP requests, which trigger Cloud Run modules # the request contains the Pub/Sub message, which contains the alert packet - from flask import request + from flask import Flask, request + + app = Flask(__name__) + + # function that receives the request + @app.route("/", methods=["POST"]) + def index(): + + try: + # unpack the alert + # if the request does not contain a valid message, this raises a `BadRequest` + alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + + except pg.exceptions.BadRequest as err: + # return the error text and an HTTP 400 Bad Request code + return err.text, 400 - alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + # continue processing the alert + # when finished, return an empty string and an HTTP success code + return "", 204 API ---- @@ -136,11 +154,12 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": return cls( msg=_PubsubMessageLike( + # data is required. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], - attributes=envelope["message"]["attributes"], - message_id=envelope["message"]["message_id"], - publish_time=envelope["message"]["publish_time"], - ordering_key=envelope["message"]["ordering_key"], + attributes=envelope["message"].get("attributes"), + message_id=envelope["message"].get("message_id"), + publish_time=envelope["message"].get("publish_time"), + ordering_key=envelope["message"].get("ordering_key"), ), schema_name=schema_name, ) From a9f4290b452699ebe1ec5e8b64659d3891be9c9c Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:50:16 -0700 Subject: [PATCH 19/55] add from_path class method --- pittgoogle/alert.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 91e7bd3..17753e2 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -8,15 +8,14 @@ Usage Examples --------------- -.. code-block:: python - - import pittgoogle - Load an alert from disk: .. 
code-block:: python - [TODO] + import pittgoogle + + path = "path/to/ztf_alert.avro" # point this to a file containing an alert + alert = pittgoogle.Alert.from_path(path, schema_name="ztf") Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: @@ -164,6 +163,12 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": schema_name=schema_name, ) + @classmethod + def from_path(cls, path, schema_name=str()) -> "Alert": + with open(path, "rb") as f: + bytes = f.read() + return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + # @property # def bytes(self) -> bytes: # """Message payload in original format (Avro or JSON serialized bytes).""" From 07b591db789cae605cdfebda755a74f9831bf246 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:50:54 -0700 Subject: [PATCH 20/55] bugfix schema maps dir --- pittgoogle/alert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 17753e2..d6fb835 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -306,7 +306,7 @@ def schema_map(self) -> dict: if self.schema_name == str(): raise TypeError("no alert schema_name provided. unable to load schema map.") survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map From 1a373b50897e36505549036253d436f01c31d93f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 19:20:26 -0700 Subject: [PATCH 21/55] cleanup --- pittgoogle/alert.py | 50 +++------------------------------------------ 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d6fb835..fd4b23e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -65,7 +65,7 @@ def index(): if TYPE_CHECKING: import google.protobuf.timestamp_pb2 import google._upb._message - import pandas as pd + import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run LOGGER = logging.getLogger(__name__) @@ -122,22 +122,18 @@ class Alert: `Alert` may not be available. 
""" - # _bytes: Optional[ByteString] = field(default=None) _dict: Optional[dict] = field(default=None) _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - # _metadata: Optional[dict] = field(default=None) msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) """Incoming Pub/Sub message object.""" _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) - # _metadata: Optional[dict] = field(default=None) - # bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod - def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" return cls(msg=msg, schema_name=schema_name) @@ -145,15 +141,13 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # check whether received message is valid, as suggested by Cloud Run docs if not envelope: - # return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) raise BadRequest("Bad Request: no Pub/Sub message received") if not isinstance(envelope, dict) or "message" not in envelope: - # return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( msg=_PubsubMessageLike( - # data is required. the rest should be present in the message, but let's be lenient + # this class requires data. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), message_id=envelope["message"].get("message_id"), @@ -169,20 +163,6 @@ def from_path(cls, path, schema_name=str()) -> "Alert": bytes = f.read() return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) - # @property - # def bytes(self) -> bytes: - # """Message payload in original format (Avro or JSON serialized bytes).""" - # if self._bytes is None: - # # add try-except when we know what we're looking for - # self._bytes = self.msg.data - # if self._bytes is None: - # # if we add a "path" attribute for the path to an avro file on disk - # # we can load it like this: - # # with open(self.path, "rb") as f: - # # self._bytes = f.read() - # pass - # return self._bytes - @property def alertid(self) -> Union[str, int]: """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" @@ -309,27 +289,3 @@ def schema_map(self) -> dict: path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map - - # @property - # def metadata(self) -> dict: - # """Pub/Sub message metadata. - - # Includes - - # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message - # - attributes, which is a dict that typically includes the attributes of the - # incoming message and possibly additional entries added by the user in the meantime. - - # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
- # """ - # if self._metadata is None: - # self._metadata = { - # "message_id": self.msg.message_id, - # "publish_time": self.msg.publish_time, - # # ordering must be enabled on the subscription for this to be useful - # "ordering_key": self.msg.ordering_key, - # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc - # # typically includes self.msg.attributes plus additional items added by the user - # "attributes": self.attributes, - # } - # return self._metadata From d415127227320890a05164fb7c9f7240560a96ab Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 19:22:33 -0700 Subject: [PATCH 22/55] reorganize methods, properties, etc. --- pittgoogle/alert.py | 137 ++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index fd4b23e..18115f6 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -122,16 +122,17 @@ class Alert: `Alert` may not be available. """ - _dict: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) - """Incoming Pub/Sub message object.""" + _dict: Optional[dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) + # ---- class methods ---- # @classmethod def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" @@ -163,60 +164,18 @@ def from_path(cls, path, schema_name=str()) -> "Alert": bytes = f.read() return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + # ---- properties ---- # @property - def alertid(self) -> Union[str, int]: - """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" - return self.get("alertid", self.sourceid) - - @property - def sourceid(self) -> Union[str, int]: - """Convenience property for the source ID. The "source" is the detection that triggered the alert.""" - return self.get("sourceid") - - @property - def objectid(self) -> Union[str, int]: - """Convenience property for the object ID. 
The "object" represents a collection of sources, as determined by the survey.""" - return self.get("objectid") - - def get(self, key: str, default: Optional[str] = None): - # if key is found in self.dict, just return the corresponding value - if key in self.dict: - return self.dict.get(key) - - # lookup the key in the schema map - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return self.dict.get(survey_key) - - if not isinstance(survey_key, list): - return default - - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) - - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) - - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) - - def get_key(self, key, name_only: bool = True): - if key in self.dict: - return key - - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return survey_key - - if not isinstance(survey_key, list): - return - - if name_only: - return survey_key[-1] + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - return survey_key + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes @property def dict(self) -> dict: @@ -253,18 +212,6 @@ def dict(self) -> dict: raise OpenAlertError("failed to deserialize the alert bytes") return self._dict - @property - def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: - """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - - If None, this will be set to `self.msg.attributes`. - Update as desired. - When publishing, this will be sent as the message attributes. - """ - if self._attributes is None: - self._attributes = self.msg.attributes - return self._attributes - @property def dataframe(self) -> "pd.DataFrame": if self._dataframe is None: @@ -280,6 +227,21 @@ def dataframe(self) -> "pd.DataFrame": return self._dataframe + @property + def alertid(self) -> Union[str, int]: + """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" + return self.get("alertid", self.sourceid) + + @property + def objectid(self) -> Union[str, int]: + """Convenience property for the object ID. The "object" represents a collection of sources, as determined by the survey.""" + return self.get("objectid") + + @property + def sourceid(self) -> Union[str, int]: + """Convenience property for the source ID. 
The "source" is the detection that triggered the alert.""" + return self.get("sourceid") + @property def schema_map(self) -> dict: if self._schema_map is None: @@ -289,3 +251,44 @@ def schema_map(self) -> dict: path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map + + # ---- methods ---- # + def get(self, key: str, default: Optional[str] = None): + # if key is found in self.dict, just return the corresponding value + if key in self.dict: + return self.dict.get(key) + + # lookup the key in the schema map + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not isinstance(survey_key, list): + return default + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + + def get_key(self, key, name_only: bool = True): + if key in self.dict: + return key + + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return survey_key + + if not isinstance(survey_key, list): + return + + if name_only: + return survey_key[-1] + + return survey_key From b1ef685d60481b41fa86f6b68cc6cb173074111c Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 22:42:31 -0700 Subject: [PATCH 23/55] cleanup imports --- pittgoogle/__init__.py | 7 ++++++- pittgoogle/alert.py | 8 +++++--- pittgoogle/pubsub.py | 18 +++++------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 47238f1..82859bc 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -9,7 +9,12 @@ except ImportError: # for Python<3.8 import importlib_metadata as metadata -from . import auth, bigquery, exceptions, pubsub, utils +from .auth import Auth +from .alert import Alert +from .bigquery import Table +from .pubsub import Topic, Subscription, Consumer +from . import auth, alert, bigquery, exceptions, pubsub, utils + __version__ = metadata.version("pittgoogle-client") diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 18115f6..877ca28 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -57,14 +57,14 @@ def index(): import fastavro import yaml from attrs import define, field -from google.cloud import pubsub_v1 from .exceptions import BadRequest, OpenAlertError from .utils import Cast if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 import google._upb._message + import google.cloud.pubsub_v1 + import google.protobuf.timestamp_pb2 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run @@ -122,7 +122,9 @@ class Alert: `Alert` may not be available. 
""" - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + msg: Optional[Union["google.cloud.pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field( + default=None + ) """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index f0430c0..3e21293 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -83,23 +83,15 @@ def my_batch_callback(results): import queue from concurrent.futures import ThreadPoolExecutor from time import sleep -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union import fastavro -import yaml -from attrs import converters, define, field +import google.cloud.pubsub_v1 as pubsub_v1 +from attrs import define, field from attrs.validators import gt, instance_of, is_callable, optional from google.api_core.exceptions import NotFound -from google.cloud import pubsub_v1 -from . import Auth, Alert -from .exceptions import OpenAlertError -from .utils import Cast - -if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 - import google._upb._message - import pandas as pd +from . import Alert, Auth LOGGER = logging.getLogger(__name__) @@ -578,5 +570,5 @@ class Response: If there is no batch callback the results will be lost. """ - ack: bool = field(default=True, converter=converters.to_bool) + ack: bool = field(default=True, converter=bool) result: Any = field(default=None) From 73bf3aa0727a49b8eed32ed9ce67c5a07b15c4f3 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 22:46:57 -0700 Subject: [PATCH 24/55] bugfix last --- pittgoogle/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 82859bc..0a4b9ab 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -11,7 +11,6 @@ from .auth import Auth from .alert import Alert -from .bigquery import Table from .pubsub import Topic, Subscription, Consumer from . import auth, alert, bigquery, exceptions, pubsub, utils From c7a7033aa3b3c66f814c9ec50a85de22c7b7152b Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:46:01 -0700 Subject: [PATCH 25/55] add types_ module with Schema, PubsubMessageLike --- pittgoogle/alert.py | 33 ++++------------ pittgoogle/types_.py | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 pittgoogle/types_.py diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 877ca28..95c6056 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -72,25 +72,6 @@ def index(): PACKAGE_DIR = importlib.resources.files(__package__) -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. 
- """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) - - @define(kw_only=True) class Alert: """Pitt-Google container for an astronomical alert. @@ -122,9 +103,9 @@ class Alert: `Alert` may not be available. """ - msg: Optional[Union["google.cloud.pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field( - default=None - ) + msg: Optional[ + Union["google.cloud.pubsub_v1.types.PubsubMessage", types_.PubsubMessageLike] + ] = field(default=None) """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None @@ -149,7 +130,7 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( - msg=_PubsubMessageLike( + msg=types_.PubsubMessageLike( # this class requires data. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), @@ -163,8 +144,10 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": @classmethod def from_path(cls, path, schema_name=str()) -> "Alert": with open(path, "rb") as f: - bytes = f.read() - return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + bytes_ = f.read() + return cls( + msg=types_.PubsubMessageLike(data=bytes_), schema_name=schema_name, path=Path(path) + ) # ---- properties ---- # @property diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py new file mode 100644 index 0000000..58f2ada --- /dev/null +++ b/pittgoogle/types_.py @@ -0,0 +1,94 @@ +# -*- coding: UTF-8 -*- +"""Functions to support working with alerts and related data.""" +import importlib.resources +import json +import logging +from base64 import b64decode, b64encode +from collections import OrderedDict +from io import BytesIO +from typing import TYPE_CHECKING, Optional + +import fastavro +import pandas as pd +import yaml +from astropy.table import Table +from astropy.time import Time +from attrs import define, field + +if TYPE_CHECKING: + import google.protobuf.timestamp_pb2 + from pathlib import Path + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(kw_only=True) +class Schema: + """Class for an individual schema. + + This class is not intended to be used directly. Instead, get a schema from the registry: + `pittgoogle.registry.Schemas`. + """ + + name: str = field() + description: str = field() + path: Optional["Path"] = field(default=None) + _map: Optional[dict] = field(default=None, init=False) + _avsc: Optional[dict] = field(default=None, init=False) + + @property + def survey(self) -> str: + """Name of the survey. 
This is the first block (separated by ".") in the schema's name."""
+        return self.name.split(".")[0]
+
+    @property
+    def definition(self) -> str:
+        """Pointer (e.g., URL) to the survey's schema definition."""
+        return self.map["SURVEY_SCHEMA"]
+
+    @property
+    def map(self) -> dict:
+        """Mapping of Pitt-Google's generic field names to survey-specific field names."""
+        if self._map is None:
+            yml = PACKAGE_DIR / f"schemas/maps/{self.survey}.yml"
+            try:
+                self._map = yaml.safe_load(yml.read_text())
+            except FileNotFoundError:
+                raise ValueError(f"no schema map found for schema name '{self.name}'")
+        return self._map
+
+    @property
+    def avsc(self) -> Optional[dict]:
+        """The Avro schema loaded from the file at `self.path`, or None if a valid file cannot be found."""
+        # if the schema has already been loaded, return it
+        if self._avsc is not None:
+            return self._avsc
+
+        # if self.path does not point to an existing avro schema file, return None
+        if (self.path is None) or (self.path.suffix != ".avsc") or (not self.path.is_file()):
+            return None
+
+        # load the schema and return it
+        self._avsc = fastavro.schema.load_schema(self.path)
+        return self._avsc
+
+
+@define(frozen=True)
+class PubsubMessageLike:
+    """Container for an incoming Pub/Sub message that mimics a `google.cloud.pubsub_v1.types.PubsubMessage`.
+
+    It is convenient for the :class:`pittgoogle.Alert` class to work with a message as a
+    `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do
+    not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or
+    an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class
+    is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`.
+    This object is then assigned to the `msg` attribute of the `Alert`.
+    """
+
+    data: bytes = field()
+    attributes: dict = field(factory=dict)
+    message_id: Optional[str] = field(default=None)
+    publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None)
+    ordering_key: Optional[str] = field(default=None)

From 65aa7cc7a4652ed3075fd6483e653332e2c3b4b0 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 02:47:21 -0700
Subject: [PATCH 26/55] add registry module with ProjectIds, Schemas

---
 pittgoogle/registry.py | 89 ++++++++++++++++++++++++++++++++++++++++++
 pittgoogle/utils.py    | 18 ---------
 2 files changed, 89 insertions(+), 18 deletions(-)
 create mode 100644 pittgoogle/registry.py

diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py
new file mode 100644
index 0000000..ba2a62a
--- /dev/null
+++ b/pittgoogle/registry.py
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+"""Pitt-Google registries."""
+import importlib.resources
+import logging
+from typing import ClassVar
+
+from attrs import define
+
+from . 
import types_ +from .exceptions import SchemaNotFoundError + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(frozen=True) +class ProjectIds: + """Registry of Google Cloud Project IDs.""" + + pittgoogle: ClassVar[str] = "ardent-cycling-243415" + """Pitt-Google's production project.""" + + pittgoogle_dev: ClassVar[str] = "avid-heading-329016" + """Pitt-Google's development project.""" + + # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" + # """Pitt-Google's billing project.""" + + elasticc: ClassVar[str] = "elasticc-challenge" + """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" + + +@define(frozen=True) +class Schemas: + """Registry of schemas used by Pitt-Google.""" + + # dict defining the schemas in the registry + # naming conventions: + # - schema names are expected to start with the name of the survey + # - if the survey has more than one schema, the survey name should be followed by a ".", + # followed by schema-specific specifier(s) + # - if an avro schema file is being registered with the schema (using the `path` arg), it is + # recommended that the file have the same name (path stem) as the schema. the file name + # must end with ".avsc". + dict: ClassVar[dict] = { + "elasticc.v0_9_1.alert": types_.Schema( + name="elasticc.v0_9_1.alert", + description="Avro schema of alerts published by ELAsTiCC.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + ), + "elasticc.v0_9_1.brokerClassification": types_.Schema( + name="elasticc.v0_9_1.brokerClassification", + description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + ), + "ztf": types_.Schema( + name="ztf", + description=( + "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " + "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " + "both cases." # [TODO] + ), + path=None, + ), + } + """Dict defining the schemas in the registry.""" + + @classmethod + def names(cls) -> list[str]: + """Return the names of all registered schemas.""" + return list(cls.dict.keys()) + + @classmethod + def get(cls, schema_name: str) -> types_.Schema: + """Return the registered schema called `schema_name`. + + Raises + ------ + :class:`pittgoogle.exceptions.SchemaNotFoundError` + if a schema called `schema_name` is not found + """ + # if there is no registered schema with this name, raise an error + schema = cls.dict.get(schema_name) + if schema is None: + raise SchemaNotFoundError( + f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." 
+ ) + return schema diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index d2a77ef..f185135 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -5,7 +5,6 @@ from base64 import b64decode, b64encode from collections import OrderedDict from io import BytesIO -from typing import ClassVar import fastavro import pandas as pd @@ -16,23 +15,6 @@ LOGGER = logging.getLogger(__name__) -@define -class ProjectIds: - """Registry of Google Cloud Project IDs.""" - - pittgoogle: ClassVar[str] = "ardent-cycling-243415" - """Pitt-Google's production project.""" - - pittgoogle_dev: ClassVar[str] = "avid-heading-329016" - """Pitt-Google's development project.""" - - # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" - # """Pitt-Google's billing project.""" - - elasticc: ClassVar[str] = "elasticc-challenge" - """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" - - @define class Cast: """Methods to convert data types.""" From 16e3ffaf44555489a763b823d739c2f44b34e82b Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:48:13 -0700 Subject: [PATCH 27/55] define SchemaNotFoundError --- pittgoogle/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pittgoogle/exceptions.py b/pittgoogle/exceptions.py index a28eef6..9ef37f7 100644 --- a/pittgoogle/exceptions.py +++ b/pittgoogle/exceptions.py @@ -5,3 +5,7 @@ class BadRequest(Exception): class OpenAlertError(Exception): """Raised when unable to deserialize a Pub/Sub message payload.""" + + +class SchemaNotFoundError(Exception): + """Raised when a schema with a given name is not found in the registry.""" From 8bdd621ef54add025ce2f535a40b3eb9fbcd0ec5 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:58:17 -0700 Subject: [PATCH 28/55] use the new Schema class --- pittgoogle/alert.py | 64 +++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 95c6056..c9b7876 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -112,8 +112,8 @@ class Alert: ) _dict: Optional[dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) - schema_name: str = field(factory=str, converter=str.lower) - _schema_map: Optional[dict] = field(default=None) + schema_name: Optional[str] = field(default=None) + _schema: Optional[types_.Schema] = field(default=None, init=False) # ---- class methods ---- # @classmethod @@ -174,19 +174,33 @@ def dict(self) -> dict: if self._dict is not None: return self._dict - if self.schema_name.startswith("elasticc"): - # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict - schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" - schema = fastavro.schema.load_schema(schemapath) + # deserialize self.msg.data (avro or json bytestring) into a dict. + # if self.msg.data is either (1) json; or (2) avro that contains the schema in the header, + # self.schema is not required for deserialization, so we want to be lenient. + # if self.msg.data is schemaless avro, deserialization requires self.schema.avsc to exist. + # currently, there is a clean separation between surveys: + # elasticc always requires self.schema.avsc; ztf never does. + # we'll check the survey name from self.schema.survey; but first we need to check whether + # the schema exists so we can try to continue without one instead of raising an error. + # we may want or need to handle this differently in the future. 
+ try: + self.schema + except SchemaNotFoundError as exc: + LOGGER.warning(f"schema not found. attempting to deserialize without it. {exc}") + avro_schema = None + else: + if self.schema.survey in ["elasticc"]: + avro_schema = self.schema.avsc + else: + avro_schema = None + + # if we have an avro schema, use it to deserialize and return + if avro_schema: with io.BytesIO(self.msg.data) as fin: - self._dict = fastavro.schemaless_reader(fin, schema) + self._dict = fastavro.schemaless_reader(fin, avro_schema) return self._dict - if self.schema_name == "": - LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") - - # assume this is a ztf or ztf-lite alert - # this should be rewritten to catch specific errors + # [TODO] this should be rewritten to catch specific errors # for now, just try avro then json, catching basically all errors in the process try: self._dict = Cast.avro_to_dict(self.msg.data) @@ -228,14 +242,24 @@ def sourceid(self) -> Union[str, int]: return self.get("sourceid") @property - def schema_map(self) -> dict: - if self._schema_map is None: - if self.schema_name == str(): - raise TypeError("no alert schema_name provided. unable to load schema map.") - survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" - self._schema_map = yaml.safe_load(path.read_text()) - return self._schema_map + def schema(self) -> types_.Schema: + """Loads the schema from the registry :class:`pittgoogle.registry.Schemas`. + + Raises + ------ + :class:`pittgoogle.exceptions.SchemaNotFoundError` + if the `schema_name` is not supplied or a schema with this name is not found + """ + if self._schema is not None: + return self._schema + + # need to load the schema. raise an error if no schema_name given + if self.schema_name is None: + raise SchemaNotFoundError("a schema_name is required") + + # this also may raise SchemaNotFoundError + self._schema = registry.Schemas.get(self.schema_name) + return self._schema # ---- methods ---- # def get(self, key: str, default: Optional[str] = None): From afb43f936013956edbee311a39fbbfdf10078e93 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:59:58 -0700 Subject: [PATCH 29/55] refactor Alert.get and add Alert.get_key --- pittgoogle/alert.py | 101 ++++++++++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index c9b7876..d9e5334 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -262,42 +262,87 @@ def schema(self) -> types_.Schema: return self._schema # ---- methods ---- # - def get(self, key: str, default: Optional[str] = None): - # if key is found in self.dict, just return the corresponding value - if key in self.dict: - return self.dict.get(key) - - # lookup the key in the schema map - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return self.dict.get(survey_key) + def get(self, field: str, default: Any = None) -> Any: + """Return the value of `field` in this alert. + + The keys in the alert dictionary :attr:`pittgoogle.alert.Alert.dict` are survey-specific field names. + This method allows you to `get` values from the dict using generic names that will work across + surveys. `self.schema.map` is the mapping of generic -> survey-specific names. + To access a field using a survey-specific name, get it directly from the alert `dict`. + + Parameters + ---------- + field : str + Name of a field in the alert's schema. 
This must be one of the keys in the dict `self.schema.map`. + default : str or None + Default value to be returned if the field is not found. + + Returns + ------- + value : any + Value in the :attr:`pittgoogle.alert.Alert.dict` corresponding to this field. + """ + survey_field = self.schema.map.get(field) # str, list[str], or None - if not isinstance(survey_key, list): + if survey_field is None: return default - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) + if isinstance(survey_field, str): + return self.dict.get(survey_field, default) - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) + # if survey_field is not one of the expected types, the schema map is malformed + # maybe this was intentional, but we don't know how to handle it here + if not isinstance(survey_field, list): + raise TypeError( + f"field lookup not implemented for a schema-map value of type {type(survey_field)}" + ) - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + # the list must have more than 1 item, else it would be a single str + if len(survey_field) == 2: + try: + return self.dict[survey_field[0]][survey_field[1]] + except KeyError: + return default - def get_key(self, key, name_only: bool = True): - if key in self.dict: - return key + if len(survey_field) == 3: + try: + return self.dict[survey_field[0]][survey_field[1]][survey_field[2]] + except KeyError: + return default - survey_key = self.schema_map.get(key) # str or list[str] + raise NotImplementedError( + f"field lookup not implemented for depth {len(survey_field)} (key = {survey_field})" + ) - if isinstance(survey_key, str): - return survey_key + def get_key( + self, field: str, name_only: bool = False, default: Optional[str] = None + ) -> Optional[Union[str, list[str]]]: + """Return the survey-specific field name. + + Parameters + ---------- + field : str + Generic field name whose survey-specific name is to be returned. This must be one of the + keys in the dict `self.schema.map`. + name_only : bool + In case the survey-specific field name is nested below the top level, whether to return + just the single final name as a str (True) or the full path as a list[str] (False). + default : str or None + Default value to be returned if the field is not found. + + Returns + ------- + survey_field : str or list[str] + Survey-specific name for the `field`, or `default` if the field is not found. + list[str] if this is a nested field and `name_only` is False, else str with the + final field name only. 
+ """ + survey_field = self.schema.map.get(field) # str, list[str], or None - if not isinstance(survey_key, list): - return + if survey_field is None: + return default - if name_only: - return survey_key[-1] + if name_only and isinstance(survey_field, list): + return survey_field[-1] - return survey_key + return survey_field From 14dd75784699978866b9133009cf54417d66cd4e Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:00:42 -0700 Subject: [PATCH 30/55] add method add_id_attributes --- pittgoogle/alert.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d9e5334..3085cec 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -262,6 +262,22 @@ def schema(self) -> types_.Schema: return self._schema # ---- methods ---- # + def add_id_attributes(self) -> None: + """Add the IDs to the attributes.""" + ids = ["alertid", "objectid", "sourceid"] + values = [self.get(id) for id in ids] + + # get the survey-specific field names + survey_names = [self.get_key(id) for id in ids] + # if the field is nested, the key will be a list + # but pubsub message attributes must be strings. join to avoid a future error on publish + names = [".".join(id) if isinstance(id, list) else id for id in survey_names] + + # only add to attributes if the survey has defined this field + for idname, idvalue in zip(names, values): + if idname is not None: + self.attributes[idname] = idvalue + def get(self, field: str, default: Any = None) -> Any: """Return the value of `field` in this alert. From 73a3e74b6e3fd6b0e9b433d912ca05e36f8855f8 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:02:52 -0700 Subject: [PATCH 31/55] update Alert.from_* methods --- pittgoogle/alert.py | 56 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 3085cec..fce3a3e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -114,15 +114,41 @@ class Alert: _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: Optional[str] = field(default=None) _schema: Optional[types_.Schema] = field(default=None, init=False) + path: Optional[Path] = field(default=None) # ---- class methods ---- # @classmethod - def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this - """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" - return cls(msg=msg, schema_name=schema_name) + def from_cloud_run(cls, envelope: dict, schema_name: Optional[str] = None) -> "Alert": + """Create an `Alert` from an HTTP request envelope containing a Pub/Sub message, as received by a Cloud Run module. - @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": + Example code for a Cloud Run module that uses this method to open a ZTF alert: + + .. 
code-block:: python
+
+            import pittgoogle
+            # flask is used to work with HTTP requests, which trigger Cloud Run modules
+            # the request contains the Pub/Sub message, which contains the alert packet
+            import flask
+
+            app = flask.Flask(__name__)
+
+            # function that receives the request
+            @app.route("/", methods=["POST"])
+            def index():
+
+                try:
+                    # unpack the alert
+                    # if the request does not contain a valid message, this raises a `BadRequest`
+                    alert = pittgoogle.Alert.from_cloud_run(envelope=flask.request.get_json(), schema_name="ztf")
+
+                except pittgoogle.exceptions.BadRequest as exc:
+                    # return the error text and an HTTP 400 Bad Request code
+                    return str(exc), 400
+
+                # continue processing the alert
+                # when finished, return an empty string and an HTTP success code
+                return "", 204
+        """
         # check whether received message is valid, as suggested by Cloud Run docs
         if not envelope:
             raise BadRequest("Bad Request: no Pub/Sub message received")
@@ -142,7 +168,25 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert":
         )
 
     @classmethod
-    def from_path(cls, path, schema_name=str()) -> "Alert":
+    def from_dict(
+        cls,
+        payload: dict,
+        attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = None,
+        schema_name: Optional[str] = None,
+    ) -> "Alert":  # [TODO] update tom_desc to use this
+        """Create an `Alert` from a dictionary (`payload`)."""
+        return cls(dict=payload, attributes=attributes, schema_name=schema_name)
+
+    @classmethod
+    def from_msg(
+        cls, msg: "google.cloud.pubsub_v1.types.PubsubMessage", schema_name: Optional[str] = None
+    ) -> "Alert":  # [TODO] update tom_desc to use this
+        """Create an `Alert` from a `google.cloud.pubsub_v1.types.PubsubMessage`."""
+        return cls(msg=msg, schema_name=schema_name)
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) -> "Alert":
+        """Create an `Alert` from the file at `path`."""
         with open(path, "rb") as f:
             bytes_ = f.read()
         return cls(

From 6b244cfbb03fe2398e7068ec0135d3a5f8231957 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:04:59 -0700
Subject: [PATCH 32/55] make Alert.attributes dict only

---
 pittgoogle/alert.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py
index fce3a3e..0c7b93f 100644
--- a/pittgoogle/alert.py
+++ b/pittgoogle/alert.py
@@ -195,15 +195,17 @@ def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) ->
 
     # ---- properties ---- #
     @property
-    def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]:
+    def attributes(self) -> dict:
         """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes".
 
-        If None, this will be set to `self.msg.attributes`.
-        Update as desired.
-        When publishing, this will be sent as the message attributes.
+        If this was not set when the `Alert` was instantiated, a new dictionary will be created using
+        the `attributes` field in :attr:`pittgoogle.Alert.msg` the first time it is requested.
+        Update this dictionary as desired (it will not affect the original `msg`).
+        When publishing the alert using :attr:`pittgoogle.Topic.publish`, this dictionary will be
+        sent as the Pub/Sub message attributes.
""" if self._attributes is None: - self._attributes = self.msg.attributes + self._attributes = dict(self.msg.attributes) return self._attributes @property From 25f1136e56399ea3030d9a449fcdcef5ed0c4323 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:06:00 -0700 Subject: [PATCH 33/55] clean up Alert.dataframe --- pittgoogle/alert.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 0c7b93f..3065e98 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -259,17 +259,14 @@ def dict(self) -> dict: @property def dataframe(self) -> "pd.DataFrame": - if self._dataframe is None: - import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + if self._dataframe is not None: + return self._dataframe - if self.schema_name.endswith(".lite"): - src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prv_sources"]) - else: - src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prv_sources"]]) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run + src_df = pd.DataFrame(self.get("source"), index=[0]) + prvs_df = pd.DataFrame(self.get("prv_sources")) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) return self._dataframe @property From f7100d392b54ff0efadff1e87bca1623561d053d Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:10:18 -0700 Subject: [PATCH 34/55] use the new Schema class --- pittgoogle/pubsub.py | 47 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 3e21293..61b697e 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -242,19 +242,48 @@ def delete(self) -> None: else: LOGGER.info(f"deleted topic: {self.path}") - def publish(self, alert: "Alert", format="json") -> int: - """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`. + def publish(self, alert: "Alert") -> int: + """Publish a message with `alert.dict` as the payload and `alert.attributes` as the attributes. - `format` can be "json" or a schema name. + If the `alert` has an elasticc schema, the payload will be serialized as schemaless Avro. + Otherwise, json will be used. """ - if format == "json": - message = json.dumps(alert.dict).encode("utf-8") + # we need to decide which format to use: json, avro with schema, or avro without schema + # the format that pitt-google currently (2023-09-23) uses to publish messages depends on the stream: + # - consumer modules pass on the original alert data packet, as produced by the survey. + # they do not need to use this method (in fact, the consumers do not even use python), + # so we can ignore this case. + # - all other broker pipeline modules (Pitt-Google-Broker repo) use json. + # - modules in the pittgoogle-user repo publish classifications for elasticc, and thus + # use schemaless avro. + # at some point, we should re-evaluate the broker pipeline in particular. + # + # for now, we will get close enough to the current behavior if we assume that: + # - elasticc messages should be published as schemaless avro + # - else, we should publish a json message + # this will match the current behavior in all cases except the elasticc broker pipeline modules. 
+        # neither broker pipeline uses pittgoogle-client at this time (they use pgb-broker-utils),
+        # so we don't need to update or accommodate them yet.
+        #
+        # we'll get the survey name from self.schema.survey, but first we should check whether the
+        # schema exists so we can be lenient and just fall back to json instead of raising an error.
+        try:
+            alert.schema
+        except SchemaNotFoundError:
+            avro_schema = None
+        else:
+            if alert.schema.survey in ["elasticc"]:
+                avro_schema = alert.schema.avsc
+            else:
+                avro_schema = None
 
-        if format == "json":
-            message = json.dumps(alert.dict).encode("utf-8")
+        if not avro_schema:
+            # serialize using json
+            message = json.dumps(alert.dict).encode("utf-8")
+        else:
+            # serialize as schemaless avro
-        elif format.startswith("elasticc"):
-            # load the avro schema and use it to serialize alert.dict
-            schema = fastavro.schema.load_schema(PACKAGE_DIR / f"schemas/elasticc/{format}.avsc")
             fout = io.BytesIO()
-            fastavro.schemaless_writer(fout, schema, alert.dict)
+            fastavro.schemaless_writer(fout, avro_schema, alert.dict)
             fout.seek(0)
             message = fout.getvalue()

From 0258696fdfe4a32c2aa935d03620bf0aade49fc1 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:12:07 -0700
Subject: [PATCH 35/55] add minor conveniences

---
 pittgoogle/pubsub.py | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 61b697e..a08e99d 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -177,9 +177,38 @@ class Topic:
     )
 
     @classmethod
-    def from_cloud(cls, name, *, projectid, testid=False):
-        """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`)."""
-        # if testid is not False, "False", or None, append the testid to the name
+    def from_cloud(
+        cls,
+        name: str,
+        *,
+        projectid: str,
+        survey: Optional[str] = None,
+        testid: Optional[str] = None,
+    ):
+        """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`).
+
+        Parameters
+        ----------
+        name : `str`
+            Name of the topic. If `survey` and/or `testid` are provided, they will be added to this
+            name following the Pitt-Google naming syntax.
+        projectid : `str`
+            Project ID of the Google Cloud project that owns this resource. Project IDs used by
+            Pitt-Google are listed in the registry for convenience (:class:`pittgoogle.registry.ProjectIds`).
+            Required because it cannot be retrieved from the `client` and there is no explicit `auth`.
+        survey : `str`, optional
+            Name of the survey. If provided, it will be prepended to `name` following the
+            Pitt-Google naming syntax.
+        testid : `str`, optional
+            Pipeline identifier. If this is not `None`, `False`, or `"False"` it will be appended to
+            the `name` following the Pitt-Google naming syntax. This is used to allow pipeline modules
+            to find the correct resources without interfering with other pipelines that may have
+            deployed resources with the same base names (e.g., for development and testing purposes).
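+
+        For example, a minimal sketch (the project ID and resource names here are hypothetical):
+
+        .. code-block:: python
+
+            # following the Pitt-Google naming syntax, this resolves to the
+            # full topic name "ztf-alerts-mytest"
+            topic = pittgoogle.Topic.from_cloud(
+                "alerts", projectid="my-project-id", survey="ztf", testid="mytest"
+            )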
+ """ + # if survey and/or testid passed in, use them to construct full name using the pitt-google naming syntax + if survey is not None: + name = f"{survey}-{name}" + # must accommodate False and "False" for consistency with the broker pipeline if testid and testid != "False": name = f"{name}-{testid}" return cls(name, projectid=projectid, client=pubsub_v1.PublisherClient()) @@ -287,8 +316,8 @@ def publish(self, alert: "Alert") -> int: fout.seek(0) message = fout.getvalue() - # attribute keys and values must be strings - attributes = {str(key): str(val) for key, val in alert.attributes.items()} + # attribute keys and values must be strings. let's sort the keys while we're at it + attributes = {str(key): str(alert.attributes[key]) for key in sorted(alert.attributes)} future = self.client.publish(self.path, data=message, **attributes) return future.result() From 249d96184aa0a251173e6b05b9983829fb2801e6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:12:30 -0700 Subject: [PATCH 36/55] clean up imports, docs, and comments --- pittgoogle/alert.py | 56 ++++++++++++++------------------------------ pittgoogle/pubsub.py | 14 +++++++---- 2 files changed, 27 insertions(+), 43 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 3065e98..4c94a6d 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -17,34 +17,6 @@ path = "path/to/ztf_alert.avro" # point this to a file containing an alert alert = pittgoogle.Alert.from_path(path, schema_name="ztf") -Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: - -.. code-block:: python - - import pittgoogle - # flask is used to work with HTTP requests, which trigger Cloud Run modules - # the request contains the Pub/Sub message, which contains the alert packet - from flask import Flask, request - - app = Flask(__name__) - - # function that receives the request - @app.route("/", methods=["POST"]) - def index(): - - try: - # unpack the alert - # if the request does not contain a valid message, this raises a `BadRequest` - alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") - - except pg.exceptions.BadRequest as err: - # return the error text and an HTTP 400 Bad Request code - return err.text, 400 - - # continue processing the alert - # when finished, return an empty string and an HTTP success code - return "", 204 - API ---- @@ -52,19 +24,19 @@ def index(): import importlib.resources import io import logging -from typing import TYPE_CHECKING, Optional, Union +from pathlib import Path +from typing import Any, TYPE_CHECKING, Optional, Union import fastavro -import yaml from attrs import define, field -from .exceptions import BadRequest, OpenAlertError +from . import registry, types_ +from .exceptions import BadRequest, OpenAlertError, SchemaNotFoundError from .utils import Cast if TYPE_CHECKING: import google._upb._message import google.cloud.pubsub_v1 - import google.protobuf.timestamp_pb2 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run @@ -76,8 +48,7 @@ def index(): class Alert: """Pitt-Google container for an astronomical alert. - Alerts are typically loaded from a Pub/Sub message but may also be loaded from a file. - It is recommended to instantiate an `Alert` using one of the `from_*` methods. + Recommended to instantiate using one of the `from_*` methods. All parameters are keyword only. @@ -210,7 +181,7 @@ def attributes(self) -> dict: @property def dict(self) -> dict: - """Message payload as a dictionary. 
Created from `self.msg.data` and `self.schema_name`, if needed.
+        """Alert data as a dictionary. Created from `self.msg.data`, if needed.
 
         Raises
         ------
@@ -271,17 +242,26 @@ def dataframe(self) -> "pd.DataFrame":
 
     @property
     def alertid(self) -> Union[str, int]:
-        """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`."""
+        """Convenience property to get the alert ID.
+
+        If the survey does not define an alert ID, this returns the `sourceid`.
+        """
         return self.get("alertid", self.sourceid)
 
     @property
     def objectid(self) -> Union[str, int]:
-        """Convenience property for the object ID. The "object" represents a collection of sources, as determined by the survey."""
+        """Convenience property to get the object ID.
+
+        The "object" represents a collection of sources, as determined by the survey.
+        """
         return self.get("objectid")
 
     @property
     def sourceid(self) -> Union[str, int]:
-        """Convenience property for the source ID. The "source" is the detection that triggered the alert."""
+        """Convenience property to get the source ID.
+
+        The "source" is the detection that triggered the alert.
+        """
         return self.get("sourceid")
 
     @property
diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index a08e99d..5ec7c1a 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -23,11 +23,14 @@
 
 .. code-block:: python
 
-    subscription = pittgoogle.pubsub.Subscription(
-        "my-ztf-loop-subscription",
-        # topic only required if the subscription does not yet exist in Google Cloud
-        topic=pittgoogle.pubsub.Topic("ztf-loop", pittgoogle.utils.ProjectIds.pittgoogle)
-    )
+    # topic the subscription will be connected to
+    # only required if the subscription does not yet exist in Google Cloud
+    topic = pittgoogle.Topic(name="ztf-loop", projectid=pittgoogle.ProjectIds.pittgoogle)
+
+    # choose your own name for the subscription
+    subscription = pittgoogle.Subscription(name="my-ztf-loop-subscription", topic=topic, schema_name="ztf")
+
+    # make sure the subscription exists and we can connect to it. create it if necessary
     subscription.touch()
 
 Pull a small batch of alerts. Helpful for testing. Not recommended for long-running listeners.
 
@@ -92,6 +95,7 @@ def my_batch_callback(results):
 from attrs.validators import gt, instance_of, is_callable, optional
 from google.api_core.exceptions import NotFound
 
-from . import Alert, Auth
+from .alert import Alert
+from .auth import Auth
 from .exceptions import SchemaNotFoundError
 
 LOGGER = logging.getLogger(__name__)
 PACKAGE_DIR = importlib.resources.files(__package__)

From 4f90ffa4d55b5d2464a6861ac92e32f7a7107b1c Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:13:42 -0700
Subject: [PATCH 37/55] remove bigquery module

---
 pittgoogle/bigquery.py | 692 -----------------------------------------
 1 file changed, 692 deletions(-)
 delete mode 100644 pittgoogle/bigquery.py

diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py
deleted file mode 100644
index 9da91b7..0000000
--- a/pittgoogle/bigquery.py
+++ /dev/null
@@ -1,692 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""The ``bigquery`` module facilitates querying Pitt-Google Broker's
-BigQuery databases and reading the results.
-See the tutorial for usage help.
-"""
-from typing import Generator, List, Optional, Tuple, Union
-
-import astropy
-import pandas as pd
-from astropy import coordinates as coord
-from google.cloud import bigquery
-from tabulate import tabulate
-
-from .utils import ProjectIds
-
-
-pgb_project_id = ProjectIds.pittgoogle
-
-# --- BigQuery Client
-user_bq_client, user_project_id = None, None  # module's global Client, related id
-
-
-def create_client(project_id: str):
-    """Open a BigQuery Client.
- - Args: - project_id: User's Google Cloud Platform project ID - """ - - global user_bq_client - global user_project_id - - # instantiate the client - print(f"\nInstantiating a BigQuery client with project_id: {project_id}\n") - user_bq_client = bigquery.Client(project=project_id) - - # if the user passed a bad project_id, we won't know it yet. Let's check - _create_client_raise_exception_if_not_connected(project_id) - - # client is connected. set the global user_project_id - user_project_id = project_id - - -def _create_client_raise_exception_if_not_connected(project_id: str): - """Checks that the user's client can successfully connect to our tables - by executing a dry run query. - """ - - global user_bq_client - - query = f"SELECT candid FROM `{pgb_project_id}.ztf_alerts.salt2`" - try: - dry_run(query, notify=False) - except: - user_bq_client = None # reset so the user can try again - msg = ( - "You have tried to create a BigQuery Client with the project_id:\n" - f"\t{project_id}\n" - "But the Client cannot connect to the Pitt-Google Broker.\n" - "Check that your project_id is valid " - "(e.g., it should not be wrapped in quotes)." - ) - raise ValueError(msg) - - -def _check_client_isinstance(): - msg = ( - "You must create a BigQuery client first. " - "Run `pittgoogle.bigquery.create_client('your_project_id')`" - ) - assert isinstance(user_bq_client, bigquery.client.Client), msg - - -def _create_client_if_needed(): - stop = False # will be set to True if the user chooses to exit - - try: - _check_client_isinstance() - - except AssertionError: - # help the user open a bigquery client - msg = ( - "\nTo run queries, you must first open a BigQuery Client.\n" - "Enter your Google Cloud Platform project ID now " - "or exit (just press Enter) and run\n" - "`pittgoogle.bigquery.create_client(my_project_id)`\n" - "\nProject ID: " - ) - project_id = input(msg) or "" - - if project_id == "": - stop = True # user wants to exit rather than creating a client - else: - create_client(project_id) - - return stop - - -# --- Get information about PGB datasets and tables -def get_table_info(table: Union[str, list] = "all", dataset: str = "ztf_alerts"): - """Retrieves and prints BigQuery table schemas. - - Args: - - table: Name of the BigQuery table or list of the same. - 'all' will print the info for all tables in the dataset. - - dataset: Name of BigQuery dataset that the table(s) belong to. - """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - # get the table names in a list - if table == "all": - tables = get_dataset_table_names(dataset=dataset) - elif isinstance(table, str): - tables = [table] - else: - tables = table - - # get and print info about each table - for t in tables: - df = get_table_schema(table=t, dataset=dataset) - - # print the metadata and column info - print(df.table_name) - print(tabulate(df, headers="keys", tablefmt="grid")) # psql - print(f"\n{df.table_name} has {df.num_rows} rows.\n") - - -def get_table_schema(table: str, dataset: str = "ztf_alerts") -> pd.DataFrame: - """Retrieves information about the columns in a BigQuery table and returns - it as a DataFrame. - - Args: - table: Name of the BigQuery table - dataset: Name of BigQuery dataset that the table(s) belong to. - Returns - Column information from the BigQuery table schema. 
- """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - bqtable = user_bq_client.get_table(f"{pgb_project_id}.{dataset}.{table}") - cols = [] - for field in bqtable.schema: - cols.append((field.name, field.description, field.field_type)) - - if field.field_type == "RECORD": - for subfield in field.fields: - cols.append( - ( - f"{field.name}.{subfield.name}", - subfield.description, - subfield.field_type, - ) - ) - - # cols = [(s.name, s.description, s.field_type, s.mode) for s in bqtable.schema] - colnames = ["column_name", "description", "type"] - df = pd.DataFrame(cols, columns=colnames) - - # add some metadata - df.table_name = f"{bqtable.project}.{bqtable.dataset_id}.{bqtable.table_id}" - df.num_rows = bqtable.num_rows - - return df - - -def get_dataset_table_names(dataset: str = "ztf_alerts") -> List[str]: - """ - Args: - dataset: Name of the BigQuery dataset. - - Returns: - List of table names in the dataset. - """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - print(f"Getting table names for dataset: {dataset}") - - query = "SELECT * " f"FROM {pgb_project_id}.{dataset}.INFORMATION_SCHEMA.TABLES" - query_job = user_bq_client.query(query) - tables = [row["table_name"] for row in query_job] - tables.sort(key=str.lower) - return tables - - -# --- Setup to query for object histories -def get_history_column_names() -> List[str]: - """ - It would be convenient to also return the column descriptions, but - that is more complicated, and this function will be completely - obsolete if we change the database structure to store only the - "candidate" observation and metadata. - - Returns: - Column names appropriate for querying object histories. - """ - - dropcols = ["prv_candidates", "cutoutScience", "cutoutDifference", "cutoutTemplate"] - - sdf = get_table_schema("alerts") - schemacols = list(sdf["column_name"]) - - # drop the prv_candidates and cutout columns - historycols = [c for c in schemacols if c.split(".")[0] not in dropcols] - - # drop the full "candidate" RECORD column - historycols.remove("candidate") - - # drop "candidate.candid" as it is simply a repeat of "candid" - historycols.remove("candidate.candid") - - # strip out "candidate." from nested columns - # query_objects() uses only the base names - historycols = [c.replace("candidate.", "") for c in historycols] - - return historycols - - -def check_history_column_names(columns: List[str]) -> Union[List[str], bool]: - """Make sure user-submitted column names are appropriate to query object histories.""" - - -def _split_good_bad_history_column_names( - columns: List[str], -) -> Tuple[List[str], List[str]]: - """Split columns list into "good" and "bad" according to whether they are - suitable for querying an object's history. - """ - - badcols = list(set(columns) - set(get_history_column_names())) - goodcols = columns.copy() - for bc in badcols: - goodcols.remove(bc) - return (goodcols, badcols) - - -def object_history_sql_statement( - columns: List[str], objectIds: Optional[list] = None, limit: Optional[int] = None -) -> str: - """Convince function that generates the SQL string needed to - query the alerts table and aggregate data by objectId. 
- When the resulting SQL query is executed, the query job will contain - one row for each objectId, with the object's data aggregated into - arrays (one array per column in columns) ordered by the observation date. - - Note: Arrays may contain duplicated observations; it is the user's - responsibility to clean them. - - Args: - columns: Names of columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. - limit: Maximum number of rows to be returned. - - Returns: - SQL statement to query the alerts table and aggregate data by objectId. - """ - - dataset = "ztf_alerts" - table = "alerts" - objectcols = [ - "objectId", - ] # columns unique to an object - # make sure 'candid' is in columns. (objectcols handled separately) - columns = list(set(columns).union(set(["candid"]))) - - # SELECT statement - # create a list of strings that will aggregate columns into arrays - aggcols = _list_aggcols_sql_statements(columns) - selects = f'SELECT {", ".join(objectcols + aggcols)}' - - # FROM statement - froms = f"FROM `{pgb_project_id}.{dataset}.{table}`" - # concat the statements into the beginning of a SQL query statement - sqlquery = " ".join([selects, froms]) - - # WHERE statement - if objectIds is not None: - # wrap each objectId in quotes and join to single string - oids = ",".join([f'"{o}"' for o in objectIds]) - wheres = f"WHERE objectId IN ({oids})" - # concat the statements into a SQL query statement - sqlquery = " ".join([sqlquery, wheres]) - - # GROUP BY statement - groupbys = "GROUP BY objectId" - sqlquery = " ".join([sqlquery, groupbys]) - - # LIMIT statement - if limit is not None: - limits = f"LIMIT {limit}" - sqlquery = " ".join([sqlquery, limits]) - - return sqlquery - - -def _list_aggcols_sql_statements(columns: List[str]) -> List[str]: - """Create a list of SQL string query segments that will aggregate - all columns not in objectcols. - """ - - objectcols = [ - "objectId", - ] - flatcols = [ - "schemavsn", - "publisher", - "candid", - ] - - # list of requested flatcols - fcols = list(set(columns) & set(flatcols)) - # list of requested columns nested under 'candidate' - ncols = list(set(columns) - set(objectcols) - set(flatcols)) - ncols = [f"candidate.{c}" for c in ncols] - # complete list of columns to be aggregated (group by) objectId - aggcols = fcols + ncols - # attach the ARRAY_AGG, ORDER By, and AS statements to the aggcols - aggcols = [f'ARRAY_AGG({c} ORDER BY candidate.jd) AS {c.split(".")[-1]}' for c in aggcols] - - return aggcols - - -# --- Dry runs -def dry_run(query: str, notify: bool = True): - """Perform a dry run to find out how many bytes the query will process. - Args: - query: SQL query statement - """ - - global user_project_id - _check_client_isinstance() # make sure we have a bigquery.client - - job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False) - query_job = user_bq_client.query(query, job_config=job_config) - - if notify: - nbytes, TiB = query_job.total_bytes_processed, 2**40 - pTiB = nbytes / TiB * 100 # nbytes as a percent of 1 TiB - print("\nQuery statement:") - print(f'\n"{query}"\n') - print(f"will process {nbytes} bytes of data.") - print(f"({pTiB:.3}% of your 1 TiB Free Tier monthly allotment.)") - - -def _dry_run_and_confirm(query: str) -> bool: - # print dry run info - dry_run(query) - # ask user if they want to proceed - cont = input("Continue? 
[y/N]: ") or "N" - do_the_query = cont in ["y", "Y"] - return do_the_query - - -# --- Query for object histories -def query_objects( - columns: List[str], - objectIds: Optional[list] = None, - limit: Optional[int] = None, - format: str = "pandas", - iterator: bool = False, - dry_run: bool = True, -) -> Union[ - str, - pd.DataFrame, - bigquery.job.QueryJob, - Generator[Union[str, pd.DataFrame], None, None], -]: - """Query the alerts database for object histories. - - Args: - columns: Names of columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. - limit: Limit the number of objects returned to N <= limit. - format: One of 'pandas', 'json', or 'query_job'. Query results will be - returned in this format. Results returned as 'query_job' may - contain duplicate observations; else duplicates are dropped. - iterator: If True, iterate over the objects and return one at a time. - Else return the full query results together. - This parameter is ignored if `format` == 'query_job'. - dry_run: If True, `pittgoogle.bigquery.dry_run` will be called first and the - user will be asked to confirm before continuing. - - Returns: - Query results in the requested format. - """ - - # make sure we have appropriate column names - goodcols = _query_objects_check_history_column_names(columns) - if len(goodcols) == 0: # user submitted bad columns and wants to abort - return - - # if a bigquery client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - # generate the SQL statement to query alerts db and aggregate histories - query = object_history_sql_statement(goodcols, objectIds, limit=limit) # str - - # print dry run results - if dry_run: - do_the_query = _dry_run_and_confirm(query) - if not do_the_query: # user has chosen to abort the query - return - - # make the API call - query_job = user_bq_client.query(query) - - # return the results - if format == "query_job": - return query_job - elif iterator: # return a generator that cycles through the objects/rows - return (format_history_query_results(row=row, format=format) for row in query_job) - else: # format and return all rows at once - return format_history_query_results(query_job=query_job, format=format) - - -def _query_objects_check_history_column_names(columns: List[str]) -> List[str]: - """Make sure user-submitted column names are appropriate for `query_objects()`. - - Returns one of: - Columns stripped of bad column names. - Empty list if there were bad columns and the user wants to abort the query. - """ - - goodcols, badcols = _split_good_bad_history_column_names(columns) - - try: - assert len(badcols) == 0 - except AssertionError: - msg = ( - "\nYou have requested columns that are not available to `query_objects()`.\n" - "(To view available columns, use `pittgoogle.bigquery.get_history_column_names()`)\n" - f"\nRequested columns:\n\t{columns}\n" - f"Unavailable columns:\n\t{badcols}\n" - "\nProceed without the unavailable columns? 
[y/N] " - ) - proceed = input(msg) or "N" - - if proceed not in ["y", "Y"]: # user wants to exit; return an empty list - return [] - - return goodcols - - -# --- Format query results -def format_history_query_results( - query_job: Optional[bigquery.job.QueryJob] = None, - row: Optional[bigquery.table.Row] = None, - format: str = "pandas", -) -> Union[pd.DataFrame, str]: - """Converts the results of a BigQuery query to the desired format. - Must pass either query_job or row. - Any duplicate observations will be dropped. - - Args: - query_job: Results from a object history query job. SQL statement needed - to create the job can be obtained with object_history_sql_statement(). - Must supply either query_job or row. - - row: A single row from query_job. Must supply either row or query_job. - - format: One of 'pandas' or 'json'. Input query results will be returned - in this format. - - Returns: - histories: Input query results converted to requested format - """ - - # make sure we have an appropriate param combination - do_job, do_row = query_job is not None, row is not None - good_format = format in ["pandas", "json"] - good_combo = (do_job != do_row) and good_format - if not good_combo: - raise ValueError("Must pass one of query_job or row.") - - # convert query_job - if do_job: - histories = _format_history_query_results_to_df(query_job) # df - if format == "json": - histories = histories.reset_index().to_json() # str - - # convert row - if do_row: - histories = _format_history_row_to_df(row) # df - if format == "json": - histories["objectId"] = histories.objectId # persist metadata - histories = histories.reset_index().to_json() # str - - return histories - - -def _format_history_query_results_to_df(query_job: bigquery.job.QueryJob): - """Convert a query_job (containing multiple rows of object history data) - to a DataFrame. - Any duplicate observations will be dropped. - """ - - dflist = [] - for row in query_job: - # convert to DataFrame - df = _format_history_row_to_df(row) - # add the objectId so we can use it to multi-index - df["objectId"] = df.objectId - # set the multi-index and append to the list - dflist.append(df.reset_index().set_index(["objectId", "candid"])) - - histories = pd.concat(dflist) - - return histories - - -def _format_history_row_to_df(row: Union[dict, bigquery.table.Row]): - """Convert a single object's history from a query row to a DataFrame. - Any duplicate observations will be dropped. - """ - - d = dict(row.items()) - oid, cid = d.pop("objectId"), d.pop("candid") - df = pd.DataFrame(data=d, index=pd.Index(cid, name="candid")) - df.drop_duplicates(inplace=True) - df.objectId = oid - return df - - -# --- Cone Search -def cone_search( - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - columns: List[str], - objectIds: Optional[list] = None, - format: str = "pandas", - iterator: bool = False, - dry_run: bool = True, -) -> Union[str, pd.DataFrame, Generator[Union[str, pd.DataFrame], None, None]]: - """Perform a cone search on the alerts database and return object histories. - This uses the coordinates of the most recent observation to determine - whether an object is within the cone. - - Args: - center: Center of the cone to search within. - radius: Radius of the cone to search within. - columns: Names of history columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. 
- format: One of 'pandas', or 'json'. Query results will be - returned in this format. Duplicate observations are dropped. - iterator: If True, iterate over the objects and return one at a time. - Else return the full query results together. - dry_run: If True, `pittgoogle.bigquery.dry_run` will be called first and the - user will be asked to confirm before continuing. - - Returns: - Query results in the requested format. - """ - - # make sure we have required columns - for c in ["jd", "ra", "dec"]: - if c not in columns: - columns.append(c) - - # Performing a dry run prints the SQL query statement, which does not account - # for the cone search. We'll print some things to reduce user confusion. - if dry_run: - print("\nInitiating a cone search.") - - # Query the database for object histories. - objects = query_objects( - columns, - objectIds=objectIds, - format="pandas", - iterator=iterator, - dry_run=dry_run, - ) - # == None if user chose to abort; else DataFrame or generator of same - if objects is None: - return - - if dry_run: - print("\nFiltering for objects within the given cone.") - - # filter out objects not in the cone and return the rest - objects_in_cone = _do_cone_search(objects, center, radius, format, iterator) - return objects_in_cone - - -def _do_cone_search( - objects: Union[pd.DataFrame, Generator[pd.DataFrame, None, None]], - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format: str = "pandas", - iterator: bool = False, -) -> Union[str, pd.DataFrame, Generator[Union[str, pd.DataFrame], None, None]]: - """Apply the cone search filter and return appropriate objects.""" - - if iterator: # objects is a generator, return a generator - return _do_cone_search_iterator(objects, center, radius, format) - - else: # objects is single df - return _do_cone_search_all(objects, center, radius, format) - - -def _do_cone_search_iterator( - objects: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format, -): - """Iterate objects, format and yield those that are in the cone. - - Args: - objects: DataFrame containing histories of multiple objectIds. - """ - - for df in objects: - in_cone = object_is_in_cone(df, center, radius) - - if in_cone: # format and yield - if format == "json": - df["objectId"] = df.objectId # else metadata is lost - object = df.reset_index().to_json() # str - else: - object = df - yield object - - -def _do_cone_search_all( - objects: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format, -): - """Filter out objects not in the cone, format, and return. - - Args: - objects: DataFrame containing histories of multiple objectIds. - """ - - gb = objects.groupby(level="objectId") - objects_in_cone = gb.filter(lambda df: object_is_in_cone(df, center, radius)) - if format == "json": - objects_in_cone = objects_in_cone.reset_index().to_json() # str - return objects_in_cone - - -def object_is_in_cone( - object: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, -): - """Checks whether the object's most recent observation has a position that - is within a cone defined by center and radius. - - Args: - object: DataFrame containing the history of a single objectId. - Required columns: ['jd','ra','dec'] - center: Center of the cone to search within. - radius: Radius of the cone to search within. 
- - Returns: - True if object is within radius of center, else False - """ - - # get the SkyCoords of the most recent observation - # to do: use the epoch with highest S/N instead - obs = object.loc[object["jd"] == object["jd"].max(), :] - obs_coords = coord.SkyCoord(obs["ra"], obs["dec"], frame="icrs", unit="deg") - - # check whether obs_coords are within the cone - dist = center.separation(obs_coords) - in_cone = dist < radius # array with a single bool - in_cone = in_cone[0] - - return in_cone From ebe79e928e55700ecf6b1be41dc9ba64eb11f7d3 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:18:50 -0700 Subject: [PATCH 38/55] add bigquery module with Table class --- pittgoogle/bigquery.py | 164 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 pittgoogle/bigquery.py diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py new file mode 100644 index 0000000..68af542 --- /dev/null +++ b/pittgoogle/bigquery.py @@ -0,0 +1,164 @@ +# -*- coding: UTF-8 -*- +"""Classes to facilitate connections to BigQuery datasets and tables. + +.. contents:: + :local: + :depth: 2 + +.. note:: + + This module relies on :mod:`pittgoogle.auth` to authenticate API calls. + The examples given below assume the use of a :ref:`service account ` and + :ref:`environment variables `. In this case, :mod:`pittgoogle.auth` does not + need to be called explicitly. + +Usage Examples +--------------- + +.. code-block:: python + + import pittgoogle + + [TODO] + +API +---- + +""" +import logging +from typing import TYPE_CHECKING, Optional, Union + +import google.cloud.bigquery as bigquery +from attrs import define, field +from attrs.validators import instance_of, optional + +from .auth import Auth + +if TYPE_CHECKING: + from . import Alert + + +LOGGER = logging.getLogger(__name__) + + +@define +class Table: + """Methods and properties for a BigQuery table. + + Parameters + ------------ + name : `str` + Name of the BigQuery table. + dataset : `str` + Name of the BigQuery dataset this table belongs to. + + projectid : `str`, optional + The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this + if you are connecting to a subscription owned by a different project than this topic. Note: + :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs. + auth : :class:`pittgoogle.auth.Auth`, optional + Credentials for the Google Cloud project that owns this topic. If not provided, + it will be created from environment variables when needed. + client : `pubsub_v1.PublisherClient`, optional + Pub/Sub client that will be used to access the topic. If not provided, a new client will + be created (using `auth`) the first time it is requested. + """ + + name: str = field() + dataset: str = field() + projectid: str = field(default=None) + _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) + _client: Optional[bigquery.Client] = field( + default=None, validator=optional(instance_of(bigquery.Client)) + ) + _table: Optional[bigquery.Table] = field(default=None, init=False) + + @classmethod + def from_cloud( + cls, + name: str, + *, + dataset: Optional[str] = None, + survey: Optional[str] = None, + testid: Optional[str] = None, + ): + """Create a `Table` with a `client` using implicit credentials (no explicit `auth`). + + The `projectid` will be retrieved from the `client`. + + Parameters + ---------- + name : `str` + Name of the table. + dataset : `str`, optional + Name of the dataset containing the table. 
Either this or a `survey` is required. If a + `testid` is provided, it will be appended to this name following the Pitt-Google naming syntax. + survey : `str`, optional + Name of the survey. This will be used as the name of the dataset if the `dataset` kwarg + is not provided. This kwarg is provided for convenience in cases where the Pitt-Google + naming syntax is used to name resources. + testid : `str`, optional + Pipeline identifier. If this is not `None`, `False`, or `"False"` it will be appended to + the dataset name. This is used in cases where the Pitt-Google naming syntax is used to name + resources. This allows pipeline modules to find the correct resources without interfering + with other pipelines that may have deployed resources with the same base names + (e.g., for development and testing purposes). + """ + if dataset is None: + # [TODO] update the elasticc broker to name the dataset using the survey name only + dataset = survey + # if testid is not False, "False", or None, append it to the dataset + if testid and testid != "False": + dataset = f"{dataset}_{testid}" + client = bigquery.Client() + table = cls(name, dataset=dataset, projectid=client.project, client=client) + # make the get request now to create a connection to the table + _ = table.table + return table + + @property + def auth(self) -> Auth: + """Credentials for the Google Cloud project that owns this topic. + + This will be created from environment variables if `self._auth` is None. + """ + if self._auth is None: + self._auth = Auth() + + if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") + self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + + return self._auth + + @property + def id(self) -> str: + """Fully qualified table ID.""" + # make sure we have a projectid. if it needs to be set, call auth + if self.projectid is None: + self.auth + return f"{self.projectid}.{self.dataset}.{self.name}" + + @property + def table(self) -> bigquery.Table: + """Return a BigQuery Table object that's connected to the table. Makes a get request if necessary.""" + if self._table is None: + self._table = self.client.get_table(self.id) + return self._table + + @property + def client(self) -> bigquery.Client: + """BigQuery client for table access. + + Will be created using `self.auth.credentials` if necessary. + """ + if self._client is None: + self._client = bigquery.Client(credentials=self.auth.credentials) + return self._client + + def insert_rows(self, alerts: Union["Alert", list["Alert"]]) -> list[dict]: + rows = [alert.dict for alert in list(alerts)] + errors = self.client.insert_rows(self.table, rows) + if len(errors) > 0: + LOGGER.warning(f"BigQuery insert error: {errors}") + return errors From ca495ac46715dab210129a84dfe5bc54b2d05b42 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:24:44 -0700 Subject: [PATCH 39/55] update __init__ imports --- pittgoogle/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 0a4b9ab..94d388a 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -11,8 +11,10 @@ from .auth import Auth from .alert import Alert +from .bigquery import Table from .pubsub import Topic, Subscription, Consumer -from . import auth, alert, bigquery, exceptions, pubsub, utils +from .registry import ProjectIds, Schemas +from . 
import exceptions, types_, registry, utils, auth, alert, bigquery, pubsub __version__ = metadata.version("pittgoogle-client") From cf65479354b230af3323d862c77c3a75bc922d8c Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:32:04 -0700 Subject: [PATCH 40/55] remove obsolete method --- pittgoogle/utils.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index f185135..e605f0e 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -106,24 +106,6 @@ def b64avro_to_dict(bytes_data): return Cast.avro_to_dict(b64decode(bytes_data)) # --- Work with alert dictionaries - @staticmethod - def alert_dict_to_dataframe(alert_dict: dict) -> pd.DataFrame: - """Package a ZTF alert dictionary into a dataframe. - - Adapted from: - https://github.com/ZwickyTransientFacility/ztf-avro-alert/blob/master/notebooks/Filtering_alerts.ipynb - """ - dfc = pd.DataFrame(alert_dict["candidate"], index=[0]) - df_prv = pd.DataFrame(alert_dict["prv_candidates"]) - df = pd.concat([dfc, df_prv], ignore_index=True, sort=True) - df = df[dfc.columns] # return to original column ordering - - # we'll attach some metadata - # note this may not be preserved after all operations - # https://stackoverflow.com/questions/14688306/adding-meta-information-metadata-to-pandas-dataframe - df.objectId = alert_dict["objectId"] - return df - @staticmethod def alert_dict_to_table(alert_dict: dict) -> Table: """Package a ZTF alert dictionary into an Astopy Table.""" From f5997786c2be09be5063ffd715ff6920218add3f Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:33:31 -0700 Subject: [PATCH 41/55] clean up imports --- pittgoogle/types_.py | 7 ------- pittgoogle/utils.py | 1 - 2 files changed, 8 deletions(-) diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index 58f2ada..494267f 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -1,18 +1,11 @@ # -*- coding: UTF-8 -*- """Functions to support working with alerts and related data.""" import importlib.resources -import json import logging -from base64 import b64decode, b64encode -from collections import OrderedDict -from io import BytesIO from typing import TYPE_CHECKING, Optional import fastavro -import pandas as pd import yaml -from astropy.table import Table -from astropy.time import Time from attrs import define, field if TYPE_CHECKING: diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index e605f0e..cd18980 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -7,7 +7,6 @@ from io import BytesIO import fastavro -import pandas as pd from astropy.table import Table from astropy.time import Time from attrs import define From 681324eefae20fdd8835cabc8b5bec621ba35bb6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:44:12 -0700 Subject: [PATCH 42/55] replace ClassVar -> Final --- pittgoogle/registry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index ba2a62a..02ff63f 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -2,7 +2,7 @@ """Pitt-Google registries.""" import importlib.resources import logging -from typing import ClassVar +from typing import Final from attrs import define @@ -18,16 +18,16 @@ class ProjectIds: """Registry of Google Cloud Project IDs.""" - pittgoogle: ClassVar[str] = "ardent-cycling-243415" + pittgoogle: Final[str] = "ardent-cycling-243415" """Pitt-Google's production project.""" - pittgoogle_dev: ClassVar[str] = "avid-heading-329016" + 
pittgoogle_dev: Final[str] = "avid-heading-329016" """Pitt-Google's development project.""" - # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" + # pittgoogle_billing: Final[str] = "light-cycle-328823" # """Pitt-Google's billing project.""" - elasticc: ClassVar[str] = "elasticc-challenge" + elasticc: Final[str] = "elasticc-challenge" """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" @@ -43,7 +43,7 @@ class Schemas: # - if an avro schema file is being registered with the schema (using the `path` arg), it is # recommended that the file have the same name (path stem) as the schema. the file name # must end with ".avsc". - dict: ClassVar[dict] = { + dict: Final[dict] = { "elasticc.v0_9_1.alert": types_.Schema( name="elasticc.v0_9_1.alert", description="Avro schema of alerts published by ELAsTiCC.", From 54103e15266e821c0332616980220d7932ca71fa Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 04:02:52 -0700 Subject: [PATCH 43/55] update descriptions --- pittgoogle/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index 02ff63f..a322c7c 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -22,13 +22,13 @@ class ProjectIds: """Pitt-Google's production project.""" pittgoogle_dev: Final[str] = "avid-heading-329016" - """Pitt-Google's development project.""" + """Pitt-Google's testing and development project.""" # pittgoogle_billing: Final[str] = "light-cycle-328823" # """Pitt-Google's billing project.""" elasticc: Final[str] = "elasticc-challenge" - """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" + """Project running classifiers for ELAsTiCC alerts and reporting to DESC.""" @define(frozen=True) From a2ed5d72f748be17a5e459a5f6279142c50312ea Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 04:03:29 -0700 Subject: [PATCH 44/55] make the Schemas dict a class method --- pittgoogle/registry.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index a322c7c..c8f14d7 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -35,41 +35,42 @@ class ProjectIds: class Schemas: """Registry of schemas used by Pitt-Google.""" - # dict defining the schemas in the registry - # naming conventions: - # - schema names are expected to start with the name of the survey - # - if the survey has more than one schema, the survey name should be followed by a ".", - # followed by schema-specific specifier(s) - # - if an avro schema file is being registered with the schema (using the `path` arg), it is - # recommended that the file have the same name (path stem) as the schema. the file name - # must end with ".avsc". - dict: Final[dict] = { - "elasticc.v0_9_1.alert": types_.Schema( - name="elasticc.v0_9_1.alert", - description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", - ), - "elasticc.v0_9_1.brokerClassification": types_.Schema( - name="elasticc.v0_9_1.brokerClassification", - description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", - ), - "ztf": types_.Schema( - name="ztf", - description=( - "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " - "in the header. 
Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." # [TODO] + @classmethod + def manifest(cls) -> dict: + """Return the dictionary defining the schemas in the registry.""" + # naming conventions: + # - schema names are expected to start with the name of the survey + # - if the survey has more than one schema, the survey name should be followed by a ".", + # followed by schema-specific specifier(s) + # - if an avro schema file is being registered with the schema (using the `path` arg), it is + # recommended that the file have the same name (path stem) as the schema. the file name + # must end with ".avsc". + return { + "elasticc.v0_9_1.alert": types_.Schema( + name="elasticc.v0_9_1.alert", + description="Avro schema of alerts published by ELAsTiCC.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + ), + "elasticc.v0_9_1.brokerClassification": types_.Schema( + name="elasticc.v0_9_1.brokerClassification", + description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + ), + "ztf": types_.Schema( + name="ztf", + description=( + "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " + "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " + "both cases." # [TODO] + ), + path=None, ), - path=None, - ), - } - """Dict defining the schemas in the registry.""" + } @classmethod def names(cls) -> list[str]: """Return the names of all registered schemas.""" - return list(cls.dict.keys()) + return list(cls.manifest().keys()) @classmethod def get(cls, schema_name: str) -> types_.Schema: @@ -81,7 +82,7 @@ def get(cls, schema_name: str) -> types_.Schema: if a schema called `schema_name` is not found """ # if there is no registered schema with this name, raise an error - schema = cls.dict.get(schema_name) + schema = cls.manifest().get(schema_name) if schema is None: raise SchemaNotFoundError( f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." From cf307f4ccbd2e5cdadd1491fdd95701e03bc190e Mon Sep 17 00:00:00 2001 From: troyraen Date: Fri, 29 Sep 2023 02:31:23 -0700 Subject: [PATCH 45/55] bugfix insert_rows --- pittgoogle/bigquery.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index 68af542..e7bf769 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -26,16 +26,13 @@ """ import logging -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import google.cloud.bigquery as bigquery from attrs import define, field from attrs.validators import instance_of, optional -from .auth import Auth - -if TYPE_CHECKING: - from . import Alert +from . 
import Alert, Auth LOGGER = logging.getLogger(__name__) @@ -156,9 +153,10 @@ def client(self) -> bigquery.Client: self._client = bigquery.Client(credentials=self.auth.credentials) return self._client - def insert_rows(self, alerts: Union["Alert", list["Alert"]]) -> list[dict]: - rows = [alert.dict for alert in list(alerts)] - errors = self.client.insert_rows(self.table, rows) + def insert_rows(self, rows: Union[list[dict], list[Alert]]) -> list[dict]: + # if elements of rows are Alerts, need to extract the dicts + myrows = [row.dict if isinstance(row, Alert) else row for row in rows] + errors = self.client.insert_rows(self.table, myrows) if len(errors) > 0: LOGGER.warning(f"BigQuery insert error: {errors}") return errors From 2928b5b2baf7828e9c3bff66c161822fa348fa8b Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 00:28:25 -0700 Subject: [PATCH 46/55] publish_time -> datetime --- pittgoogle/alert.py | 11 ++++++++++- pittgoogle/types_.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 4c94a6d..d8c8560 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -24,6 +24,7 @@ import importlib.resources import io import logging +from datetime import datetime from pathlib import Path from typing import Any, TYPE_CHECKING, Optional, Union @@ -126,13 +127,21 @@ def index(): if not isinstance(envelope, dict) or "message" not in envelope: raise BadRequest("Bad Request: invalid Pub/Sub message format") + # convert the message publish_time string -> datetime + # occasionally the string doesn't include microseconds so we need a try/except + publish_time = envelope["message"]["publish_time"].replace("Z", "+00:00") + try: + publish_time = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S.%f%z") + except ValueError: + publish_time = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S%z") + return cls( msg=types_.PubsubMessageLike( # this class requires data. 
the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), message_id=envelope["message"].get("message_id"), - publish_time=envelope["message"].get("publish_time"), + publish_time=publish_time, ordering_key=envelope["message"].get("ordering_key"), ), schema_name=schema_name, diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index 494267f..a9ebee0 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -9,7 +9,7 @@ from attrs import define, field if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 + import datetime from pathlib import Path @@ -83,5 +83,5 @@ class PubsubMessageLike: data: bytes = field() attributes: dict = field(factory=dict) message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + publish_time: Optional["datetime.datetime"] = field(default=None) ordering_key: Optional[str] = field(default=None) From f430751ac175ab1855e58bb3303824acb953079f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 00:28:35 -0700 Subject: [PATCH 47/55] cleanup strings --- pittgoogle/registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index c8f14d7..8260ebe 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -49,19 +49,19 @@ def manifest(cls) -> dict: "elasticc.v0_9_1.alert": types_.Schema( name="elasticc.v0_9_1.alert", description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc", ), "elasticc.v0_9_1.brokerClassification": types_.Schema( name="elasticc.v0_9_1.brokerClassification", description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", ), "ztf": types_.Schema( name="ztf", description=( "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." # [TODO] + "both cases." ), path=None, ), From 5f9c0f148b53bf6aa1605598d3181662bbd3b4ce Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 01:45:24 -0700 Subject: [PATCH 48/55] move schemas manifest to yaml --- pittgoogle/registry.py | 62 +++++++---------------- pittgoogle/registry_manifests/schemas.yml | 16 ++++++ 2 files changed, 35 insertions(+), 43 deletions(-) create mode 100644 pittgoogle/registry_manifests/schemas.yml diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index 8260ebe..cdc50ad 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -4,6 +4,7 @@ import logging from typing import Final +import yaml from attrs import define from . 
import types_ @@ -12,6 +13,7 @@ LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) +SCHEMA_MANIFEST = yaml.safe_load((PACKAGE_DIR / "registry_manifests/schemas.yml").read_text()) @define(frozen=True) @@ -35,43 +37,6 @@ class ProjectIds: class Schemas: """Registry of schemas used by Pitt-Google.""" - @classmethod - def manifest(cls) -> dict: - """Return the dictionary defining the schemas in the registry.""" - # naming conventions: - # - schema names are expected to start with the name of the survey - # - if the survey has more than one schema, the survey name should be followed by a ".", - # followed by schema-specific specifier(s) - # - if an avro schema file is being registered with the schema (using the `path` arg), it is - # recommended that the file have the same name (path stem) as the schema. the file name - # must end with ".avsc". - return { - "elasticc.v0_9_1.alert": types_.Schema( - name="elasticc.v0_9_1.alert", - description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc", - ), - "elasticc.v0_9_1.brokerClassification": types_.Schema( - name="elasticc.v0_9_1.brokerClassification", - description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", - ), - "ztf": types_.Schema( - name="ztf", - description=( - "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " - "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." - ), - path=None, - ), - } - - @classmethod - def names(cls) -> list[str]: - """Return the names of all registered schemas.""" - return list(cls.manifest().keys()) - @classmethod def get(cls, schema_name: str) -> types_.Schema: """Return the registered schema called `schema_name`. @@ -81,10 +46,21 @@ def get(cls, schema_name: str) -> types_.Schema: :class:`pittgoogle.exceptions.SchemaNotFoundError` if a schema called `schema_name` is not found """ - # if there is no registered schema with this name, raise an error - schema = cls.manifest().get(schema_name) - if schema is None: - raise SchemaNotFoundError( - f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." + for schema in SCHEMA_MANIFEST: + if schema["name"] != schema_name: + continue + + return types_.Schema( + name=schema["name"], + description=schema["description"], + path=PACKAGE_DIR / schema["path"] if schema["path"] is not None else None, ) - return schema + + raise SchemaNotFoundError( + f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." + ) + + @classmethod + def names(cls) -> list[str]: + """Return the names of all registered schemas.""" + return [schema["name"] for schema in SCHEMA_MANIFEST] diff --git a/pittgoogle/registry_manifests/schemas.yml b/pittgoogle/registry_manifests/schemas.yml new file mode 100644 index 0000000..654dd52 --- /dev/null +++ b/pittgoogle/registry_manifests/schemas.yml @@ -0,0 +1,16 @@ +# Guidelines: +# - Schema names must start with the name of the survey. If the survey has more than one schema +# the survey name should be followed by a "." and then a schema-specific specifier(s). +# - If a schema file is also being registered (path key), it is recommended that the file have the +# same name (path stem) as the schema. 
Avro is the only file type currently implemented, and the file name +# must end with ".avsc". +# - The path must be relative to the package directory or null if no schema file is being registered. +- name: "elasticc.v0_9_1.alert" + description: "Avro schema of alerts published by ELAsTiCC." + path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc" +- name: "elasticc.v0_9_1.brokerClassification" + description: "Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts." + path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" +- name: "ztf" + description: "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers both cases." + path: null From 3da9628b7ad399462f3d46d1d17e10d7b82e10f6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 02:40:12 -0700 Subject: [PATCH 49/55] sort package imports --- pittgoogle/__init__.py | 7 +++---- pittgoogle/alert.py | 10 ++++------ pittgoogle/bigquery.py | 4 ++-- pittgoogle/pubsub.py | 4 ++-- pittgoogle/registry.py | 1 - pittgoogle/types_.py | 1 - 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 94d388a..2da4e88 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -9,13 +9,12 @@ except ImportError: # for Python<3.8 import importlib_metadata as metadata -from .auth import Auth +from . import alert, auth, bigquery, exceptions, pubsub, registry, types_, utils from .alert import Alert +from .auth import Auth from .bigquery import Table -from .pubsub import Topic, Subscription, Consumer +from .pubsub import Consumer, Subscription, Topic from .registry import ProjectIds, Schemas -from . import exceptions, types_, registry, utils, auth, alert, bigquery, pubsub - __version__ = metadata.version("pittgoogle-client") diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d8c8560..2771ed5 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -26,21 +26,19 @@ import logging from datetime import datetime from pathlib import Path -from typing import Any, TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import fastavro from attrs import define, field -from . import registry, types_ +from . import registry, types_, utils from .exceptions import BadRequest, OpenAlertError, SchemaNotFoundError -from .utils import Cast if TYPE_CHECKING: import google._upb._message import google.cloud.pubsub_v1 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) @@ -229,10 +227,10 @@ def dict(self) -> dict: # [TODO] this should be rewritten to catch specific errors # for now, just try avro then json, catching basically all errors in the process try: - self._dict = Cast.avro_to_dict(self.msg.data) + self._dict = utils.Cast.avro_to_dict(self.msg.data) except Exception: try: - self._dict = Cast.json_to_dict(self.msg.data) + self._dict = utils.Cast.json_to_dict(self.msg.data) except Exception: raise OpenAlertError("failed to deserialize the alert bytes") return self._dict diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index e7bf769..04cbf79 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -32,8 +32,8 @@ from attrs import define, field from attrs.validators import instance_of, optional -from . 
import Alert, Auth - +from .alert import Alert +from .auth import Auth LOGGER = logging.getLogger(__name__) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 5ec7c1a..4f8ecbe 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -94,10 +94,10 @@ def my_batch_callback(results): from attrs.validators import gt, instance_of, is_callable, optional from google.api_core.exceptions import NotFound -from . import Alert, Auth +from .alert import Alert +from .auth import Auth from .exceptions import SchemaNotFoundError - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index cdc50ad..29cb75f 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -10,7 +10,6 @@ from . import types_ from .exceptions import SchemaNotFoundError - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) SCHEMA_MANIFEST = yaml.safe_load((PACKAGE_DIR / "registry_manifests/schemas.yml").read_text()) diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index a9ebee0..104a769 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -12,7 +12,6 @@ import datetime from pathlib import Path - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) From 90dc68c74311f5cef7265d02bdfe361185dc23b6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 02:41:08 -0700 Subject: [PATCH 50/55] bugfix schema paths --- pittgoogle/registry_manifests/schemas.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/registry_manifests/schemas.yml b/pittgoogle/registry_manifests/schemas.yml index 654dd52..1929b99 100644 --- a/pittgoogle/registry_manifests/schemas.yml +++ b/pittgoogle/registry_manifests/schemas.yml @@ -7,10 +7,10 @@ # - The path must be relative to the package directory or null if no schema file is being registered. - name: "elasticc.v0_9_1.alert" description: "Avro schema of alerts published by ELAsTiCC." - path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc" + path: "schemas/elasticc/elasticc.v0_9_1.alert.avsc" - name: "elasticc.v0_9_1.brokerClassification" description: "Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts." - path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" + path: "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" - name: "ztf" description: "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers both cases." path: null From 5d82aaaf8741a9a63c30735649a9d8b56689a4b9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 02:54:17 -0800 Subject: [PATCH 51/55] add forced sources to alert.dataframe --- pittgoogle/alert.py | 20 +++++++++++++++++--- pittgoogle/schemas/maps/elasticc.yml | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 2771ed5..03dd017 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -242,9 +242,23 @@ def dataframe(self) -> "pd.DataFrame": import pandas as pd # always lazy-load pandas. 
it hogs memory on cloud functions and run - src_df = pd.DataFrame(self.get("source"), index=[0]) - prvs_df = pd.DataFrame(self.get("prv_sources")) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + # sources and previous sources are expected to have the same fields + sources_df = pd.DataFrame([self.get("source")] + self.get("prv_sources")) + # sources and forced sources may have different fields + forced_df = pd.DataFrame(self.get("prv_forced_sources")) + + # use nullable integer data type to avoid converting ints to floats + # for columns in one dataframe but not the other + sources_ints = [c for c, v in sources_df.dtypes.items() if v == int] + sources_df = sources_df.astype( + {c: "Int64" for c in set(sources_ints) - set(forced_df.columns)} + ) + forced_ints = [c for c, v in forced_df.dtypes.items() if v == int] + forced_df = forced_df.astype( + {c: "Int64" for c in set(forced_ints) - set(sources_df.columns)} + ) + + self._dataframe = pd.concat([sources_df, forced_df], ignore_index=True) return self._dataframe @property diff --git a/pittgoogle/schemas/maps/elasticc.yml b/pittgoogle/schemas/maps/elasticc.yml index 50852c1..7087ff4 100644 --- a/pittgoogle/schemas/maps/elasticc.yml +++ b/pittgoogle/schemas/maps/elasticc.yml @@ -8,6 +8,7 @@ objectid: [diaObject, diaObjectId] source: diaSource sourceid: [diaSource, diaSourceId] prv_sources: prvDiaSources +prv_forced_sources: prvDiaForcedSources mjd: midPointTai filter: filterName mag: magpsf From 8498ab19c8e792e3228835006fb17ac15371437f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 03:18:25 -0800 Subject: [PATCH 52/55] add projectid property --- pittgoogle/bigquery.py | 27 +++++++++++++++------------ pittgoogle/pubsub.py | 16 ++++++++++------ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index 04cbf79..d0e3144 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -50,20 +50,19 @@ class Table: Name of the BigQuery dataset this table belongs to. projectid : `str`, optional - The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this - if you are connecting to a subscription owned by a different project than this topic. Note: + The table owner's Google Cloud project ID. Either this or `auth` is required. Note: :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs. auth : :class:`pittgoogle.auth.Auth`, optional - Credentials for the Google Cloud project that owns this topic. If not provided, + Credentials for the Google Cloud project that owns this table. If not provided, it will be created from environment variables when needed. - client : `pubsub_v1.PublisherClient`, optional - Pub/Sub client that will be used to access the topic. If not provided, a new client will + client : `bigquery.Client`, optional + BigQuery client that will be used to access the table. If not provided, a new client will be created (using `auth`) the first time it is requested. """ name: str = field() dataset: str = field() - projectid: str = field(default=None) + _projectid: str = field(default=None) _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) _client: Optional[bigquery.Client] = field( default=None, validator=optional(instance_of(bigquery.Client)) @@ -115,27 +114,31 @@ def from_cloud( @property def auth(self) -> Auth: - """Credentials for the Google Cloud project that owns this topic. + """Credentials for the Google Cloud project that owns this table. 
This will be created from environment variables if `self._auth` is None. """ if self._auth is None: self._auth = Auth() - if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + if (self._projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self._projectid is not None): LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") - self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + self._projectid = self._auth.GOOGLE_CLOUD_PROJECT return self._auth @property def id(self) -> str: """Fully qualified table ID.""" - # make sure we have a projectid. if it needs to be set, call auth - if self.projectid is None: - self.auth return f"{self.projectid}.{self.dataset}.{self.name}" + @property + def projectid(self) -> str: + """The table owner's Google Cloud project ID.""" + if self._projectid is None: + self._projectid = self.auth.GOOGLE_CLOUD_PROJECT + return self._projectid + @property def table(self) -> bigquery.Table: """Return a BigQuery Table object that's connected to the table. Makes a get request if necessary.""" diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 4f8ecbe..dc99a07 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -174,7 +174,7 @@ class Topic: """ name: str = field() - projectid: str = field(default=None) + _projectid: str = field(default=None) _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) _client: Optional[pubsub_v1.PublisherClient] = field( default=None, validator=optional(instance_of(pubsub_v1.PublisherClient)) @@ -232,20 +232,24 @@ def auth(self) -> Auth: if self._auth is None: self._auth = Auth() - if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + if (self._projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self._projectid is not None): LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") - self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + self._projectid = self._auth.GOOGLE_CLOUD_PROJECT return self._auth @property def path(self) -> str: """Fully qualified path to the topic.""" - # make sure we have a projectid. if it needs to be set, call auth - if self.projectid is None: - self.auth return f"projects/{self.projectid}/topics/{self.name}" + @property + def projectid(self) -> str: + """The topic owner's Google Cloud project ID.""" + if self._projectid is None: + self._projectid = self.auth.GOOGLE_CLOUD_PROJECT + return self._projectid + @property def client(self) -> pubsub_v1.PublisherClient: """Pub/Sub client for topic access. 
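A note on the lazy-resolution pattern this patch introduces: neither Topic nor Table needs a project ID up front; the first access of .projectid falls through to .auth, which builds an Auth from environment variables and back-fills the ID. A minimal sketch of the intended call pattern (the topic name, project ID, and environment-variable value below are illustrative assumptions, not values taken from the patch):

    import os

    import pittgoogle

    # Assumption: Auth() reads GOOGLE_CLOUD_PROJECT from the environment the
    # first time credentials or the project ID are needed.
    os.environ["GOOGLE_CLOUD_PROJECT"] = "my-project-id"  # hypothetical

    # No projectid is passed at construction time.
    topic = pittgoogle.Topic(name="my-topic")  # hypothetical topic name

    # Accessing .projectid triggers .auth, which instantiates Auth() and caches
    # the resulting project ID on the instance; .path then composes normally.
    print(topic.projectid)  # "my-project-id"
    print(topic.path)       # "projects/my-project-id/topics/my-topic"

Table gets the same projectid property, so under the same assumptions a call like pittgoogle.Table(name="alerts", dataset="ztf").id (hypothetical names) would resolve to "my-project-id.ztf.alerts" the same way.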
From ae38ec5abe2359560c7d1f070da75ed852a1df5a Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 04:58:31 -0800 Subject: [PATCH 53/55] fix type hints --- pittgoogle/alert.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 03dd017..b6c2908 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -26,7 +26,7 @@ import logging from datetime import datetime from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Union import fastavro from attrs import define, field @@ -77,10 +77,10 @@ class Alert: Union["google.cloud.pubsub_v1.types.PubsubMessage", types_.PubsubMessageLike] ] = field(default=None) """Incoming Pub/Sub message object.""" - _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + _attributes: Optional[Union[Dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - _dict: Optional[dict] = field(default=None) + _dict: Optional[Dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: Optional[str] = field(default=None) _schema: Optional[types_.Schema] = field(default=None, init=False) @@ -88,7 +88,7 @@ class Alert: # ---- class methods ---- # @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: Optional[str] = None) -> "Alert": + def from_cloud_run(cls, envelope: Dict, schema_name: Optional[str] = None) -> "Alert": """Create an `Alert` from an HTTP request envelope containing a Pub/Sub message, as received by a Cloud Run module. Example code for a Cloud Run module that uses this method to open a ZTF alert: @@ -148,8 +148,8 @@ def index(): @classmethod def from_dict( cls, - payload: dict, - attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = None, + payload: Dict, + attributes: Optional[Union[Dict, "google._upb._message.ScalarMapContainer"]] = None, schema_name: Optional[str] = None, ) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a dictionary (`payload`).""" @@ -173,7 +173,7 @@ def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) -> # ---- properties ---- # @property - def attributes(self) -> dict: + def attributes(self) -> Dict: """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". If this was not set when the `Alert` was instantiated, a new dictionary will be created using @@ -187,7 +187,7 @@ def attributes(self) -> dict: return self._attributes @property - def dict(self) -> dict: + def dict(self) -> Dict: """Alert data as a dictionary. Created from `self.msg.data`, if needed. Raises From 5533dbe38fdceeeef533a4c141d8d578b0d6920d Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Thu, 6 Jun 2024 12:21:35 -0700 Subject: [PATCH 54/55] update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b3dd0..078c48e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Added +- `Alert` and `Table` classes. +- Registry for alert schemas and GCP Project IDs. +- Alert schemas (Avro) and schema maps (yaml). +- Exceptions: `BadRequest` and `SchemaNotFoundError`. +- Types: `PubsubMessageLike` and `Schema`. - ZTF Figures Tutorial ### Changed +- Update PubSub classes. 
- update README.md to point to the new docs - remove setup and requirements files that are no longer needed after switching away from Read The Docs @@ -23,6 +29,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - `figures` module (content moved to tutorial). This allowed the removal of the following explicit dependencies: `aplpy`, `matplotlib`, `numpy`. +- v0.1 BigQuery functions. ## \[0.2.0\] - 2023-07-02 From 36bf5290ef7ff020412201250f466ad164b0440d Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Thu, 6 Jun 2024 12:27:33 -0700 Subject: [PATCH 55/55] fix .md formatting --- CHANGELOG.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 078c48e..37c78cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,40 +12,40 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Added -- `Alert` and `Table` classes. -- Registry for alert schemas and GCP Project IDs. -- Alert schemas (Avro) and schema maps (yaml). -- Exceptions: `BadRequest` and `SchemaNotFoundError`. -- Types: `PubsubMessageLike` and `Schema`. -- ZTF Figures Tutorial +- `Alert` and `Table` classes. +- Registry for alert schemas and GCP Project IDs. +- Alert schemas (Avro) and schema maps (yaml). +- Exceptions: `BadRequest` and `SchemaNotFoundError`. +- Types: `PubsubMessageLike` and `Schema`. +- ZTF Figures Tutorial ### Changed -- Update PubSub classes. -- update README.md to point to the new docs -- remove setup and requirements files that are no longer needed after switching away from Read The Docs +- Update PubSub classes. +- update README.md to point to the new docs +- remove setup and requirements files that are no longer needed after switching away from Read The Docs ### Removed -- `figures` module (content moved to tutorial). This allowed the removal of the following explicit +- `figures` module (content moved to tutorial). This allowed the removal of the following explicit dependencies: `aplpy`, `matplotlib`, `numpy`. -- v0.1 BigQuery functions. +- v0.1 BigQuery functions. ## \[0.2.0\] - 2023-07-02 ### Added -- `auth` module supporting authentication via a service account or oauth2 -- `exceptions` module with class `OpenAlertError` -- "Overview" section in docs -- classes in `utils` module: `ProjectIds`, `Cast` -- files: `CHANGELOG.md`, `pittgoogle_env.yml` +- `auth` module supporting authentication via a service account or oauth2 +- `exceptions` module with class `OpenAlertError` +- "Overview" section in docs +- classes in `utils` module: `ProjectIds`, `Cast` +- files: `CHANGELOG.md`, `pittgoogle_env.yml` ### Changed -- Overhaul the `pubsub` module. Add classes `Topic`, `Subscription`, `Consumer`, `Alert`, +- Overhaul the `pubsub` module. Add classes `Topic`, `Subscription`, `Consumer`, `Alert`, `Response`. ### Fixed -- cleanup some issues flagged by Codacy +- cleanup some issues flagged by Codacy
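With the yaml manifest in place and the patch-50 path fix applied, the schema registry reworked earlier in this series is expected to behave roughly as sketched below. The printed name list comes from registry_manifests/schemas.yml; the rest is illustrative.

    import fastavro

    import pittgoogle

    # names() reads straight from registry_manifests/schemas.yml.
    print(pittgoogle.Schemas.names())
    # ['elasticc.v0_9_1.alert', 'elasticc.v0_9_1.brokerClassification', 'ztf']

    # get() returns a types_.Schema; an unknown name raises SchemaNotFoundError.
    schema = pittgoogle.Schemas.get("elasticc.v0_9_1.alert")

    # After the path bugfix, schema.path points inside the installed package,
    # so the registered Avro schema loads directly. The "ztf" entry registers
    # no file (path: null), so its path attribute is None.
    avro_schema = fastavro.schema.load_schema(schema.path)

Keeping the manifest in yaml means a new survey schema can be registered by editing schemas.yml alone, without touching the Python in registry.py.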