From 6d148561bd0a171b851245cb59bdbf8db2a533a8 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Wed, 2 Aug 2023 20:04:11 -0700
Subject: [PATCH 01/55] prep imports and module-level variables

---
 pittgoogle/pubsub.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 28b1ffc..7b32ed4 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -76,12 +76,17 @@ def my_batch_callback(results):
 ----
 """
+import importlib.resources
+import io
+import json
 import logging
 import queue
 from concurrent.futures import ThreadPoolExecutor
 from time import sleep
-from typing import Any, ByteString, Callable, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union
 
+import fastavro
+import yaml
 from attrs import converters, define, field
 from attrs.validators import gt, instance_of, is_callable, optional
 from google.api_core.exceptions import NotFound
 from google.cloud import pubsub_v1
@@ -91,7 +96,14 @@ def my_batch_callback(results):
 from .exceptions import OpenAlertError
 from .utils import Cast
 
+if TYPE_CHECKING:
+    import google.protobuf.timestamp_pb2
+    import google._upb._message
+    import pandas as pd
+
+
 LOGGER = logging.getLogger(__name__)
+PACKAGE_DIR = importlib.resources.files(__package__)
 
 
 def msg_callback_example(alert: "Alert") -> "Response":

From 48f67ef7731238759a5ed466f7eca2a8d7470286 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Wed, 2 Aug 2023 20:06:05 -0700
Subject: [PATCH 02/55] add methods to create, delete, publish to topic

---
 pittgoogle/pubsub.py | 95 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 86 insertions(+), 9 deletions(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 7b32ed4..650b056 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -158,18 +158,24 @@ class Topic:
     ------------
     name : `str`
         Name of the Pub/Sub topic.
-    projectid : `str`
-        The topic owner's Google Cloud project ID. Note: :attr:`pittgoogle.utils.ProjectIds`
-        is a registry containing Pitt-Google's project IDs.
+    projectid : `str`, optional
+        The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this
+        if the topic is owned by a different project than the one in your credentials. Note:
+        :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs.
+    auth : :class:`pittgoogle.auth.Auth`, optional
+        Credentials for the Google Cloud project that owns this topic. If not provided,
+        it will be created from environment variables when needed.
+    client : `pubsub_v1.PublisherClient`, optional
+        Pub/Sub client that will be used to access the topic. If not provided, a new client will
+        be created (using `auth`) the first time it is requested.
     """
 
     name: str = field()
-    projectid: str = field()
-
-    @property
-    def path(self) -> str:
-        """Fully qualified path to the topic."""
-        return f"projects/{self.projectid}/topics/{self.name}"
+    projectid: str = field(default=None)
+    _auth: Auth = field(default=None, validator=optional(instance_of(Auth)))
+    _client: Optional[pubsub_v1.PublisherClient] = field(
+        default=None, validator=optional(instance_of(pubsub_v1.PublisherClient))
+    )
 
     @classmethod
     def from_path(cls, path) -> "Topic":
@@ -177,6 +183,77 @@ def from_path(cls, path) -> "Topic":
         _, projectid, _, name = path.split("/")
         return cls(name, projectid)
 
+    @property
+    def auth(self) -> Auth:
+        """Credentials for the Google Cloud project that owns this topic.
+
+        This will be created from environment variables if `self._auth` is None.
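+
+        A minimal sketch of the lazy behavior (assumes Google Cloud credentials are
+        configured in your environment, e.g., via GOOGLE_CLOUD_PROJECT and
+        GOOGLE_APPLICATION_CREDENTIALS)::
+
+            topic = Topic(name="my-topic")
+            topic.auth  # first access creates an Auth and syncs self.projectid to it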
+ """ + if self._auth is None: + self._auth = Auth() + + if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") + self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + + return self._auth + + @property + def path(self) -> str: + """Fully qualified path to the topic.""" + # make sure we have a projectid. if it needs to be set, call auth + if self.projectid is None: + self.auth + return f"projects/{self.projectid}/topics/{self.name}" + + @property + def client(self) -> pubsub_v1.PublisherClient: + """Pub/Sub client for topic access. + + Will be created using `self.auth.credentials` if necessary. + """ + if self._client is None: + self._client = pubsub_v1.PublisherClient(credentials=self.auth.credentials) + return self._client + + def touch(self) -> None: + """Test the connection to the topic, creating it if necessary.""" + try: + self.client.get_topic(topic=self.path) + LOGGER.info(f"topic exists: {self.path}") + + except NotFound: + self.client.create_topic(name=self.path) + LOGGER.info(f"topic created: {self.path}") + + def delete(self) -> None: + """Delete the topic.""" + try: + self.client.delete_topic(topic=self.path) + except NotFound: + LOGGER.info(f"nothing to delete. topic not found: {self.path}") + else: + LOGGER.info(f"deleted topic: {self.path}") + + def publish(self, alert: "Alert", format="json") -> int: + """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`.""" + if format == "json": + message = json.dumps(alert.dict).encode("utf-8") + + elif format.startswith("elasticc"): + # load the avro schema and use it to serialize alert.dict + schema = fastavro.schema.load_schema(PACKAGE_DIR / f"schemas/elasticc/{format}.avsc") + fout = io.BytesIO() + fastavro.schemaless_writer(fout, schema, alert.dict) + fout.seek(0) + message = fout.getvalue() + + # attribute keys and values must be strings + attributes = {str(key): str(val) for key, val in alert.attributes.items()} + + future = self.client.publish(self.path, data=message, **attributes) + return future.result() + @define class Subscription: From cf2398e58660e4b98e389856df1c9196d3192d20 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:00:50 -0700 Subject: [PATCH 03/55] revamp Alert class add: attributes, dataframe, schema_name, schema_map remove: bytes (available as alert.msg.data), metadata (available as alert.msg.*) --- pittgoogle/pubsub.py | 165 ++++++++++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 40 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 650b056..2f187d0 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -120,6 +120,7 @@ def batch_callback_example(batch: list) -> None: def pull_batch( subscription: Union[str, "Subscription"], max_messages: int = 1, + schema_name: str = str(), **subscription_kwargs, ) -> List["Alert"]: """Pull a single batch of messages from the `subscription`. @@ -130,6 +131,10 @@ def pull_batch( Subscription to be pulled. If `str`, the name of the subscription. max_messages : `int` Maximum number of messages to be pulled. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alerts in the subscription. Passed to :class:`pittgoogle.pubsub.Alert` + for unpacking. If not provided, some properties of the `Alert` may not be available. 
subscription_kwargs Keyword arguments sent to :class:`pittgoogle.pubsub.Subscription`. Ignored if `subscription` is a :class:`pittgoogle.pubsub.Subscription`. @@ -272,6 +277,10 @@ class Subscription: client : `pubsub_v1.SubscriberClient`, optional Pub/Sub client that will be used to access the subscription. This kwarg is useful if you want to reuse a client. If None, a new client will be created. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alerts in the subscription. Passed to :class:`pittgoogle.pubsub.Alert` + for unpacking. If not provided, some properties of the `Alert` may not be available. """ name: str = field() @@ -280,6 +289,7 @@ class Subscription: _client: Optional[pubsub_v1.SubscriberClient] = field( default=None, validator=optional(instance_of(pubsub_v1.SubscriberClient)) ) + schema_name: str = field(factory=str) @property def projectid(self) -> str: @@ -544,63 +554,138 @@ class Alert: The message metadata. msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional The Pub/Sub message object, documented at - ``__. + ``__. + schema_name : `str` + One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". + Schema name of the alert. Used for unpacking. If not provided, some properties of the + `Alert` may not be available. """ - _bytes: Optional[ByteString] = field(default=None) + # _bytes: Optional[ByteString] = field(default=None) _dict: Optional[dict] = field(default=None) - _metadata: Optional[dict] = field(default=None) - msg: Optional["pubsub_v1.types.PubsubMessage"] = field(default=None) - """Original Pub/Sub message object.""" - - @property - def bytes(self) -> bytes: - """Message payload in original format (Avro or JSON serialized bytes).""" - if self._bytes is None: - # add try-except when we know what we're looking for - self._bytes = self.msg.data - if self._bytes is None: - # if we add a "path" attribute for the path to an avro file on disk - # we can load it like this: - # with open(self.path, "rb") as f: - # self._bytes = f.read() - pass - return self._bytes + _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + default=None + ) + # _metadata: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", "_PubsubMessageLike"]] = field( + default=None + ) + """Incoming Pub/Sub message object.""" + _dataframe: Optional["pd.DataFrame"] = field(default=None) + schema_name: str = field(factory=str) + _schema_map: Optional[dict] = field(default=None) + # _metadata: Optional[dict] = field(default=None) + + + # @property + # def bytes(self) -> bytes: + # """Message payload in original format (Avro or JSON serialized bytes).""" + # if self._bytes is None: + # # add try-except when we know what we're looking for + # self._bytes = self.msg.data + # if self._bytes is None: + # # if we add a "path" attribute for the path to an avro file on disk + # # we can load it like this: + # # with open(self.path, "rb") as f: + # # self._bytes = f.read() + # pass + # return self._bytes @property def dict(self) -> dict: - """Message payload as a dictionary. + """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. Raises ------ :class:`pittgoogle.exceptions.OpenAlertError` if unable to deserialize the alert bytes. 
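+
+        A usage sketch (assumes `msg` is an incoming Pub/Sub message carrying a ZTF alert)::
+
+            alert = Alert(msg=msg, schema_name="ztf")
+            alert.dict["objectId"]  # deserialized on first access, then cached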
""" - if self._dict is None: - # this should be rewritten to catch specific errors - # for now, just try avro then json, catching basically all errors in the process + if self._dict is not None: + return self._dict + + if self.schema_name.startswith("elasticc"): + # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict + schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" + schema = fastavro.schema.load_schema(schemapath) + with io.BytesIO(self.msg.data) as fin: + self._dict = fastavro.schemaless_reader(fin, schema) + return self._dict + + if self.schema_name == "": + LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") + + # assume this is a ztf or ztf-lite alert + # this should be rewritten to catch specific errors + # for now, just try avro then json, catching basically all errors in the process + try: + self._dict = Cast.avro_to_dict(self.msg.data) + except Exception: try: - self._dict = Cast.avro_to_dict(self.bytes) + self._dict = Cast.json_to_dict(self.msg.data) except Exception: - try: - self._dict = Cast.json_to_dict(self.bytes) - except Exception: - raise OpenAlertError("failed to deserialize the alert bytes") + raise OpenAlertError("failed to deserialize the alert bytes") return self._dict @property - def metadata(self) -> dict: - """Message metadata as a flat dictionary.""" - if self._metadata is None: - self._metadata = { - "message_id": self.msg.message_id, - "publish_time": self.msg.publish_time, - # ordering must be enabled on the subscription for this to be useful - "ordering_key": self.msg.ordering_key, - # flatten the dict containing our custom attributes - **self.msg.attributes, - } - return self._metadata + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". + + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes + + @property + def dataframe(self) -> "pd.DataFrame": + if self._dataframe is None: + import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + + if self.schema_name.endswith(".lite"): + src_df = pd.DataFrame(self.dict["source"], index=[0]) + prvs_df = pd.DataFrame(self.dict["prvSources"]) + else: + src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + + return self._dataframe + + @property + def schema_map(self) -> dict: + if self._schema_map is None: + if self.schema_name == str(): + raise TypeError("no alert schema_name provided. unable to load schema map.") + survey = self.schema_name.split(".")[0] + path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + self._schema_map = yaml.safe_load(path.read_text()) + return self._schema_map + + # @property + # def metadata(self) -> dict: + # """Pub/Sub message metadata. + + # Includes + + # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message + # - attributes, which is a dict that typically includes the attributes of the + # incoming message and possibly additional entries added by the user in the meantime. + + # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
+ # """ + # if self._metadata is None: + # self._metadata = { + # "message_id": self.msg.message_id, + # "publish_time": self.msg.publish_time, + # # ordering must be enabled on the subscription for this to be useful + # "ordering_key": self.msg.ordering_key, + # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc + # # typically includes self.msg.attributes plus additional items added by the user + # "attributes": self.attributes, + # } + # return self._metadata @define(kw_only=True, frozen=True) From dafec2d917d9abd68c52924032f0d4b142884ed9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:01:41 -0700 Subject: [PATCH 04/55] add schemas and schema maps --- pittgoogle/schema_maps/decat.yml | 17 ++++ pittgoogle/schema_maps/elasticc.yml | 18 +++++ pittgoogle/schema_maps/ztf.yml | 18 +++++ .../elasticc/elasticc.v0_9_1.alert.avsc | 17 ++++ .../elasticc.v0_9_1.brokerClassification.avsc | 35 ++++++++ .../elasticc.v0_9_1.diaForcedSource.avsc | 13 +++ .../elasticc.v0_9_1.diaNondetectionLimit.avsc | 11 +++ .../elasticc/elasticc.v0_9_1.diaObject.avsc | 79 +++++++++++++++++++ .../elasticc/elasticc.v0_9_1.diaSource.avsc | 16 ++++ 9 files changed, 224 insertions(+) create mode 100644 pittgoogle/schema_maps/decat.yml create mode 100644 pittgoogle/schema_maps/elasticc.yml create mode 100644 pittgoogle/schema_maps/ztf.yml create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc create mode 100644 pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc diff --git a/pittgoogle/schema_maps/decat.yml b/pittgoogle/schema_maps/decat.yml new file mode 100644 index 0000000..068839c --- /dev/null +++ b/pittgoogle/schema_maps/decat.yml @@ -0,0 +1,17 @@ +SURVEY: decat +SURVEY_SCHEMA: https://github.com/rknop/decat_schema +TOPIC_SYNTAX: decat_yyyymmdd_2021A-0113 # replace yyyymmdd with the date +FILTER_MAP: + g DECam SDSS c0001 4720.0 1520.0: g + r DECam SDSS c0002 6415.0 1480.0: r +objectId: objectid +prvSources: sources +source: triggersource +sourceId: sourceid +cutoutDifference: diffcutout +cutoutScience: scicutout +cutoutTemplate: refcutout +filter: filter +mag: mag +magerr: magerr +magzp: magzp diff --git a/pittgoogle/schema_maps/elasticc.yml b/pittgoogle/schema_maps/elasticc.yml new file mode 100644 index 0000000..a0f953f --- /dev/null +++ b/pittgoogle/schema_maps/elasticc.yml @@ -0,0 +1,18 @@ +SURVEY: elasticc +SURVEY_SCHEMA: https://github.com/LSSTDESC/elasticc/tree/main/alert_schema +SCHEMA_VERSION: v0_9_1 +TOPIC_SYNTAX: +FILTER_MAP: +objectId: [diaObject, diaObjectId] +prvSources: prvDiaSources +source: diaSource +sourceId: [diaSource, diaSourceId] +cutoutScience: none +filter: filterName +mag: magpsf +magerr: sigmapsf +magzp: magzpsci +psFlux: psFlux +psFluxErr: psFluxErr +dec: decl +ra: ra diff --git a/pittgoogle/schema_maps/ztf.yml b/pittgoogle/schema_maps/ztf.yml new file mode 100644 index 0000000..5f80e1e --- /dev/null +++ b/pittgoogle/schema_maps/ztf.yml @@ -0,0 +1,18 @@ +SURVEY: ztf +SURVEY_SCHEMA: https://zwickytransientfacility.github.io/ztf-avro-alert/schema.html +TOPIC_SYNTAX: ztf_yyyymmdd_programid1 # replace yyyymmdd with the date +FILTER_MAP: + 1: g + 2: r + 3: i +objectId: objectId +prvSources: 
prv_candidates +source: candidate +sourceId: candid +cutoutDifference: cutoutDifference +cutoutScience: cutoutScience +cutoutTemplate: cutoutTemplate +filter: fid +mag: magpsf +magerr: sigmapsf +magzp: magzpsci diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc new file mode 100644 index 0000000..d5b89ea --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.alert.avsc @@ -0,0 +1,17 @@ +{ + "namespace": "elasticc.v0_9_1", + "type": "record", + "name": "alert", + "doc": "sample avro alert schema v4.1", + "fields": [ + {"name": "alertId", "type": "long", "doc": "unique alert identifer"}, + {"name": "diaSource", "type": "elasticc.v0_9_1.diaSource"}, + {"name": "prvDiaSources", "type": ["null", { + "type": "array", + "items": "elasticc.v0_9_1.diaSource"}], "default": null}, + {"name": "prvDiaForcedSources", "type": ["null", { + "type": "array", + "items": "elasticc.v0_9_1.diaForcedSource"}], "default": null}, + {"name": "diaObject", "type": ["null", "elasticc.v0_9_1.diaObject"], "default": null} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc new file mode 100644 index 0000000..f975f9a --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc @@ -0,0 +1,35 @@ +{ + "namespace": "elasticc.v0_9_1", + "type": "record", + "name": "brokerClassfication", + "fields": [ + {"name": "alertId", "type": "long", "doc": "unique alert identifer"}, + {"name": "diaSourceId", "type": "long", "doc": "id of source that triggered this classification"}, + {"name": "elasticcPublishTimestamp", + "type": {"type": "long", "logicalType": "timestamp-millis"}, + "doc": "timestamp from originating ELAsTiCC alert" + }, + {"name": "brokerIngestTimestamp", + "type": ["null", {"type": "long", "logicalType": "timestamp-millis"}], + "doc": "timestamp of broker ingestion of ELAsTiCC alert" + }, + {"name": "brokerName", "type": "string", "doc": "Name of broker (never changes)" }, + {"name": "brokerVersion", "type": "string", "doc": "Version/Release of broker's software" }, + {"name": "classifierName", "type": "string", + "doc": "Name of classifier broker is using, including software version" }, + {"name": "classifierParams", "type": "string", + "doc": "Any classifier parameter information worth noting for this classification" }, + {"name": "classifications", "type": { + "type": "array", + "items": { + "type": "record", + "name": "classificationDict", + "fields": [ + {"name": "classId", "type": "int", "doc": "See https://github.com/LSSTDESC/elasticc/tree/main/taxonomy/taxonomy.ipynb for specification" }, + {"name": "probability", "type": "float", "doc": "0-1" } + ] + } + } + } + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc new file mode 100644 index 0000000..d5d180f --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaForcedSource.avsc @@ -0,0 +1,13 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaForcedSource", + "type": "record", + "fields": [ + {"name": "diaForcedSourceId", "type": "long"}, + {"name": "diaObjectId", "type": "long"}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "psFlux", "type": "float"}, + {"name": "psFluxErr", "type": "float"} + ] +} diff --git 
a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc new file mode 100644 index 0000000..2cffef3 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaNondetectionLimit.avsc @@ -0,0 +1,11 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaNondetectionLimit", + "type": "record", + "fields": [ + {"name": "ccdVisitId", "type": "long"}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "diaNoise", "type": "float"} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc new file mode 100644 index 0000000..5b65699 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaObject.avsc @@ -0,0 +1,79 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaObject", + "type": "record", + "fields": [ + {"name": "diaObjectId", "type": "long"}, + {"name": "simVersion", "type": ["null", "string"], "doc": "diaObject provenance"}, + {"name": "ra", "type": "double"}, + {"name": "decl", "type": "double"}, + {"name": "mwebv", "type": ["null", "float"], "default": null}, + {"name": "mwebv_err", "type": ["null", "float"], "default": null}, + {"name": "z_final", "type": ["null", "float"], "default": null}, + {"name": "z_final_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_ellipticity", "type": ["null", "float"], "default": null}, + {"name": "hostgal_sqradius", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zspec", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zspec_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q000", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q010", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q020", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q030", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q040", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q050", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q060", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q070", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q080", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q090", "type": ["null", "float"], "default": null}, + {"name": "hostgal_zphot_q100", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal_mag_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal_ra", "type": ["null", "float"], "default": null}, + {"name": "hostgal_dec", "type": ["null", "float"], "default": null}, + {"name": "hostgal_snsep", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_g", "type": ["null", "float"], "default": null}, + {"name": 
"hostgal_magerr_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal_magerr_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_ellipticity", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_sqradius", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zspec", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zspec_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_err", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q000", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q010", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q020", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q030", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q040", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q050", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q060", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q070", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q080", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q090", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_zphot_q100", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_mag_Y", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_ra", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_dec", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_snsep", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_u", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_g", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_r", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_i", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_z", "type": ["null", "float"], "default": null}, + {"name": "hostgal2_magerr_Y", "type": ["null", "float"], "default": null} + ] +} diff --git a/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc new file mode 100644 index 0000000..4906aa7 --- /dev/null +++ b/pittgoogle/schemas/elasticc/elasticc.v0_9_1.diaSource.avsc @@ -0,0 +1,16 @@ +{ + "namespace": "elasticc.v0_9_1", + "name": "diaSource", + "type": "record", + "fields": [ + {"name": "diaSourceId", "type": "long"}, + {"name": "diaObjectId", "type": ["null", "long"], "default": null}, + {"name": "midPointTai", "type": "double"}, + {"name": "filterName", "type": "string"}, + {"name": "ra", "type": "double"}, + {"name": "decl", "type": "double"}, + {"name": "psFlux", "type": "float"}, + {"name": "psFluxErr", "type": "float"}, + {"name": "snr", "type": "float"} + ] +} From 
16719c614d5eebf65a5fa8b4e62586f2ea660453 Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:05:26 -0700 Subject: [PATCH 05/55] add Alert class methods and _PubsubMessageLike --- pittgoogle/pubsub.py | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 2f187d0..82994fd 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -146,13 +146,15 @@ def pull_batch( {"subscription": subscription.path, "max_messages": max_messages} ) - message_list = [Alert(msg=msg.message) for msg in response.received_messages] - ack_ids = [msg.ack_id for msg in response.received_messages] + alerts = [ + Alert.from_msg(msg.message, schema_name=schema_name) for msg in response.received_messages + ] + ack_ids = [msg.ack_id for msg in response.received_messages] if len(ack_ids) > 0: subscription.client.acknowledge({"subscription": subscription.path, "ack_ids": ack_ids}) - return message_list + return alerts @define @@ -576,6 +578,23 @@ class Alert: _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) + @classmethod + def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" + return cls(msg=msg, schema_name=schema_name) + + @classmethod + def from_cloud_run(cls, envelope, schema_name=str()): + return cls( + msg=_PubsubMessageLike( + data=envelope["message"]["data"], + attributes=envelope["message"]["attributes"], + message_id=envelope["message"]["message_id"], + publish_time=envelope["message"]["publish_time"], + ordering_key=envelope["message"]["ordering_key"], + ), + schema_name=schema_name, + ) # @property # def bytes(self) -> bytes: @@ -708,3 +727,22 @@ class Response: ack: bool = field(default=True, converter=converters.to_bool) result: Any = field(default=None) + + +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. + """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) From d4fdba874a7c867c4259481b3e1beefa564553ee Mon Sep 17 00:00:00 2001 From: troyraen Date: Wed, 2 Aug 2023 22:10:32 -0700 Subject: [PATCH 06/55] add Subscription.pull_batch --- pittgoogle/pubsub.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 82994fd..498c187 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -367,6 +367,19 @@ def delete(self) -> None: else: LOGGER.info(f"deleted subscription: {self.path}") + def pull_batch(self, max_messages: int = 1) -> List["Alert"]: + """Pull a single batch of messages. + + Recommended for testing. 
Not recommended for long-running listeners (use the + :meth:`~Consumer.stream` method instead). + + Parameters + ---------- + max_messages : `int` + Maximum number of messages to be pulled. + """ + return pull_batch(self, max_messages=max_messages, schema_name=self.schema_name) + @define() class Consumer: @@ -533,7 +546,7 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: max_messages : `int` Maximum number of messages to be pulled. """ - return pull_batch(self.subscription, max_messages) + return self.subscription.pull_batch(max_messages=max_messages) @define(kw_only=True) From 426281e4009ec3103e73bd207691e9b12a5ec2b8 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:24:31 -0700 Subject: [PATCH 07/55] reorder classes to avoid forward reference --- pittgoogle/pubsub.py | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 498c187..ac8a541 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -549,6 +549,25 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: return self.subscription.pull_batch(max_messages=max_messages) +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. + """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) + + @define(kw_only=True) class Alert: """Pitt-Google container for a Pub/Sub message. @@ -582,9 +601,7 @@ class Alert: default=None ) # _metadata: Optional[dict] = field(default=None) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", "_PubsubMessageLike"]] = field( - default=None - ) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) """Incoming Pub/Sub message object.""" _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str) @@ -740,22 +757,3 @@ class Response: ack: bool = field(default=True, converter=converters.to_bool) result: Any = field(default=None) - - -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. 
- """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) From 793612b5e97f543e8276b5f899f09ddecaf7d2df Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:25:52 -0700 Subject: [PATCH 08/55] add Topic.from_cloud for client w/o explicit Auth --- pittgoogle/pubsub.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index ac8a541..ee8dc90 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -184,6 +184,14 @@ class Topic: default=None, validator=optional(instance_of(pubsub_v1.PublisherClient)) ) + @classmethod + def from_cloud(cls, name, *, projectid, testid=False): + """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`).""" + # if testid is not False, "False", or None, append the testid to the name + if testid and testid != "False": + name = f"{name}-{testid}" + return cls(name, projectid=projectid, client=pubsub_v1.PublisherClient()) + @classmethod def from_path(cls, path) -> "Topic": """Parse the `path` and return a new `Topic`.""" From d751e037ac7110f3c3a0fe10023ea27b11c838ea Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:26:50 -0700 Subject: [PATCH 09/55] check envelope for bad request --- pittgoogle/pubsub.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index ee8dc90..bf47f9b 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -615,6 +615,7 @@ class Alert: schema_name: str = field(factory=str) _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) + bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this @@ -622,7 +623,13 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this return cls(msg=msg, schema_name=schema_name) @classmethod - def from_cloud_run(cls, envelope, schema_name=str()): + def from_cloud_run(cls, envelope: dict, schema_name: str = str()): + # check whether received message is valid + if not envelope: + return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + if not isinstance(envelope, dict) or "message" not in envelope: + return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + return cls( msg=_PubsubMessageLike( data=envelope["message"]["data"], From e5503b3423aede91280bb3db9a719c7fb1643d63 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:27:16 -0700 Subject: [PATCH 10/55] add Alert.get --- pittgoogle/pubsub.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index bf47f9b..c2f0844 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -655,6 +655,33 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()): # pass # return self._bytes + def get(self, schema_key: str, return_key_name=False): + # fullkey = self.get(key, return_key=True) + survey_key = self.schema_map.get(schema_key) + + if return_key_name: + if isinstance(survey_key, list): + return survey_key[-1] + return survey_key + + if schema_key in self.dict: + return self.dict.get(schema_key) + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not 
isinstance(survey_key, list): + return + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + @property def dict(self) -> dict: """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. From 878a281f0a6e7f3ca94fed9930ab6420cedf7f8f Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 3 Aug 2023 12:28:00 -0700 Subject: [PATCH 11/55] clarify docstring --- pittgoogle/pubsub.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index c2f0844..16ab71c 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -251,7 +251,10 @@ def delete(self) -> None: LOGGER.info(f"deleted topic: {self.path}") def publish(self, alert: "Alert", format="json") -> int: - """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`.""" + """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`. + + `format` can be "json" or a schema name. + """ if format == "json": message = json.dumps(alert.dict).encode("utf-8") From 60cbc8d3a22122fa6cee1f541736535dc38753ac Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 15:38:12 -0700 Subject: [PATCH 12/55] move schema maps under the schemas directory --- .../{schema_maps => schemas/maps}/decat.yml | 12 ++++++------ .../{schema_maps => schemas/maps}/elasticc.yml | 18 +++++++++++------- .../{schema_maps => schemas/maps}/ztf.yml | 12 ++++++------ 3 files changed, 23 insertions(+), 19 deletions(-) rename pittgoogle/{schema_maps => schemas/maps}/decat.yml (68%) rename pittgoogle/{schema_maps => schemas/maps}/elasticc.yml (52%) rename pittgoogle/{schema_maps => schemas/maps}/ztf.yml (63%) diff --git a/pittgoogle/schema_maps/decat.yml b/pittgoogle/schemas/maps/decat.yml similarity index 68% rename from pittgoogle/schema_maps/decat.yml rename to pittgoogle/schemas/maps/decat.yml index 068839c..c150e38 100644 --- a/pittgoogle/schema_maps/decat.yml +++ b/pittgoogle/schemas/maps/decat.yml @@ -4,13 +4,13 @@ TOPIC_SYNTAX: decat_yyyymmdd_2021A-0113 # replace yyyymmdd with the date FILTER_MAP: g DECam SDSS c0001 4720.0 1520.0: g r DECam SDSS c0002 6415.0 1480.0: r -objectId: objectid -prvSources: sources +objectid: objectid +prv_sources: sources source: triggersource -sourceId: sourceid -cutoutDifference: diffcutout -cutoutScience: scicutout -cutoutTemplate: refcutout +sourceid: sourceid +cutout_difference: diffcutout +cutout_science: scicutout +cutout_template: refcutout filter: filter mag: mag magerr: magerr diff --git a/pittgoogle/schema_maps/elasticc.yml b/pittgoogle/schemas/maps/elasticc.yml similarity index 52% rename from pittgoogle/schema_maps/elasticc.yml rename to pittgoogle/schemas/maps/elasticc.yml index a0f953f..50852c1 100644 --- a/pittgoogle/schema_maps/elasticc.yml +++ b/pittgoogle/schemas/maps/elasticc.yml @@ -3,16 +3,20 @@ SURVEY_SCHEMA: https://github.com/LSSTDESC/elasticc/tree/main/alert_schema SCHEMA_VERSION: v0_9_1 TOPIC_SYNTAX: FILTER_MAP: -objectId: [diaObject, diaObjectId] -prvSources: prvDiaSources +alertid: alertId +objectid: [diaObject, diaObjectId] source: diaSource -sourceId: [diaSource, diaSourceId] -cutoutScience: none +sourceid: [diaSource, diaSourceId] +prv_sources: prvDiaSources +mjd: midPointTai filter: filterName mag: magpsf magerr: sigmapsf magzp: magzpsci -psFlux: psFlux 
-psFluxErr: psFluxErr -dec: decl +flux: psFlux +fluxerr: psFluxErr ra: ra +dec: decl +cutout_science: +cutout_template: +cutout_difference: diff --git a/pittgoogle/schema_maps/ztf.yml b/pittgoogle/schemas/maps/ztf.yml similarity index 63% rename from pittgoogle/schema_maps/ztf.yml rename to pittgoogle/schemas/maps/ztf.yml index 5f80e1e..4aaf800 100644 --- a/pittgoogle/schema_maps/ztf.yml +++ b/pittgoogle/schemas/maps/ztf.yml @@ -5,13 +5,13 @@ FILTER_MAP: 1: g 2: r 3: i -objectId: objectId -prvSources: prv_candidates +objectid: objectId +prv_sources: prv_candidates source: candidate -sourceId: candid -cutoutDifference: cutoutDifference -cutoutScience: cutoutScience -cutoutTemplate: cutoutTemplate +sourceid: candid +cutout_difference: cutoutDifference +cutout_science: cutoutScience +cutout_template: cutoutTemplate filter: fid mag: magpsf magerr: sigmapsf From 80d74b812ca4d17443c463d407da6a3ebb08dcf2 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 16:20:00 -0700 Subject: [PATCH 13/55] move Alert class and related to independent module --- pittgoogle/alert.py | 281 +++++++++++++++++++++++++++++++++++++++++++ pittgoogle/pubsub.py | 226 +--------------------------------- 2 files changed, 283 insertions(+), 224 deletions(-) create mode 100644 pittgoogle/alert.py diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py new file mode 100644 index 0000000..8b934bf --- /dev/null +++ b/pittgoogle/alert.py @@ -0,0 +1,281 @@ +# -*- coding: UTF-8 -*- +"""Classes to facilitate working with astronomical alerts. + +.. contents:: + :local: + :depth: 2 + +Usage Examples +--------------- + +.. code-block:: python + + import pittgoogle + +Load an alert from disk: + +.. code-block:: python + + [TODO] + +Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: + +.. code-block:: python + + # flask is used to work with HTTP requests, which trigger Cloud Run modules + # the request contains the Pub/Sub message, which contains the alert packet + from flask import request + + alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + +API +---- + +""" +import importlib.resources +import io +import logging +from typing import TYPE_CHECKING, Optional, Union + +import fastavro +import yaml +from attrs import define, field +from google.cloud import pubsub_v1 + +from .exceptions import OpenAlertError +from .utils import Cast + +if TYPE_CHECKING: + import google.protobuf.timestamp_pb2 + import google._upb._message + import pandas as pd + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(frozen=True) +class _PubsubMessageLike: + """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. + + It is convenient for the `Alert` class to work with a message as a + `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do + not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or + an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class + is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. + This object is then assigned to the `msg` attribute of the `Alert`. 
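+
+    A hypothetical sketch of wrapping an alert packet loaded from disk (the file name is
+    illustrative)::
+
+        with open("ztf_alert.avro", "rb") as f:
+            msg_like = _PubsubMessageLike(data=f.read())
+        alert = Alert(msg=msg_like, schema_name="ztf")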
+ """ + + data: bytes = field() + attributes: dict = field(factory=dict) + message_id: Optional[str] = field(default=None) + publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + ordering_key: Optional[str] = field(default=None) + + +@define(kw_only=True) +class Alert: + """Pitt-Google container for an astronomical alert. + + Alerts are typically loaded from a Pub/Sub message but may also be loaded from a file. + It is recommended to instantiate an `Alert` using one of the `from_*` methods. + + All parameters are keyword only. + + Parameters + ------------ + bytes : `bytes`, optional + The message payload, as returned by Pub/Sub. It may be Avro or JSON serialized depending + on the topic. + dict : `dict`, optional + The message payload as a dictionary. + metadata : `dict`, optional + The message metadata. + msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional + The Pub/Sub message object, documented at + ``__. + schema_name : `str` + One of (case insensitive): + - ztf + - ztf.lite + - elasticc.v0_9_1.alert + - elasticc.v0_9_1.brokerClassification + Schema name of the alert. Used for unpacking. If not provided, some properties of the + `Alert` may not be available. + """ + + # _bytes: Optional[ByteString] = field(default=None) + _dict: Optional[dict] = field(default=None) + _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + default=None + ) + # _metadata: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + """Incoming Pub/Sub message object.""" + _dataframe: Optional["pd.DataFrame"] = field(default=None) + schema_name: str = field(factory=str, converter=str.lower) + _schema_map: Optional[dict] = field(default=None) + # _metadata: Optional[dict] = field(default=None) + bad_request: Union[bool, tuple[str, int]] = field(default=False) + + @classmethod + def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" + return cls(msg=msg, schema_name=schema_name) + + @classmethod + def from_cloud_run(cls, envelope: dict, schema_name: str = str()): + # check whether received message is valid + if not envelope: + return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + if not isinstance(envelope, dict) or "message" not in envelope: + return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + + return cls( + msg=_PubsubMessageLike( + data=envelope["message"]["data"], + attributes=envelope["message"]["attributes"], + message_id=envelope["message"]["message_id"], + publish_time=envelope["message"]["publish_time"], + ordering_key=envelope["message"]["ordering_key"], + ), + schema_name=schema_name, + ) + + # @property + # def bytes(self) -> bytes: + # """Message payload in original format (Avro or JSON serialized bytes).""" + # if self._bytes is None: + # # add try-except when we know what we're looking for + # self._bytes = self.msg.data + # if self._bytes is None: + # # if we add a "path" attribute for the path to an avro file on disk + # # we can load it like this: + # # with open(self.path, "rb") as f: + # # self._bytes = f.read() + # pass + # return self._bytes + + def get(self, schema_key: str, return_key_name=False): + # fullkey = self.get(key, return_key=True) + survey_key = self.schema_map.get(schema_key) + + if return_key_name: + if isinstance(survey_key, list): + return survey_key[-1] + return survey_key + + 
if schema_key in self.dict: + return self.dict.get(schema_key) + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not isinstance(survey_key, list): + return + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + + @property + def dict(self) -> dict: + """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. + + Raises + ------ + :class:`pittgoogle.exceptions.OpenAlertError` + if unable to deserialize the alert bytes. + """ + if self._dict is not None: + return self._dict + + if self.schema_name.startswith("elasticc"): + # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict + schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" + schema = fastavro.schema.load_schema(schemapath) + with io.BytesIO(self.msg.data) as fin: + self._dict = fastavro.schemaless_reader(fin, schema) + return self._dict + + if self.schema_name == "": + LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") + + # assume this is a ztf or ztf-lite alert + # this should be rewritten to catch specific errors + # for now, just try avro then json, catching basically all errors in the process + try: + self._dict = Cast.avro_to_dict(self.msg.data) + except Exception: + try: + self._dict = Cast.json_to_dict(self.msg.data) + except Exception: + raise OpenAlertError("failed to deserialize the alert bytes") + return self._dict + + @property + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". + + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes + + @property + def dataframe(self) -> "pd.DataFrame": + if self._dataframe is None: + import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + + if self.schema_name.endswith(".lite"): + src_df = pd.DataFrame(self.dict["source"], index=[0]) + prvs_df = pd.DataFrame(self.dict["prvSources"]) + else: + src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + + return self._dataframe + + @property + def schema_map(self) -> dict: + if self._schema_map is None: + if self.schema_name == str(): + raise TypeError("no alert schema_name provided. unable to load schema map.") + survey = self.schema_name.split(".")[0] + path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + self._schema_map = yaml.safe_load(path.read_text()) + return self._schema_map + + # @property + # def metadata(self) -> dict: + # """Pub/Sub message metadata. + + # Includes + + # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message + # - attributes, which is a dict that typically includes the attributes of the + # incoming message and possibly additional entries added by the user in the meantime. + + # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
+ # """ + # if self._metadata is None: + # self._metadata = { + # "message_id": self.msg.message_id, + # "publish_time": self.msg.publish_time, + # # ordering must be enabled on the subscription for this to be useful + # "ordering_key": self.msg.ordering_key, + # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc + # # typically includes self.msg.attributes plus additional items added by the user + # "attributes": self.attributes, + # } + # return self._metadata diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 16ab71c..f0430c0 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -92,7 +92,7 @@ def my_batch_callback(results): from google.api_core.exceptions import NotFound from google.cloud import pubsub_v1 -from .auth import Auth +from . import Auth, Alert from .exceptions import OpenAlertError from .utils import Cast @@ -106,7 +106,7 @@ def my_batch_callback(results): PACKAGE_DIR = importlib.resources.files(__package__) -def msg_callback_example(alert: "Alert") -> "Response": +def msg_callback_example(alert: Alert) -> "Response": print(f"processing message: {alert.metadata['message_id']}") return Response(ack=True, result=alert.dict) @@ -560,228 +560,6 @@ def pull_batch(self, max_messages: int = 1) -> List["Alert"]: return self.subscription.pull_batch(max_messages=max_messages) -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. - """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) - - -@define(kw_only=True) -class Alert: - """Pitt-Google container for a Pub/Sub message. - - Typical usage is to instantiate an `Alert` using only a `msg`, and then the other attributes - will be automatically extracted and returned (lazily). - - All parameters are keyword only. - - Parameters - ------------ - bytes : `bytes`, optional - The message payload, as returned by Pub/Sub. It may be Avro or JSON serialized depending - on the topic. - dict : `dict`, optional - The message payload as a dictionary. - metadata : `dict`, optional - The message metadata. - msg : `google.cloud.pubsub_v1.types.PubsubMessage`, optional - The Pub/Sub message object, documented at - ``__. - schema_name : `str` - One of "ztf", "ztf.lite", "elasticc.v0_9_1.alert", "elasticc.v0_9_1.brokerClassification". - Schema name of the alert. Used for unpacking. If not provided, some properties of the - `Alert` may not be available. 
- """ - - # _bytes: Optional[ByteString] = field(default=None) - _dict: Optional[dict] = field(default=None) - _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( - default=None - ) - # _metadata: Optional[dict] = field(default=None) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) - """Incoming Pub/Sub message object.""" - _dataframe: Optional["pd.DataFrame"] = field(default=None) - schema_name: str = field(factory=str) - _schema_map: Optional[dict] = field(default=None) - # _metadata: Optional[dict] = field(default=None) - bad_request: Union[bool, tuple[str, int]] = field(default=False) - - @classmethod - def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this - """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" - return cls(msg=msg, schema_name=schema_name) - - @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()): - # check whether received message is valid - if not envelope: - return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) - if not isinstance(envelope, dict) or "message" not in envelope: - return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) - - return cls( - msg=_PubsubMessageLike( - data=envelope["message"]["data"], - attributes=envelope["message"]["attributes"], - message_id=envelope["message"]["message_id"], - publish_time=envelope["message"]["publish_time"], - ordering_key=envelope["message"]["ordering_key"], - ), - schema_name=schema_name, - ) - - # @property - # def bytes(self) -> bytes: - # """Message payload in original format (Avro or JSON serialized bytes).""" - # if self._bytes is None: - # # add try-except when we know what we're looking for - # self._bytes = self.msg.data - # if self._bytes is None: - # # if we add a "path" attribute for the path to an avro file on disk - # # we can load it like this: - # # with open(self.path, "rb") as f: - # # self._bytes = f.read() - # pass - # return self._bytes - - def get(self, schema_key: str, return_key_name=False): - # fullkey = self.get(key, return_key=True) - survey_key = self.schema_map.get(schema_key) - - if return_key_name: - if isinstance(survey_key, list): - return survey_key[-1] - return survey_key - - if schema_key in self.dict: - return self.dict.get(schema_key) - - if isinstance(survey_key, str): - return self.dict.get(survey_key) - - if not isinstance(survey_key, list): - return - - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) - - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) - - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) - - @property - def dict(self) -> dict: - """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. - - Raises - ------ - :class:`pittgoogle.exceptions.OpenAlertError` - if unable to deserialize the alert bytes. - """ - if self._dict is not None: - return self._dict - - if self.schema_name.startswith("elasticc"): - # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict - schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" - schema = fastavro.schema.load_schema(schemapath) - with io.BytesIO(self.msg.data) as fin: - self._dict = fastavro.schemaless_reader(fin, schema) - return self._dict - - if self.schema_name == "": - LOGGER.warning("no alert schema_name provided. 
attempting to deserialize without it.") - - # assume this is a ztf or ztf-lite alert - # this should be rewritten to catch specific errors - # for now, just try avro then json, catching basically all errors in the process - try: - self._dict = Cast.avro_to_dict(self.msg.data) - except Exception: - try: - self._dict = Cast.json_to_dict(self.msg.data) - except Exception: - raise OpenAlertError("failed to deserialize the alert bytes") - return self._dict - - @property - def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: - """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - - If None, this will be set to `self.msg.attributes`. - Update as desired. - When publishing, this will be sent as the message attributes. - """ - if self._attributes is None: - self._attributes = self.msg.attributes - return self._attributes - - @property - def dataframe(self) -> "pd.DataFrame": - if self._dataframe is None: - import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run - - if self.schema_name.endswith(".lite"): - src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prvSources"]) - else: - src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) - - return self._dataframe - - @property - def schema_map(self) -> dict: - if self._schema_map is None: - if self.schema_name == str(): - raise TypeError("no alert schema_name provided. unable to load schema map.") - survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schema_maps/{survey}.yml" - self._schema_map = yaml.safe_load(path.read_text()) - return self._schema_map - - # @property - # def metadata(self) -> dict: - # """Pub/Sub message metadata. - - # Includes - - # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message - # - attributes, which is a dict that typically includes the attributes of the - # incoming message and possibly additional entries added by the user in the meantime. - - # *To be useful, ordering_key requires that ordering is enabled on the subscription. - # """ - # if self._metadata is None: - # self._metadata = { - # "message_id": self.msg.message_id, - # "publish_time": self.msg.publish_time, - # # ordering must be enabled on the subscription for this to be useful - # "ordering_key": self.msg.ordering_key, - # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc - # # typically includes self.msg.attributes plus additional items added by the user - # "attributes": self.attributes, - # } - # return self._metadata - - @define(kw_only=True, frozen=True) class Response: """Container for a response, to be returned by a :meth:`pittgoogle.pubsub.Consumer.msg_callback`. 
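The `Alert.get` logic carried by the patch above resolves Pitt-Google's generic field names through a survey-specific schema map, walking up to three levels of nesting in the alert dictionary. Below is a minimal, self-contained sketch of that lookup; the map entries and the alert payload are hypothetical ZTF-like values, not taken from the package's actual schema maps.

.. code-block:: python

    # hedged sketch of the schema-map lookup in `Alert.get`; all names here are hypothetical
    schema_map = {"objectid": "objectId", "mjd": ["candidate", "jd"]}
    alert_dict = {"objectId": "ZTF23abcdefg", "candidate": {"jd": 2460000.5}}

    def get(key, default=None):
        survey_key = schema_map.get(key)  # str, list[str], or None
        if isinstance(survey_key, str):
            return alert_dict.get(survey_key, default)
        if isinstance(survey_key, list):  # walk one level of nesting per list element
            value = alert_dict
            for name in survey_key:
                if not isinstance(value, dict):
                    return default
                value = value.get(name, default)
            return value
        return default

    assert get("objectid") == "ZTF23abcdefg"
    assert get("mjd") == 2460000.5
    assert get("unknown", "n/a") == "n/a"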
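Likewise, the `dataframe` property concatenates the alert's triggering source with its previous sources into a single light-curve table. A rough equivalent, under the same assumption of made-up ZTF-like field names:

.. code-block:: python

    import pandas as pd

    # hypothetical ZTF-like payload; the real keys come from the survey's schema map
    alert_dict = {
        "candidate": {"jd": 2460000.5, "magpsf": 18.3},  # the source that triggered the alert
        "prv_candidates": [
            {"jd": 2459999.5, "magpsf": 18.5},
            {"jd": 2459998.5, "magpsf": 18.6},
        ],  # previous detections of the same object
    }

    src_df = pd.DataFrame(alert_dict["candidate"], index=[0])  # single-row frame
    prvs_df = pd.DataFrame(alert_dict["prv_candidates"])  # one row per previous detection
    lightcurve = pd.concat([src_df, prvs_df], ignore_index=True)  # three rows total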
From 515f8083842209c6a31b895c4aa318c9d19bfc23 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:20:42 -0700 Subject: [PATCH 14/55] implement BadRequest exception --- pittgoogle/alert.py | 14 ++++++++------ pittgoogle/exceptions.py | 4 ++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 8b934bf..2103e9e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -42,7 +42,7 @@ from attrs import define, field from google.cloud import pubsub_v1 -from .exceptions import OpenAlertError +from .exceptions import BadRequest, OpenAlertError from .utils import Cast if TYPE_CHECKING: @@ -117,7 +117,7 @@ class Alert: schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) # _metadata: Optional[dict] = field(default=None) - bad_request: Union[bool, tuple[str, int]] = field(default=False) + # bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this @@ -125,12 +125,14 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this return cls(msg=msg, schema_name=schema_name) @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()): - # check whether received message is valid + def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": + # check whether received message is valid, as suggested by Cloud Run docs if not envelope: - return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + # return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) + raise BadRequest("Bad Request: no Pub/Sub message received") if not isinstance(envelope, dict) or "message" not in envelope: - return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + # return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) + raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( msg=_PubsubMessageLike( diff --git a/pittgoogle/exceptions.py b/pittgoogle/exceptions.py index 1c2f58f..a28eef6 100644 --- a/pittgoogle/exceptions.py +++ b/pittgoogle/exceptions.py @@ -1,3 +1,7 @@ # -*- coding: UTF-8 -*- +class BadRequest(Exception): + """Raised when a Flask request json envelope (e.g., from Cloud Run) is invalid.""" + + class OpenAlertError(Exception): """Raised when unable to deserialize a Pub/Sub message payload.""" From 09e2738d870266756500ad9049124ec1b5cd4d25 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:26:24 -0700 Subject: [PATCH 15/55] separate get_key from get and support defaults --- pittgoogle/alert.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 2103e9e..dc38520 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -159,23 +159,19 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # pass # return self._bytes - def get(self, schema_key: str, return_key_name=False): - # fullkey = self.get(key, return_key=True) - survey_key = self.schema_map.get(schema_key) + def get(self, key: str, default: Optional[str] = None): + # if key is found in self.dict, just return the corresponding value + if key in self.dict: + return self.dict.get(key) - if return_key_name: - if isinstance(survey_key, list): - return survey_key[-1] - return survey_key - - if schema_key in self.dict: - return 
self.dict.get(schema_key) + # lookup the key in the schema map + survey_key = self.schema_map.get(key) # str or list[str] if isinstance(survey_key, str): return self.dict.get(survey_key) if not isinstance(survey_key, list): - return + return default if len(survey_key) == 1: return self.dict.get(survey_key[0]) @@ -186,6 +182,23 @@ def get(self, schema_key: str, return_key_name=False): if len(survey_key) == 3: return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + def get_key(self, key, name_only: bool = True): + if key in self.dict: + return key + + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return survey_key + + if not isinstance(survey_key, list): + return + + if name_only: + return survey_key[-1] + + return survey_key + @property def dict(self) -> dict: """Message payload as a dictionary. Created from `self.msg.data` and `self.schema_name`, if needed. From b30ec398afcd89594a44b2eba1c3d7b42f1b95a9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:35:46 -0700 Subject: [PATCH 16/55] change schema_map keys to snake_case yaml files were updated in 9c9a140a2a439016cdb97914dafc97984690cadf --- pittgoogle/alert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index dc38520..8a3e383 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -253,10 +253,10 @@ def dataframe(self) -> "pd.DataFrame": if self.schema_name.endswith(".lite"): src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prvSources"]) + prvs_df = pd.DataFrame(self.dict["prv_sources"]) else: src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prvSources"]]) + prvs_df = pd.DataFrame(self.dict[self.schema_map["prv_sources"]]) self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) return self._dataframe From 5d24cbbbe1d10347acaaa9899f2c990ddef87034 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 17:50:24 -0700 Subject: [PATCH 17/55] add properties for objectid, sourceid, alertid --- pittgoogle/alert.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 8a3e383..5d63011 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -159,6 +159,21 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # pass # return self._bytes + @property + def alertid(self) -> Union[str, int]: + """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" + return self.get("alertid", self.sourceid) + + @property + def sourceid(self) -> Union[str, int]: + """Convenience property for the source ID. The "source" is the detection that triggered the alert.""" + return self.get("sourceid") + + @property + def objectid(self) -> Union[str, int]: + """Convenience property for the object ID. 
The "object" represents a collection of sources, as determined by the survey.""" + return self.get("objectid") + def get(self, key: str, default: Optional[str] = None): # if key is found in self.dict, just return the corresponding value if key in self.dict: From 9f803fc9257b060068997481af9386fd1b25b8be Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:49:20 -0700 Subject: [PATCH 18/55] update cloud run example, be lenient with metadata --- pittgoogle/alert.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 5d63011..91e7bd3 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -22,11 +22,29 @@ .. code-block:: python + import pittgoogle # flask is used to work with HTTP requests, which trigger Cloud Run modules # the request contains the Pub/Sub message, which contains the alert packet - from flask import request + from flask import Flask, request + + app = Flask(__name__) + + # function that receives the request + @app.route("/", methods=["POST"]) + def index(): + + try: + # unpack the alert + # if the request does not contain a valid message, this raises a `BadRequest` + alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + + except pg.exceptions.BadRequest as err: + # return the error text and an HTTP 400 Bad Request code + return err.text, 400 - alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") + # continue processing the alert + # when finished, return an empty string and an HTTP success code + return "", 204 API ---- @@ -136,11 +154,12 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": return cls( msg=_PubsubMessageLike( + # data is required. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], - attributes=envelope["message"]["attributes"], - message_id=envelope["message"]["message_id"], - publish_time=envelope["message"]["publish_time"], - ordering_key=envelope["message"]["ordering_key"], + attributes=envelope["message"].get("attributes"), + message_id=envelope["message"].get("message_id"), + publish_time=envelope["message"].get("publish_time"), + ordering_key=envelope["message"].get("ordering_key"), ), schema_name=schema_name, ) From a9f4290b452699ebe1ec5e8b64659d3891be9c9c Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:50:16 -0700 Subject: [PATCH 19/55] add from_path class method --- pittgoogle/alert.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 91e7bd3..17753e2 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -8,15 +8,14 @@ Usage Examples --------------- -.. code-block:: python - - import pittgoogle - Load an alert from disk: .. 
code-block:: python - [TODO] + import pittgoogle + + path = "path/to/ztf_alert.avro" # point this to a file containing an alert + alert = pittgoogle.Alert.from_path(path, schema_name="ztf") Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: @@ -164,6 +163,12 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": schema_name=schema_name, ) + @classmethod + def from_path(cls, path, schema_name=str()) -> "Alert": + with open(path, "rb") as f: + bytes = f.read() + return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + # @property # def bytes(self) -> bytes: # """Message payload in original format (Avro or JSON serialized bytes).""" From 07b591db789cae605cdfebda755a74f9831bf246 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 18:50:54 -0700 Subject: [PATCH 20/55] bugfix schema maps dir --- pittgoogle/alert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 17753e2..d6fb835 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -306,7 +306,7 @@ def schema_map(self) -> dict: if self.schema_name == str(): raise TypeError("no alert schema_name provided. unable to load schema map.") survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schema_maps/{survey}.yml" + path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map From 1a373b50897e36505549036253d436f01c31d93f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 19:20:26 -0700 Subject: [PATCH 21/55] cleanup --- pittgoogle/alert.py | 50 +++------------------------------------------ 1 file changed, 3 insertions(+), 47 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d6fb835..fd4b23e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -65,7 +65,7 @@ def index(): if TYPE_CHECKING: import google.protobuf.timestamp_pb2 import google._upb._message - import pandas as pd + import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run LOGGER = logging.getLogger(__name__) @@ -122,22 +122,18 @@ class Alert: `Alert` may not be available. 
""" - # _bytes: Optional[ByteString] = field(default=None) _dict: Optional[dict] = field(default=None) _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - # _metadata: Optional[dict] = field(default=None) msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) """Incoming Pub/Sub message object.""" _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) - # _metadata: Optional[dict] = field(default=None) - # bad_request: Union[bool, tuple[str, int]] = field(default=False) @classmethod - def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this + def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" return cls(msg=msg, schema_name=schema_name) @@ -145,15 +141,13 @@ def from_msg(cls, msg, schema_name=str()): # [TODO] update tom_desc to use this def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": # check whether received message is valid, as suggested by Cloud Run docs if not envelope: - # return cls(bad_request=("Bad Request: no Pub/Sub message received", 400)) raise BadRequest("Bad Request: no Pub/Sub message received") if not isinstance(envelope, dict) or "message" not in envelope: - # return cls(bad_request=("Bad Request: invalid Pub/Sub message format", 400)) raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( msg=_PubsubMessageLike( - # data is required. the rest should be present in the message, but let's be lenient + # this class requires data. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), message_id=envelope["message"].get("message_id"), @@ -169,20 +163,6 @@ def from_path(cls, path, schema_name=str()) -> "Alert": bytes = f.read() return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) - # @property - # def bytes(self) -> bytes: - # """Message payload in original format (Avro or JSON serialized bytes).""" - # if self._bytes is None: - # # add try-except when we know what we're looking for - # self._bytes = self.msg.data - # if self._bytes is None: - # # if we add a "path" attribute for the path to an avro file on disk - # # we can load it like this: - # # with open(self.path, "rb") as f: - # # self._bytes = f.read() - # pass - # return self._bytes - @property def alertid(self) -> Union[str, int]: """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" @@ -309,27 +289,3 @@ def schema_map(self) -> dict: path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map - - # @property - # def metadata(self) -> dict: - # """Pub/Sub message metadata. - - # Includes - - # - message_id, publish_time, and ordering_key* of the incoming Pub/Sub message - # - attributes, which is a dict that typically includes the attributes of the - # incoming message and possibly additional entries added by the user in the meantime. - - # *To be useful, ordering_key requires that ordering is enabled on the subscription. 
- # """ - # if self._metadata is None: - # self._metadata = { - # "message_id": self.msg.message_id, - # "publish_time": self.msg.publish_time, - # # ordering must be enabled on the subscription for this to be useful - # "ordering_key": self.msg.ordering_key, - # # [TODO] breaking change. attributes is now a dict. open a pr on tom_desc - # # typically includes self.msg.attributes plus additional items added by the user - # "attributes": self.attributes, - # } - # return self._metadata From d415127227320890a05164fb7c9f7240560a96ab Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 19:22:33 -0700 Subject: [PATCH 22/55] reorganize methods, properties, etc. --- pittgoogle/alert.py | 137 ++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 67 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index fd4b23e..18115f6 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -122,16 +122,17 @@ class Alert: `Alert` may not be available. """ - _dict: Optional[dict] = field(default=None) + msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) - """Incoming Pub/Sub message object.""" + _dict: Optional[dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: str = field(factory=str, converter=str.lower) _schema_map: Optional[dict] = field(default=None) + # ---- class methods ---- # @classmethod def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" @@ -163,60 +164,18 @@ def from_path(cls, path, schema_name=str()) -> "Alert": bytes = f.read() return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + # ---- properties ---- # @property - def alertid(self) -> Union[str, int]: - """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" - return self.get("alertid", self.sourceid) - - @property - def sourceid(self) -> Union[str, int]: - """Convenience property for the source ID. The "source" is the detection that triggered the alert.""" - return self.get("sourceid") - - @property - def objectid(self) -> Union[str, int]: - """Convenience property for the object ID. 
The "object" represents a collection of sources, as determined by the survey.""" - return self.get("objectid") - - def get(self, key: str, default: Optional[str] = None): - # if key is found in self.dict, just return the corresponding value - if key in self.dict: - return self.dict.get(key) - - # lookup the key in the schema map - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return self.dict.get(survey_key) - - if not isinstance(survey_key, list): - return default - - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) - - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) - - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) - - def get_key(self, key, name_only: bool = True): - if key in self.dict: - return key - - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return survey_key - - if not isinstance(survey_key, list): - return - - if name_only: - return survey_key[-1] + def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: + """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - return survey_key + If None, this will be set to `self.msg.attributes`. + Update as desired. + When publishing, this will be sent as the message attributes. + """ + if self._attributes is None: + self._attributes = self.msg.attributes + return self._attributes @property def dict(self) -> dict: @@ -253,18 +212,6 @@ def dict(self) -> dict: raise OpenAlertError("failed to deserialize the alert bytes") return self._dict - @property - def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]: - """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". - - If None, this will be set to `self.msg.attributes`. - Update as desired. - When publishing, this will be sent as the message attributes. - """ - if self._attributes is None: - self._attributes = self.msg.attributes - return self._attributes - @property def dataframe(self) -> "pd.DataFrame": if self._dataframe is None: @@ -280,6 +227,21 @@ def dataframe(self) -> "pd.DataFrame": return self._dataframe + @property + def alertid(self) -> Union[str, int]: + """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`.""" + return self.get("alertid", self.sourceid) + + @property + def objectid(self) -> Union[str, int]: + """Convenience property for the object ID. The "object" represents a collection of sources, as determined by the survey.""" + return self.get("objectid") + + @property + def sourceid(self) -> Union[str, int]: + """Convenience property for the source ID. 
The "source" is the detection that triggered the alert.""" + return self.get("sourceid") + @property def schema_map(self) -> dict: if self._schema_map is None: @@ -289,3 +251,44 @@ def schema_map(self) -> dict: path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" self._schema_map = yaml.safe_load(path.read_text()) return self._schema_map + + # ---- methods ---- # + def get(self, key: str, default: Optional[str] = None): + # if key is found in self.dict, just return the corresponding value + if key in self.dict: + return self.dict.get(key) + + # lookup the key in the schema map + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return self.dict.get(survey_key) + + if not isinstance(survey_key, list): + return default + + if len(survey_key) == 1: + return self.dict.get(survey_key[0]) + + if len(survey_key) == 2: + return self.dict.get(survey_key[0]).get(survey_key[1]) + + if len(survey_key) == 3: + return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + + def get_key(self, key, name_only: bool = True): + if key in self.dict: + return key + + survey_key = self.schema_map.get(key) # str or list[str] + + if isinstance(survey_key, str): + return survey_key + + if not isinstance(survey_key, list): + return + + if name_only: + return survey_key[-1] + + return survey_key From b1ef685d60481b41fa86f6b68cc6cb173074111c Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 22:42:31 -0700 Subject: [PATCH 23/55] cleanup imports --- pittgoogle/__init__.py | 7 ++++++- pittgoogle/alert.py | 8 +++++--- pittgoogle/pubsub.py | 18 +++++------------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 47238f1..82859bc 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -9,7 +9,12 @@ except ImportError: # for Python<3.8 import importlib_metadata as metadata -from . import auth, bigquery, exceptions, pubsub, utils +from .auth import Auth +from .alert import Alert +from .bigquery import Table +from .pubsub import Topic, Subscription, Consumer +from . import auth, alert, bigquery, exceptions, pubsub, utils + __version__ = metadata.version("pittgoogle-client") diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 18115f6..877ca28 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -57,14 +57,14 @@ def index(): import fastavro import yaml from attrs import define, field -from google.cloud import pubsub_v1 from .exceptions import BadRequest, OpenAlertError from .utils import Cast if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 import google._upb._message + import google.cloud.pubsub_v1 + import google.protobuf.timestamp_pb2 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run @@ -122,7 +122,9 @@ class Alert: `Alert` may not be available. 
""" - msg: Optional[Union["pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field(default=None) + msg: Optional[Union["google.cloud.pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field( + default=None + ) """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index f0430c0..3e21293 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -83,23 +83,15 @@ def my_batch_callback(results): import queue from concurrent.futures import ThreadPoolExecutor from time import sleep -from typing import TYPE_CHECKING, Any, Callable, List, Optional, Union +from typing import Any, Callable, List, Optional, Union import fastavro -import yaml -from attrs import converters, define, field +import google.cloud.pubsub_v1 as pubsub_v1 +from attrs import define, field from attrs.validators import gt, instance_of, is_callable, optional from google.api_core.exceptions import NotFound -from google.cloud import pubsub_v1 -from . import Auth, Alert -from .exceptions import OpenAlertError -from .utils import Cast - -if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 - import google._upb._message - import pandas as pd +from . import Alert, Auth LOGGER = logging.getLogger(__name__) @@ -578,5 +570,5 @@ class Response: If there is no batch callback the results will be lost. """ - ack: bool = field(default=True, converter=converters.to_bool) + ack: bool = field(default=True, converter=bool) result: Any = field(default=None) From 73bf3aa0727a49b8eed32ed9ce67c5a07b15c4f3 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sat, 16 Sep 2023 22:46:57 -0700 Subject: [PATCH 24/55] bugfix last --- pittgoogle/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 82859bc..0a4b9ab 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -11,7 +11,6 @@ from .auth import Auth from .alert import Alert -from .bigquery import Table from .pubsub import Topic, Subscription, Consumer from . import auth, alert, bigquery, exceptions, pubsub, utils From c7a7033aa3b3c66f814c9ec50a85de22c7b7152b Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:46:01 -0700 Subject: [PATCH 25/55] add types_ module with Schema, PubsubMessageLike --- pittgoogle/alert.py | 33 ++++------------ pittgoogle/types_.py | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 25 deletions(-) create mode 100644 pittgoogle/types_.py diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 877ca28..95c6056 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -72,25 +72,6 @@ def index(): PACKAGE_DIR = importlib.resources.files(__package__) -@define(frozen=True) -class _PubsubMessageLike: - """Container for an incoming Pub/Sub message that mimics a `pubsub_v1.types.PubsubMessage`. - - It is convenient for the `Alert` class to work with a message as a - `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do - not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or - an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class - is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`. - This object is then assigned to the `msg` attribute of the `Alert`. 
- """ - - data: bytes = field() - attributes: dict = field(factory=dict) - message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) - ordering_key: Optional[str] = field(default=None) - - @define(kw_only=True) class Alert: """Pitt-Google container for an astronomical alert. @@ -122,9 +103,9 @@ class Alert: `Alert` may not be available. """ - msg: Optional[Union["google.cloud.pubsub_v1.types.PubsubMessage", _PubsubMessageLike]] = field( - default=None - ) + msg: Optional[ + Union["google.cloud.pubsub_v1.types.PubsubMessage", types_.PubsubMessageLike] + ] = field(default=None) """Incoming Pub/Sub message object.""" _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( default=None @@ -149,7 +130,7 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": raise BadRequest("Bad Request: invalid Pub/Sub message format") return cls( - msg=_PubsubMessageLike( + msg=types_.PubsubMessageLike( # this class requires data. the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), @@ -163,8 +144,10 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": @classmethod def from_path(cls, path, schema_name=str()) -> "Alert": with open(path, "rb") as f: - bytes = f.read() - return cls(msg=_PubsubMessageLike(data=bytes), schema_name=schema_name) + bytes_ = f.read() + return cls( + msg=types_.PubsubMessageLike(data=bytes_), schema_name=schema_name, path=Path(path) + ) # ---- properties ---- # @property diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py new file mode 100644 index 0000000..58f2ada --- /dev/null +++ b/pittgoogle/types_.py @@ -0,0 +1,94 @@ +# -*- coding: UTF-8 -*- +"""Functions to support working with alerts and related data.""" +import importlib.resources +import json +import logging +from base64 import b64decode, b64encode +from collections import OrderedDict +from io import BytesIO +from typing import TYPE_CHECKING, Optional + +import fastavro +import pandas as pd +import yaml +from astropy.table import Table +from astropy.time import Time +from attrs import define, field + +if TYPE_CHECKING: + import google.protobuf.timestamp_pb2 + from pathlib import Path + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(kw_only=True) +class Schema: + """Class for an individual schema. + + This class is not intended to be used directly. Instead, get a schema from the registry: + `pittgoogle.registry.Schemas`. + """ + + name: str = field() + description: str = field() + path: Optional["Path"] = field(default=None) + _map: Optional[dict] = field(default=None, init=False) + _avsc: Optional[dict] = field(default=None, init=False) + + @property + def survey(self) -> str: + """Name of the survey. 
This is the first block (separated by ".") in the schema's name."""
+        return self.name.split(".")[0]
+
+    @property
+    def definition(self) -> str:
+        """Pointer (e.g., URL) to the survey's schema definition."""
+        return self.map["SURVEY_SCHEMA"]
+
+    @property
+    def map(self) -> dict:
+        """Mapping of Pitt-Google's generic field names to survey-specific field names."""
+        if self._map is None:
+            yml = PACKAGE_DIR / f"schemas/maps/{self.survey}.yml"
+            try:
+                self._map = yaml.safe_load(yml.read_text())
+            except FileNotFoundError:
+                raise ValueError(f"no schema map found for schema name '{self.name}'")
+        return self._map
+
+    @property
+    def avsc(self) -> Optional[dict]:
+        """The Avro schema loaded from the file at `self.path`, or None if a valid file cannot be found."""
+        # if the schema has already been loaded, return it
+        if self._avsc is not None:
+            return self._avsc
+
+        # if self.path does not point to an existing avro schema file, return None
+        if (self.path is None) or (self.path.suffix != ".avsc") or (not self.path.is_file()):
+            return None
+
+        # load the schema and return it
+        self._avsc = fastavro.schema.load_schema(self.path)
+        return self._avsc
+
+
+@define(frozen=True)
+class PubsubMessageLike:
+    """Container for an incoming Pub/Sub message that mimics a `google.cloud.pubsub_v1.types.PubsubMessage`.
+
+    It is convenient for the :class:`pittgoogle.Alert` class to work with a message as a
+    `pubsub_v1.types.PubsubMessage`. However, there are many ways to obtain an alert that do
+    not result in a `pubsub_v1.types.PubsubMessage` (e.g., an alert packet loaded from disk or
+    an incoming message to a Cloud Functions or Cloud Run module). In those cases, this class
+    is used to create an object with the same attributes as a `pubsub_v1.types.PubsubMessage`.
+    This object is then assigned to the `msg` attribute of the `Alert`.
+    """
+
+    data: bytes = field()
+    attributes: dict = field(factory=dict)
+    message_id: Optional[str] = field(default=None)
+    publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None)
+    ordering_key: Optional[str] = field(default=None)

From 65aa7cc7a4652ed3075fd6483e653332e2c3b4b0 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 02:47:21 -0700
Subject: [PATCH 26/55] add registry module with ProjectIds, Schemas

---
 pittgoogle/registry.py | 89 ++++++++++++++++++++++++++++++++++++++++++
 pittgoogle/utils.py    | 18 ---------
 2 files changed, 89 insertions(+), 18 deletions(-)
 create mode 100644 pittgoogle/registry.py

diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py
new file mode 100644
index 0000000..ba2a62a
--- /dev/null
+++ b/pittgoogle/registry.py
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+"""Pitt-Google registries."""
+import importlib.resources
+import logging
+from typing import ClassVar
+
+from attrs import define
+
+from . 
import types_ +from .exceptions import SchemaNotFoundError + + +LOGGER = logging.getLogger(__name__) +PACKAGE_DIR = importlib.resources.files(__package__) + + +@define(frozen=True) +class ProjectIds: + """Registry of Google Cloud Project IDs.""" + + pittgoogle: ClassVar[str] = "ardent-cycling-243415" + """Pitt-Google's production project.""" + + pittgoogle_dev: ClassVar[str] = "avid-heading-329016" + """Pitt-Google's development project.""" + + # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" + # """Pitt-Google's billing project.""" + + elasticc: ClassVar[str] = "elasticc-challenge" + """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" + + +@define(frozen=True) +class Schemas: + """Registry of schemas used by Pitt-Google.""" + + # dict defining the schemas in the registry + # naming conventions: + # - schema names are expected to start with the name of the survey + # - if the survey has more than one schema, the survey name should be followed by a ".", + # followed by schema-specific specifier(s) + # - if an avro schema file is being registered with the schema (using the `path` arg), it is + # recommended that the file have the same name (path stem) as the schema. the file name + # must end with ".avsc". + dict: ClassVar[dict] = { + "elasticc.v0_9_1.alert": types_.Schema( + name="elasticc.v0_9_1.alert", + description="Avro schema of alerts published by ELAsTiCC.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + ), + "elasticc.v0_9_1.brokerClassification": types_.Schema( + name="elasticc.v0_9_1.brokerClassification", + description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + ), + "ztf": types_.Schema( + name="ztf", + description=( + "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " + "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " + "both cases." # [TODO] + ), + path=None, + ), + } + """Dict defining the schemas in the registry.""" + + @classmethod + def names(cls) -> list[str]: + """Return the names of all registered schemas.""" + return list(cls.dict.keys()) + + @classmethod + def get(cls, schema_name: str) -> types_.Schema: + """Return the registered schema called `schema_name`. + + Raises + ------ + :class:`pittgoogle.exceptions.SchemaNotFoundError` + if a schema called `schema_name` is not found + """ + # if there is no registered schema with this name, raise an error + schema = cls.dict.get(schema_name) + if schema is None: + raise SchemaNotFoundError( + f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." 
+ ) + return schema diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index d2a77ef..f185135 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -5,7 +5,6 @@ from base64 import b64decode, b64encode from collections import OrderedDict from io import BytesIO -from typing import ClassVar import fastavro import pandas as pd @@ -16,23 +15,6 @@ LOGGER = logging.getLogger(__name__) -@define -class ProjectIds: - """Registry of Google Cloud Project IDs.""" - - pittgoogle: ClassVar[str] = "ardent-cycling-243415" - """Pitt-Google's production project.""" - - pittgoogle_dev: ClassVar[str] = "avid-heading-329016" - """Pitt-Google's development project.""" - - # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" - # """Pitt-Google's billing project.""" - - elasticc: ClassVar[str] = "elasticc-challenge" - """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" - - @define class Cast: """Methods to convert data types.""" From 16e3ffaf44555489a763b823d739c2f44b34e82b Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:48:13 -0700 Subject: [PATCH 27/55] define SchemaNotFoundError --- pittgoogle/exceptions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pittgoogle/exceptions.py b/pittgoogle/exceptions.py index a28eef6..9ef37f7 100644 --- a/pittgoogle/exceptions.py +++ b/pittgoogle/exceptions.py @@ -5,3 +5,7 @@ class BadRequest(Exception): class OpenAlertError(Exception): """Raised when unable to deserialize a Pub/Sub message payload.""" + + +class SchemaNotFoundError(Exception): + """Raised when a schema with a given name is not found in the registry.""" From 8bdd621ef54add025ce2f535a40b3eb9fbcd0ec5 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:58:17 -0700 Subject: [PATCH 28/55] use the new Schema class --- pittgoogle/alert.py | 64 +++++++++++++++++++++++++++++++-------------- 1 file changed, 44 insertions(+), 20 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 95c6056..c9b7876 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -112,8 +112,8 @@ class Alert: ) _dict: Optional[dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) - schema_name: str = field(factory=str, converter=str.lower) - _schema_map: Optional[dict] = field(default=None) + schema_name: Optional[str] = field(default=None) + _schema: Optional[types_.Schema] = field(default=None, init=False) # ---- class methods ---- # @classmethod @@ -174,19 +174,33 @@ def dict(self) -> dict: if self._dict is not None: return self._dict - if self.schema_name.startswith("elasticc"): - # self.msg.data is avro and schemaless. load the schema, then convert the bytes to a dict - schemapath = PACKAGE_DIR / f"schemas/elasticc/{self.schema_name}.avsc" - schema = fastavro.schema.load_schema(schemapath) + # deserialize self.msg.data (avro or json bytestring) into a dict. + # if self.msg.data is either (1) json; or (2) avro that contains the schema in the header, + # self.schema is not required for deserialization, so we want to be lenient. + # if self.msg.data is schemaless avro, deserialization requires self.schema.avsc to exist. + # currently, there is a clean separation between surveys: + # elasticc always requires self.schema.avsc; ztf never does. + # we'll check the survey name from self.schema.survey; but first we need to check whether + # the schema exists so we can try to continue without one instead of raising an error. + # we may want or need to handle this differently in the future. 
+ try: + self.schema + except SchemaNotFoundError as exc: + LOGGER.warning(f"schema not found. attempting to deserialize without it. {exc}") + avro_schema = None + else: + if self.schema.survey in ["elasticc"]: + avro_schema = self.schema.avsc + else: + avro_schema = None + + # if we have an avro schema, use it to deserialize and return + if avro_schema: with io.BytesIO(self.msg.data) as fin: - self._dict = fastavro.schemaless_reader(fin, schema) + self._dict = fastavro.schemaless_reader(fin, avro_schema) return self._dict - if self.schema_name == "": - LOGGER.warning("no alert schema_name provided. attempting to deserialize without it.") - - # assume this is a ztf or ztf-lite alert - # this should be rewritten to catch specific errors + # [TODO] this should be rewritten to catch specific errors # for now, just try avro then json, catching basically all errors in the process try: self._dict = Cast.avro_to_dict(self.msg.data) @@ -228,14 +242,24 @@ def sourceid(self) -> Union[str, int]: return self.get("sourceid") @property - def schema_map(self) -> dict: - if self._schema_map is None: - if self.schema_name == str(): - raise TypeError("no alert schema_name provided. unable to load schema map.") - survey = self.schema_name.split(".")[0] - path = PACKAGE_DIR / f"schemas/maps/{survey}.yml" - self._schema_map = yaml.safe_load(path.read_text()) - return self._schema_map + def schema(self) -> types_.Schema: + """Loads the schema from the registry :class:`pittgoogle.registry.Schemas`. + + Raises + ------ + :class:`pittgoogle.exceptions.SchemaNotFoundError` + if the `schema_name` is not supplied or a schema with this name is not found + """ + if self._schema is not None: + return self._schema + + # need to load the schema. raise an error if no schema_name given + if self.schema_name is None: + raise SchemaNotFoundError("a schema_name is required") + + # this also may raise SchemaNotFoundError + self._schema = registry.Schemas.get(self.schema_name) + return self._schema # ---- methods ---- # def get(self, key: str, default: Optional[str] = None): From afb43f936013956edbee311a39fbbfdf10078e93 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 02:59:58 -0700 Subject: [PATCH 29/55] refactor Alert.get and add Alert.get_key --- pittgoogle/alert.py | 101 ++++++++++++++++++++++++++++++++------------ 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index c9b7876..d9e5334 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -262,42 +262,87 @@ def schema(self) -> types_.Schema: return self._schema # ---- methods ---- # - def get(self, key: str, default: Optional[str] = None): - # if key is found in self.dict, just return the corresponding value - if key in self.dict: - return self.dict.get(key) - - # lookup the key in the schema map - survey_key = self.schema_map.get(key) # str or list[str] - - if isinstance(survey_key, str): - return self.dict.get(survey_key) + def get(self, field: str, default: Any = None) -> Any: + """Return the value of `field` in this alert. + + The keys in the alert dictionary :attr:`pittgoogle.alert.Alert.dict` are survey-specific field names. + This method allows you to `get` values from the dict using generic names that will work across + surveys. `self.schema.map` is the mapping of generic -> survey-specific names. + To access a field using a survey-specific name, get it directly from the alert `dict`. + + Parameters + ---------- + field : str + Name of a field in the alert's schema. 
This must be one of the keys in the dict `self.schema.map`. + default : str or None + Default value to be returned if the field is not found. + + Returns + ------- + value : any + Value in the :attr:`pittgoogle.alert.Alert.dict` corresponding to this field. + """ + survey_field = self.schema.map.get(field) # str, list[str], or None - if not isinstance(survey_key, list): + if survey_field is None: return default - if len(survey_key) == 1: - return self.dict.get(survey_key[0]) + if isinstance(survey_field, str): + return self.dict.get(survey_field, default) - if len(survey_key) == 2: - return self.dict.get(survey_key[0]).get(survey_key[1]) + # if survey_field is not one of the expected types, the schema map is malformed + # maybe this was intentional, but we don't know how to handle it here + if not isinstance(survey_field, list): + raise TypeError( + f"field lookup not implemented for a schema-map value of type {type(survey_field)}" + ) - if len(survey_key) == 3: - return self.dict.get(survey_key[0]).get(survey_key[1]).get(survey_key[2]) + # the list must have more than 1 item, else it would be a single str + if len(survey_field) == 2: + try: + return self.dict[survey_field[0]][survey_field[1]] + except KeyError: + return default - def get_key(self, key, name_only: bool = True): - if key in self.dict: - return key + if len(survey_field) == 3: + try: + return self.dict[survey_field[0]][survey_field[1]][survey_field[2]] + except KeyError: + return default - survey_key = self.schema_map.get(key) # str or list[str] + raise NotImplementedError( + f"field lookup not implemented for depth {len(survey_field)} (key = {survey_field})" + ) - if isinstance(survey_key, str): - return survey_key + def get_key( + self, field: str, name_only: bool = False, default: Optional[str] = None + ) -> Optional[Union[str, list[str]]]: + """Return the survey-specific field name. + + Parameters + ---------- + field : str + Generic field name whose survey-specific name is to be returned. This must be one of the + keys in the dict `self.schema.map`. + name_only : bool + In case the survey-specific field name is nested below the top level, whether to return + just the single final name as a str (True) or the full path as a list[str] (False). + default : str or None + Default value to be returned if the field is not found. + + Returns + ------- + survey_field : str or list[str] + Survey-specific name for the `field`, or `default` if the field is not found. + list[str] if this is a nested field and `name_only` is False, else str with the + final field name only. 
+ """ + survey_field = self.schema.map.get(field) # str, list[str], or None - if not isinstance(survey_key, list): - return + if survey_field is None: + return default - if name_only: - return survey_key[-1] + if name_only and isinstance(survey_field, list): + return survey_field[-1] - return survey_key + return survey_field From 14dd75784699978866b9133009cf54417d66cd4e Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:00:42 -0700 Subject: [PATCH 30/55] add method add_id_attributes --- pittgoogle/alert.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d9e5334..3085cec 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -262,6 +262,22 @@ def schema(self) -> types_.Schema: return self._schema # ---- methods ---- # + def add_id_attributes(self) -> None: + """Add the IDs to the attributes.""" + ids = ["alertid", "objectid", "sourceid"] + values = [self.get(id) for id in ids] + + # get the survey-specific field names + survey_names = [self.get_key(id) for id in ids] + # if the field is nested, the key will be a list + # but pubsub message attributes must be strings. join to avoid a future error on publish + names = [".".join(id) if isinstance(id, list) else id for id in survey_names] + + # only add to attributes if the survey has defined this field + for idname, idvalue in zip(names, values): + if idname is not None: + self.attributes[idname] = idvalue + def get(self, field: str, default: Any = None) -> Any: """Return the value of `field` in this alert. From 73a3e74b6e3fd6b0e9b433d912ca05e36f8855f8 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:02:52 -0700 Subject: [PATCH 31/55] update Alert.from_* methods --- pittgoogle/alert.py | 56 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 50 insertions(+), 6 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 3085cec..fce3a3e 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -114,15 +114,41 @@ class Alert: _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: Optional[str] = field(default=None) _schema: Optional[types_.Schema] = field(default=None, init=False) + path: Optional[Path] = field(default=None) # ---- class methods ---- # @classmethod - def from_msg(cls, msg, schema_name=str()) -> "Alert": # [TODO] update tom_desc to use this - """Create an `Alert` from a `pubsub_v1.types.PubsubMessage`.""" - return cls(msg=msg, schema_name=schema_name) + def from_cloud_run(cls, envelope: dict, schema_name: Optional[str] = None) -> "Alert": + """Create an `Alert` from an HTTP request envelope containing a Pub/Sub message, as received by a Cloud Run module. - @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert": + Example code for a Cloud Run module that uses this method to open a ZTF alert: + + .. 
code-block:: python
+
+            import pittgoogle
+            # flask is used to work with HTTP requests, which trigger Cloud Run modules
+            # the request contains the Pub/Sub message, which contains the alert packet
+            import flask
+
+            app = flask.Flask(__name__)
+
+            # function that receives the request
+            @app.route("/", methods=["POST"])
+            def index():
+
+                try:
+                    # unpack the alert
+                    # if the request does not contain a valid message, this raises a `BadRequest`
+                    alert = pittgoogle.Alert.from_cloud_run(envelope=flask.request.get_json(), schema_name="ztf")
+
+                except pittgoogle.exceptions.BadRequest as exc:
+                    # return the error text and an HTTP 400 Bad Request code
+                    return str(exc), 400
+
+                # continue processing the alert
+                # when finished, return an empty string and an HTTP success code
+                return "", 204
+        """
         # check whether received message is valid, as suggested by Cloud Run docs
         if not envelope:
             raise BadRequest("Bad Request: no Pub/Sub message received")
@@ -142,7 +168,25 @@ def from_cloud_run(cls, envelope: dict, schema_name: str = str()) -> "Alert":
         )
 
     @classmethod
-    def from_path(cls, path, schema_name=str()) -> "Alert":
+    def from_dict(
+        cls,
+        payload: dict,
+        attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = None,
+        schema_name: Optional[str] = None,
+    ) -> "Alert":  # [TODO] update tom_desc to use this
+        """Create an `Alert` from a dictionary (`payload`)."""
+        return cls(dict=payload, attributes=attributes, schema_name=schema_name)
+
+    @classmethod
+    def from_msg(
+        cls, msg: "google.cloud.pubsub_v1.types.PubsubMessage", schema_name: Optional[str] = None
+    ) -> "Alert":  # [TODO] update tom_desc to use this
+        """Create an `Alert` from a `google.cloud.pubsub_v1.types.PubsubMessage`."""
+        return cls(msg=msg, schema_name=schema_name)
+
+    @classmethod
+    def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) -> "Alert":
+        """Create an `Alert` from the file at `path`."""
         with open(path, "rb") as f:
             bytes_ = f.read()
         return cls(

From 6b244cfbb03fe2398e7068ec0135d3a5f8231957 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:04:59 -0700
Subject: [PATCH 32/55] make Alert.attributes dict only

---
 pittgoogle/alert.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py
index fce3a3e..0c7b93f 100644
--- a/pittgoogle/alert.py
+++ b/pittgoogle/alert.py
@@ -195,15 +195,17 @@ def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) ->
 
     # ---- properties ---- #
     @property
-    def attributes(self) -> Union[dict, "google._upb._message.ScalarMapContainer"]:
+    def attributes(self) -> dict:
         """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes".
 
-        If None, this will be set to `self.msg.attributes`.
-        Update as desired.
-        When publishing, this will be sent as the message attributes.
+        If this was not set when the `Alert` was instantiated, a new dictionary will be created using
+        the `attributes` field in :attr:`pittgoogle.Alert.msg` the first time it is requested.
+        Update this dictionary as desired (it will not affect the original `msg`).
+        When publishing the alert using :attr:`pittgoogle.Topic.publish`, this dictionary will be
+        sent as the Pub/Sub message attributes.
""" if self._attributes is None: - self._attributes = self.msg.attributes + self._attributes = dict(self.msg.attributes) return self._attributes @property From 25f1136e56399ea3030d9a449fcdcef5ed0c4323 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:06:00 -0700 Subject: [PATCH 33/55] clean up Alert.dataframe --- pittgoogle/alert.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 0c7b93f..3065e98 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -259,17 +259,14 @@ def dict(self) -> dict: @property def dataframe(self) -> "pd.DataFrame": - if self._dataframe is None: - import pandas as pd # lazy-load pandas. it hogs memory on cloud functions and run + if self._dataframe is not None: + return self._dataframe - if self.schema_name.endswith(".lite"): - src_df = pd.DataFrame(self.dict["source"], index=[0]) - prvs_df = pd.DataFrame(self.dict["prv_sources"]) - else: - src_df = pd.DataFrame(self.dict[self.schema_map["source"]], index=[0]) - prvs_df = pd.DataFrame(self.dict[self.schema_map["prv_sources"]]) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run + src_df = pd.DataFrame(self.get("source"), index=[0]) + prvs_df = pd.DataFrame(self.get("prv_sources")) + self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) return self._dataframe @property From f7100d392b54ff0efadff1e87bca1623561d053d Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:10:18 -0700 Subject: [PATCH 34/55] use the new Schema class --- pittgoogle/pubsub.py | 47 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 3e21293..61b697e 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -242,19 +242,48 @@ def delete(self) -> None: else: LOGGER.info(f"deleted topic: {self.path}") - def publish(self, alert: "Alert", format="json") -> int: - """Publish the `alert.dict` in the requested `format`, attaching the `alert.attributes`. + def publish(self, alert: "Alert") -> int: + """Publish a message with `alert.dict` as the payload and `alert.attributes` as the attributes. - `format` can be "json" or a schema name. + If the `alert` has an elasticc schema, the payload will be serialized as schemaless Avro. + Otherwise, json will be used. """ - if format == "json": - message = json.dumps(alert.dict).encode("utf-8") + # we need to decide which format to use: json, avro with schema, or avro without schema + # the format that pitt-google currently (2023-09-23) uses to publish messages depends on the stream: + # - consumer modules pass on the original alert data packet, as produced by the survey. + # they do not need to use this method (in fact, the consumers do not even use python), + # so we can ignore this case. + # - all other broker pipeline modules (Pitt-Google-Broker repo) use json. + # - modules in the pittgoogle-user repo publish classifications for elasticc, and thus + # use schemaless avro. + # at some point, we should re-evaluate the broker pipeline in particular. + # + # for now, we will get close enough to the current behavior if we assume that: + # - elasticc messages should be published as schemaless avro + # - else, we should publish a json message + # this will match the current behavior in all cases except the elasticc broker pipeline modules. 
+        # neither broker pipeline uses pittgoogle-client at this time (they use pgb-broker-utils),
+        # so we don't need to update or accommodate them yet.
+        #
+        # we'll get the survey name from self.schema.survey, but first we should check whether the
+        # schema exists so we can be lenient and just fall back to json instead of raising an error.
+        try:
+            alert.schema
+        except SchemaNotFoundError:
+            avro_schema = None
+        else:
+            if alert.schema.survey in ["elasticc"]:
+                avro_schema = alert.schema.avsc
+            else:
+                avro_schema = None
 
-        if format == "json":
-            message = json.dumps(alert.dict).encode("utf-8")
+        if not avro_schema:
+            # serialize using json
+            message = json.dumps(alert.dict).encode("utf-8")
+        else:
+            # serialize as schemaless avro
-        elif format.startswith("elasticc"):
-            # load the avro schema and use it to serialize alert.dict
-            schema = fastavro.schema.load_schema(PACKAGE_DIR / f"schemas/elasticc/{format}.avsc")
             fout = io.BytesIO()
-            fastavro.schemaless_writer(fout, schema, alert.dict)
+            fastavro.schemaless_writer(fout, avro_schema, alert.dict)
             fout.seek(0)
             message = fout.getvalue()

From 0258696fdfe4a32c2aa935d03620bf0aade49fc1 Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:12:07 -0700
Subject: [PATCH 35/55] add minor conveniences

---
 pittgoogle/pubsub.py | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index 61b697e..a08e99d 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -177,9 +177,38 @@ class Topic:
     )
 
     @classmethod
-    def from_cloud(cls, name, *, projectid, testid=False):
-        """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`)."""
-        # if testid is not False, "False", or None, append the testid to the name
+    def from_cloud(
+        cls,
+        name: str,
+        *,
+        projectid: str,
+        survey: Optional[str] = None,
+        testid: Optional[str] = None,
+    ):
+        """Create a `Topic` with a `client` using implicit credentials (no explicit `auth`).
+
+        Parameters
+        ----------
+        name : `str`
+            Name of the topic. If `survey` and/or `testid` are provided, they will be added to this
+            name following the Pitt-Google naming syntax.
+        projectid : `str`
+            Project ID of the Google Cloud project that owns this resource. Project IDs used by
+            Pitt-Google are listed in the registry for convenience (:class:`pittgoogle.registry.ProjectIds`).
+            Required because it cannot be retrieved from the `client` and there is no explicit `auth`.
+        survey : `str`, optional
+            Name of the survey. If provided, it will be prepended to `name` following the
+            Pitt-Google naming syntax.
+        testid : `str`, optional
+            Pipeline identifier. If this is not `None`, `False`, or `"False"` it will be appended to
+            the `name` following the Pitt-Google naming syntax. This is used to allow pipeline modules
+            to find the correct resources without interfering with other pipelines that may have
+            deployed resources with the same base names (e.g., for development and testing purposes).
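+
+        For example, a minimal sketch (the project ID and resource names here are hypothetical):
+
+        .. code-block:: python
+
+            # following the Pitt-Google naming syntax, this resolves to the
+            # full topic name "ztf-alerts-mytest"
+            topic = pittgoogle.Topic.from_cloud(
+                "alerts", projectid="my-project-id", survey="ztf", testid="mytest"
+            )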
+ """ + # if survey and/or testid passed in, use them to construct full name using the pitt-google naming syntax + if survey is not None: + name = f"{survey}-{name}" + # must accommodate False and "False" for consistency with the broker pipeline if testid and testid != "False": name = f"{name}-{testid}" return cls(name, projectid=projectid, client=pubsub_v1.PublisherClient()) @@ -287,8 +316,8 @@ def publish(self, alert: "Alert") -> int: fout.seek(0) message = fout.getvalue() - # attribute keys and values must be strings - attributes = {str(key): str(val) for key, val in alert.attributes.items()} + # attribute keys and values must be strings. let's sort the keys while we're at it + attributes = {str(key): str(alert.attributes[key]) for key in sorted(alert.attributes)} future = self.client.publish(self.path, data=message, **attributes) return future.result() From 249d96184aa0a251173e6b05b9983829fb2801e6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:12:30 -0700 Subject: [PATCH 36/55] clean up imports, docs, and comments --- pittgoogle/alert.py | 56 ++++++++++++++------------------------------ pittgoogle/pubsub.py | 14 +++++++---- 2 files changed, 27 insertions(+), 43 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 3065e98..4c94a6d 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -17,34 +17,6 @@ path = "path/to/ztf_alert.avro" # point this to a file containing an alert alert = pittgoogle.Alert.from_path(path, schema_name="ztf") -Load a ZTF alert from a Pub/Sub message that has triggered a Cloud Run module: - -.. code-block:: python - - import pittgoogle - # flask is used to work with HTTP requests, which trigger Cloud Run modules - # the request contains the Pub/Sub message, which contains the alert packet - from flask import Flask, request - - app = Flask(__name__) - - # function that receives the request - @app.route("/", methods=["POST"]) - def index(): - - try: - # unpack the alert - # if the request does not contain a valid message, this raises a `BadRequest` - alert = pittgoogle.Alert.from_cloud_run(envelope=request.get_json(), schema_name="ztf") - - except pg.exceptions.BadRequest as err: - # return the error text and an HTTP 400 Bad Request code - return err.text, 400 - - # continue processing the alert - # when finished, return an empty string and an HTTP success code - return "", 204 - API ---- @@ -52,19 +24,19 @@ def index(): import importlib.resources import io import logging -from typing import TYPE_CHECKING, Optional, Union +from pathlib import Path +from typing import Any, TYPE_CHECKING, Optional, Union import fastavro -import yaml from attrs import define, field -from .exceptions import BadRequest, OpenAlertError +from . import registry, types_ +from .exceptions import BadRequest, OpenAlertError, SchemaNotFoundError from .utils import Cast if TYPE_CHECKING: import google._upb._message import google.cloud.pubsub_v1 - import google.protobuf.timestamp_pb2 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run @@ -76,8 +48,7 @@ def index(): class Alert: """Pitt-Google container for an astronomical alert. - Alerts are typically loaded from a Pub/Sub message but may also be loaded from a file. - It is recommended to instantiate an `Alert` using one of the `from_*` methods. + Recommended to instantiate using one of the `from_*` methods. All parameters are keyword only. @@ -210,7 +181,7 @@ def attributes(self) -> dict: @property def dict(self) -> dict: - """Message payload as a dictionary. 
Created from `self.msg.data` and `self.schema_name`, if needed.
+        """Alert data as a dictionary. Created from `self.msg.data`, if needed.
 
         Raises
         ------
@@ -271,17 +242,26 @@ def dataframe(self) -> "pd.DataFrame":
 
     @property
     def alertid(self) -> Union[str, int]:
-        """Convenience property for the alert ID. If the survey does not define an alert ID, this is the `sourceid`."""
+        """Convenience property to get the alert ID.
+
+        If the survey does not define an alert ID, this returns the `sourceid`.
+        """
         return self.get("alertid", self.sourceid)
 
     @property
     def objectid(self) -> Union[str, int]:
-        """Convenience property for the object ID. The "object" represents a collection of sources, as determined by the survey."""
+        """Convenience property to get the object ID.
+
+        The "object" represents a collection of sources, as determined by the survey.
+        """
         return self.get("objectid")
 
     @property
     def sourceid(self) -> Union[str, int]:
-        """Convenience property for the source ID. The "source" is the detection that triggered the alert."""
+        """Convenience property to get the source ID.
+
+        The "source" is the detection that triggered the alert.
+        """
         return self.get("sourceid")
 
     @property
diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py
index a08e99d..5ec7c1a 100644
--- a/pittgoogle/pubsub.py
+++ b/pittgoogle/pubsub.py
@@ -23,11 +23,14 @@
 
 .. code-block:: python
 
-    subscription = pittgoogle.pubsub.Subscription(
-        "my-ztf-loop-subscription",
-        # topic only required if the subscription does not yet exist in Google Cloud
-        topic=pittgoogle.pubsub.Topic("ztf-loop", pittgoogle.utils.ProjectIds.pittgoogle)
-    )
+    # topic the subscription will be connected to
+    # only required if the subscription does not yet exist in Google Cloud
+    topic = pittgoogle.Topic(name="ztf-loop", projectid=pittgoogle.ProjectIds.pittgoogle)
+
+    # choose your own name for the subscription
+    subscription = pittgoogle.Subscription(name="my-ztf-loop-subscription", topic=topic, schema_name="ztf")
+
+    # make sure the subscription exists and we can connect to it. create it if necessary
     subscription.touch()
 
 Pull a small batch of alerts. Helpful for testing. Not recommended for long-running listeners.
 
@@ -92,6 +95,7 @@ def my_batch_callback(results):
 from attrs.validators import gt, instance_of, is_callable, optional
 from google.api_core.exceptions import NotFound
 
-from . import Alert, Auth
+from .alert import Alert
+from .auth import Auth
 from .exceptions import SchemaNotFoundError
 
 LOGGER = logging.getLogger(__name__)
 PACKAGE_DIR = importlib.resources.files(__package__)

From 4f90ffa4d55b5d2464a6861ac92e32f7a7107b1c Mon Sep 17 00:00:00 2001
From: troyraen
Date: Thu, 28 Sep 2023 03:13:42 -0700
Subject: [PATCH 37/55] remove bigquery module

---
 pittgoogle/bigquery.py | 692 -----------------------------------------
 1 file changed, 692 deletions(-)
 delete mode 100644 pittgoogle/bigquery.py

diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py
deleted file mode 100644
index 9da91b7..0000000
--- a/pittgoogle/bigquery.py
+++ /dev/null
@@ -1,692 +0,0 @@
-# -*- coding: UTF-8 -*-
-"""The ``bigquery`` module facilitates querying Pitt-Google Broker's
-BigQuery databases and reading the results.
-See the tutorial for usage help.
-"""
-from typing import Generator, List, Optional, Tuple, Union
-
-import astropy
-import pandas as pd
-from astropy import coordinates as coord
-from google.cloud import bigquery
-from tabulate import tabulate
-
-from .utils import ProjectIds
-
-
-pgb_project_id = ProjectIds.pittgoogle
-
-# --- BigQuery Client
-user_bq_client, user_project_id = None, None  # module's global Client, related id
-
-
-def create_client(project_id: str):
-    """Open a BigQuery Client.
- - Args: - project_id: User's Google Cloud Platform project ID - """ - - global user_bq_client - global user_project_id - - # instantiate the client - print(f"\nInstantiating a BigQuery client with project_id: {project_id}\n") - user_bq_client = bigquery.Client(project=project_id) - - # if the user passed a bad project_id, we won't know it yet. Let's check - _create_client_raise_exception_if_not_connected(project_id) - - # client is connected. set the global user_project_id - user_project_id = project_id - - -def _create_client_raise_exception_if_not_connected(project_id: str): - """Checks that the user's client can successfully connect to our tables - by executing a dry run query. - """ - - global user_bq_client - - query = f"SELECT candid FROM `{pgb_project_id}.ztf_alerts.salt2`" - try: - dry_run(query, notify=False) - except: - user_bq_client = None # reset so the user can try again - msg = ( - "You have tried to create a BigQuery Client with the project_id:\n" - f"\t{project_id}\n" - "But the Client cannot connect to the Pitt-Google Broker.\n" - "Check that your project_id is valid " - "(e.g., it should not be wrapped in quotes)." - ) - raise ValueError(msg) - - -def _check_client_isinstance(): - msg = ( - "You must create a BigQuery client first. " - "Run `pittgoogle.bigquery.create_client('your_project_id')`" - ) - assert isinstance(user_bq_client, bigquery.client.Client), msg - - -def _create_client_if_needed(): - stop = False # will be set to True if the user chooses to exit - - try: - _check_client_isinstance() - - except AssertionError: - # help the user open a bigquery client - msg = ( - "\nTo run queries, you must first open a BigQuery Client.\n" - "Enter your Google Cloud Platform project ID now " - "or exit (just press Enter) and run\n" - "`pittgoogle.bigquery.create_client(my_project_id)`\n" - "\nProject ID: " - ) - project_id = input(msg) or "" - - if project_id == "": - stop = True # user wants to exit rather than creating a client - else: - create_client(project_id) - - return stop - - -# --- Get information about PGB datasets and tables -def get_table_info(table: Union[str, list] = "all", dataset: str = "ztf_alerts"): - """Retrieves and prints BigQuery table schemas. - - Args: - - table: Name of the BigQuery table or list of the same. - 'all' will print the info for all tables in the dataset. - - dataset: Name of BigQuery dataset that the table(s) belong to. - """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - # get the table names in a list - if table == "all": - tables = get_dataset_table_names(dataset=dataset) - elif isinstance(table, str): - tables = [table] - else: - tables = table - - # get and print info about each table - for t in tables: - df = get_table_schema(table=t, dataset=dataset) - - # print the metadata and column info - print(df.table_name) - print(tabulate(df, headers="keys", tablefmt="grid")) # psql - print(f"\n{df.table_name} has {df.num_rows} rows.\n") - - -def get_table_schema(table: str, dataset: str = "ztf_alerts") -> pd.DataFrame: - """Retrieves information about the columns in a BigQuery table and returns - it as a DataFrame. - - Args: - table: Name of the BigQuery table - dataset: Name of BigQuery dataset that the table(s) belong to. - Returns - Column information from the BigQuery table schema. 
- """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - bqtable = user_bq_client.get_table(f"{pgb_project_id}.{dataset}.{table}") - cols = [] - for field in bqtable.schema: - cols.append((field.name, field.description, field.field_type)) - - if field.field_type == "RECORD": - for subfield in field.fields: - cols.append( - ( - f"{field.name}.{subfield.name}", - subfield.description, - subfield.field_type, - ) - ) - - # cols = [(s.name, s.description, s.field_type, s.mode) for s in bqtable.schema] - colnames = ["column_name", "description", "type"] - df = pd.DataFrame(cols, columns=colnames) - - # add some metadata - df.table_name = f"{bqtable.project}.{bqtable.dataset_id}.{bqtable.table_id}" - df.num_rows = bqtable.num_rows - - return df - - -def get_dataset_table_names(dataset: str = "ztf_alerts") -> List[str]: - """ - Args: - dataset: Name of the BigQuery dataset. - - Returns: - List of table names in the dataset. - """ - - # if a bigquery Client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - print(f"Getting table names for dataset: {dataset}") - - query = "SELECT * " f"FROM {pgb_project_id}.{dataset}.INFORMATION_SCHEMA.TABLES" - query_job = user_bq_client.query(query) - tables = [row["table_name"] for row in query_job] - tables.sort(key=str.lower) - return tables - - -# --- Setup to query for object histories -def get_history_column_names() -> List[str]: - """ - It would be convenient to also return the column descriptions, but - that is more complicated, and this function will be completely - obsolete if we change the database structure to store only the - "candidate" observation and metadata. - - Returns: - Column names appropriate for querying object histories. - """ - - dropcols = ["prv_candidates", "cutoutScience", "cutoutDifference", "cutoutTemplate"] - - sdf = get_table_schema("alerts") - schemacols = list(sdf["column_name"]) - - # drop the prv_candidates and cutout columns - historycols = [c for c in schemacols if c.split(".")[0] not in dropcols] - - # drop the full "candidate" RECORD column - historycols.remove("candidate") - - # drop "candidate.candid" as it is simply a repeat of "candid" - historycols.remove("candidate.candid") - - # strip out "candidate." from nested columns - # query_objects() uses only the base names - historycols = [c.replace("candidate.", "") for c in historycols] - - return historycols - - -def check_history_column_names(columns: List[str]) -> Union[List[str], bool]: - """Make sure user-submitted column names are appropriate to query object histories.""" - - -def _split_good_bad_history_column_names( - columns: List[str], -) -> Tuple[List[str], List[str]]: - """Split columns list into "good" and "bad" according to whether they are - suitable for querying an object's history. - """ - - badcols = list(set(columns) - set(get_history_column_names())) - goodcols = columns.copy() - for bc in badcols: - goodcols.remove(bc) - return (goodcols, badcols) - - -def object_history_sql_statement( - columns: List[str], objectIds: Optional[list] = None, limit: Optional[int] = None -) -> str: - """Convince function that generates the SQL string needed to - query the alerts table and aggregate data by objectId. 
- When the resulting SQL query is executed, the query job will contain - one row for each objectId, with the object's data aggregated into - arrays (one array per column in columns) ordered by the observation date. - - Note: Arrays may contain duplicated observations; it is the user's - responsibility to clean them. - - Args: - columns: Names of columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. - limit: Maximum number of rows to be returned. - - Returns: - SQL statement to query the alerts table and aggregate data by objectId. - """ - - dataset = "ztf_alerts" - table = "alerts" - objectcols = [ - "objectId", - ] # columns unique to an object - # make sure 'candid' is in columns. (objectcols handled separately) - columns = list(set(columns).union(set(["candid"]))) - - # SELECT statement - # create a list of strings that will aggregate columns into arrays - aggcols = _list_aggcols_sql_statements(columns) - selects = f'SELECT {", ".join(objectcols + aggcols)}' - - # FROM statement - froms = f"FROM `{pgb_project_id}.{dataset}.{table}`" - # concat the statements into the beginning of a SQL query statement - sqlquery = " ".join([selects, froms]) - - # WHERE statement - if objectIds is not None: - # wrap each objectId in quotes and join to single string - oids = ",".join([f'"{o}"' for o in objectIds]) - wheres = f"WHERE objectId IN ({oids})" - # concat the statements into a SQL query statement - sqlquery = " ".join([sqlquery, wheres]) - - # GROUP BY statement - groupbys = "GROUP BY objectId" - sqlquery = " ".join([sqlquery, groupbys]) - - # LIMIT statement - if limit is not None: - limits = f"LIMIT {limit}" - sqlquery = " ".join([sqlquery, limits]) - - return sqlquery - - -def _list_aggcols_sql_statements(columns: List[str]) -> List[str]: - """Create a list of SQL string query segments that will aggregate - all columns not in objectcols. - """ - - objectcols = [ - "objectId", - ] - flatcols = [ - "schemavsn", - "publisher", - "candid", - ] - - # list of requested flatcols - fcols = list(set(columns) & set(flatcols)) - # list of requested columns nested under 'candidate' - ncols = list(set(columns) - set(objectcols) - set(flatcols)) - ncols = [f"candidate.{c}" for c in ncols] - # complete list of columns to be aggregated (group by) objectId - aggcols = fcols + ncols - # attach the ARRAY_AGG, ORDER By, and AS statements to the aggcols - aggcols = [f'ARRAY_AGG({c} ORDER BY candidate.jd) AS {c.split(".")[-1]}' for c in aggcols] - - return aggcols - - -# --- Dry runs -def dry_run(query: str, notify: bool = True): - """Perform a dry run to find out how many bytes the query will process. - Args: - query: SQL query statement - """ - - global user_project_id - _check_client_isinstance() # make sure we have a bigquery.client - - job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False) - query_job = user_bq_client.query(query, job_config=job_config) - - if notify: - nbytes, TiB = query_job.total_bytes_processed, 2**40 - pTiB = nbytes / TiB * 100 # nbytes as a percent of 1 TiB - print("\nQuery statement:") - print(f'\n"{query}"\n') - print(f"will process {nbytes} bytes of data.") - print(f"({pTiB:.3}% of your 1 TiB Free Tier monthly allotment.)") - - -def _dry_run_and_confirm(query: str) -> bool: - # print dry run info - dry_run(query) - # ask user if they want to proceed - cont = input("Continue? 
[y/N]: ") or "N" - do_the_query = cont in ["y", "Y"] - return do_the_query - - -# --- Query for object histories -def query_objects( - columns: List[str], - objectIds: Optional[list] = None, - limit: Optional[int] = None, - format: str = "pandas", - iterator: bool = False, - dry_run: bool = True, -) -> Union[ - str, - pd.DataFrame, - bigquery.job.QueryJob, - Generator[Union[str, pd.DataFrame], None, None], -]: - """Query the alerts database for object histories. - - Args: - columns: Names of columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. - limit: Limit the number of objects returned to N <= limit. - format: One of 'pandas', 'json', or 'query_job'. Query results will be - returned in this format. Results returned as 'query_job' may - contain duplicate observations; else duplicates are dropped. - iterator: If True, iterate over the objects and return one at a time. - Else return the full query results together. - This parameter is ignored if `format` == 'query_job'. - dry_run: If True, `pittgoogle.bigquery.dry_run` will be called first and the - user will be asked to confirm before continuing. - - Returns: - Query results in the requested format. - """ - - # make sure we have appropriate column names - goodcols = _query_objects_check_history_column_names(columns) - if len(goodcols) == 0: # user submitted bad columns and wants to abort - return - - # if a bigquery client does not exist, help the user instantiate one - stop = _create_client_if_needed() - if stop: # the user has chosen to exit rather than create a client - return - - # generate the SQL statement to query alerts db and aggregate histories - query = object_history_sql_statement(goodcols, objectIds, limit=limit) # str - - # print dry run results - if dry_run: - do_the_query = _dry_run_and_confirm(query) - if not do_the_query: # user has chosen to abort the query - return - - # make the API call - query_job = user_bq_client.query(query) - - # return the results - if format == "query_job": - return query_job - elif iterator: # return a generator that cycles through the objects/rows - return (format_history_query_results(row=row, format=format) for row in query_job) - else: # format and return all rows at once - return format_history_query_results(query_job=query_job, format=format) - - -def _query_objects_check_history_column_names(columns: List[str]) -> List[str]: - """Make sure user-submitted column names are appropriate for `query_objects()`. - - Returns one of: - Columns stripped of bad column names. - Empty list if there were bad columns and the user wants to abort the query. - """ - - goodcols, badcols = _split_good_bad_history_column_names(columns) - - try: - assert len(badcols) == 0 - except AssertionError: - msg = ( - "\nYou have requested columns that are not available to `query_objects()`.\n" - "(To view available columns, use `pittgoogle.bigquery.get_history_column_names()`)\n" - f"\nRequested columns:\n\t{columns}\n" - f"Unavailable columns:\n\t{badcols}\n" - "\nProceed without the unavailable columns? 
[y/N] " - ) - proceed = input(msg) or "N" - - if proceed not in ["y", "Y"]: # user wants to exit; return an empty list - return [] - - return goodcols - - -# --- Format query results -def format_history_query_results( - query_job: Optional[bigquery.job.QueryJob] = None, - row: Optional[bigquery.table.Row] = None, - format: str = "pandas", -) -> Union[pd.DataFrame, str]: - """Converts the results of a BigQuery query to the desired format. - Must pass either query_job or row. - Any duplicate observations will be dropped. - - Args: - query_job: Results from a object history query job. SQL statement needed - to create the job can be obtained with object_history_sql_statement(). - Must supply either query_job or row. - - row: A single row from query_job. Must supply either row or query_job. - - format: One of 'pandas' or 'json'. Input query results will be returned - in this format. - - Returns: - histories: Input query results converted to requested format - """ - - # make sure we have an appropriate param combination - do_job, do_row = query_job is not None, row is not None - good_format = format in ["pandas", "json"] - good_combo = (do_job != do_row) and good_format - if not good_combo: - raise ValueError("Must pass one of query_job or row.") - - # convert query_job - if do_job: - histories = _format_history_query_results_to_df(query_job) # df - if format == "json": - histories = histories.reset_index().to_json() # str - - # convert row - if do_row: - histories = _format_history_row_to_df(row) # df - if format == "json": - histories["objectId"] = histories.objectId # persist metadata - histories = histories.reset_index().to_json() # str - - return histories - - -def _format_history_query_results_to_df(query_job: bigquery.job.QueryJob): - """Convert a query_job (containing multiple rows of object history data) - to a DataFrame. - Any duplicate observations will be dropped. - """ - - dflist = [] - for row in query_job: - # convert to DataFrame - df = _format_history_row_to_df(row) - # add the objectId so we can use it to multi-index - df["objectId"] = df.objectId - # set the multi-index and append to the list - dflist.append(df.reset_index().set_index(["objectId", "candid"])) - - histories = pd.concat(dflist) - - return histories - - -def _format_history_row_to_df(row: Union[dict, bigquery.table.Row]): - """Convert a single object's history from a query row to a DataFrame. - Any duplicate observations will be dropped. - """ - - d = dict(row.items()) - oid, cid = d.pop("objectId"), d.pop("candid") - df = pd.DataFrame(data=d, index=pd.Index(cid, name="candid")) - df.drop_duplicates(inplace=True) - df.objectId = oid - return df - - -# --- Cone Search -def cone_search( - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - columns: List[str], - objectIds: Optional[list] = None, - format: str = "pandas", - iterator: bool = False, - dry_run: bool = True, -) -> Union[str, pd.DataFrame, Generator[Union[str, pd.DataFrame], None, None]]: - """Perform a cone search on the alerts database and return object histories. - This uses the coordinates of the most recent observation to determine - whether an object is within the cone. - - Args: - center: Center of the cone to search within. - radius: Radius of the cone to search within. - columns: Names of history columns to select from the alerts table. - The 'objectId' and 'candid' columns are automatically included - and do not need to be in this list. - objectIds: IDs of ZTF objects to include in the query. 
- format: One of 'pandas', or 'json'. Query results will be - returned in this format. Duplicate observations are dropped. - iterator: If True, iterate over the objects and return one at a time. - Else return the full query results together. - dry_run: If True, `pittgoogle.bigquery.dry_run` will be called first and the - user will be asked to confirm before continuing. - - Returns: - Query results in the requested format. - """ - - # make sure we have required columns - for c in ["jd", "ra", "dec"]: - if c not in columns: - columns.append(c) - - # Performing a dry run prints the SQL query statement, which does not account - # for the cone search. We'll print some things to reduce user confusion. - if dry_run: - print("\nInitiating a cone search.") - - # Query the database for object histories. - objects = query_objects( - columns, - objectIds=objectIds, - format="pandas", - iterator=iterator, - dry_run=dry_run, - ) - # == None if user chose to abort; else DataFrame or generator of same - if objects is None: - return - - if dry_run: - print("\nFiltering for objects within the given cone.") - - # filter out objects not in the cone and return the rest - objects_in_cone = _do_cone_search(objects, center, radius, format, iterator) - return objects_in_cone - - -def _do_cone_search( - objects: Union[pd.DataFrame, Generator[pd.DataFrame, None, None]], - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format: str = "pandas", - iterator: bool = False, -) -> Union[str, pd.DataFrame, Generator[Union[str, pd.DataFrame], None, None]]: - """Apply the cone search filter and return appropriate objects.""" - - if iterator: # objects is a generator, return a generator - return _do_cone_search_iterator(objects, center, radius, format) - - else: # objects is single df - return _do_cone_search_all(objects, center, radius, format) - - -def _do_cone_search_iterator( - objects: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format, -): - """Iterate objects, format and yield those that are in the cone. - - Args: - objects: DataFrame containing histories of multiple objectIds. - """ - - for df in objects: - in_cone = object_is_in_cone(df, center, radius) - - if in_cone: # format and yield - if format == "json": - df["objectId"] = df.objectId # else metadata is lost - object = df.reset_index().to_json() # str - else: - object = df - yield object - - -def _do_cone_search_all( - objects: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, - format, -): - """Filter out objects not in the cone, format, and return. - - Args: - objects: DataFrame containing histories of multiple objectIds. - """ - - gb = objects.groupby(level="objectId") - objects_in_cone = gb.filter(lambda df: object_is_in_cone(df, center, radius)) - if format == "json": - objects_in_cone = objects_in_cone.reset_index().to_json() # str - return objects_in_cone - - -def object_is_in_cone( - object: pd.DataFrame, - center: astropy.coordinates.SkyCoord, - radius: astropy.coordinates.Angle, -): - """Checks whether the object's most recent observation has a position that - is within a cone defined by center and radius. - - Args: - object: DataFrame containing the history of a single objectId. - Required columns: ['jd','ra','dec'] - center: Center of the cone to search within. - radius: Radius of the cone to search within. 
- - Returns: - True if object is within radius of center, else False - """ - - # get the SkyCoords of the most recent observation - # to do: use the epoch with highest S/N instead - obs = object.loc[object["jd"] == object["jd"].max(), :] - obs_coords = coord.SkyCoord(obs["ra"], obs["dec"], frame="icrs", unit="deg") - - # check whether obs_coords are within the cone - dist = center.separation(obs_coords) - in_cone = dist < radius # array with a single bool - in_cone = in_cone[0] - - return in_cone From ebe79e928e55700ecf6b1be41dc9ba64eb11f7d3 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:18:50 -0700 Subject: [PATCH 38/55] add bigquery module with Table class --- pittgoogle/bigquery.py | 164 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) create mode 100644 pittgoogle/bigquery.py diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py new file mode 100644 index 0000000..68af542 --- /dev/null +++ b/pittgoogle/bigquery.py @@ -0,0 +1,164 @@ +# -*- coding: UTF-8 -*- +"""Classes to facilitate connections to BigQuery datasets and tables. + +.. contents:: + :local: + :depth: 2 + +.. note:: + + This module relies on :mod:`pittgoogle.auth` to authenticate API calls. + The examples given below assume the use of a :ref:`service account ` and + :ref:`environment variables `. In this case, :mod:`pittgoogle.auth` does not + need to be called explicitly. + +Usage Examples +--------------- + +.. code-block:: python + + import pittgoogle + + [TODO] + +API +---- + +""" +import logging +from typing import TYPE_CHECKING, Optional, Union + +import google.cloud.bigquery as bigquery +from attrs import define, field +from attrs.validators import instance_of, optional + +from .auth import Auth + +if TYPE_CHECKING: + from . import Alert + + +LOGGER = logging.getLogger(__name__) + + +@define +class Table: + """Methods and properties for a BigQuery table. + + Parameters + ------------ + name : `str` + Name of the BigQuery table. + dataset : `str` + Name of the BigQuery dataset this table belongs to. + + projectid : `str`, optional + The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this + if you are connecting to a subscription owned by a different project than this topic. Note: + :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs. + auth : :class:`pittgoogle.auth.Auth`, optional + Credentials for the Google Cloud project that owns this topic. If not provided, + it will be created from environment variables when needed. + client : `pubsub_v1.PublisherClient`, optional + Pub/Sub client that will be used to access the topic. If not provided, a new client will + be created (using `auth`) the first time it is requested. + """ + + name: str = field() + dataset: str = field() + projectid: str = field(default=None) + _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) + _client: Optional[bigquery.Client] = field( + default=None, validator=optional(instance_of(bigquery.Client)) + ) + _table: Optional[bigquery.Table] = field(default=None, init=False) + + @classmethod + def from_cloud( + cls, + name: str, + *, + dataset: Optional[str] = None, + survey: Optional[str] = None, + testid: Optional[str] = None, + ): + """Create a `Table` with a `client` using implicit credentials (no explicit `auth`). + + The `projectid` will be retrieved from the `client`. + + Parameters + ---------- + name : `str` + Name of the table. + dataset : `str`, optional + Name of the dataset containing the table. 
Either this or a `survey` is required. If a + `testid` is provided, it will be appended to this name following the Pitt-Google naming syntax. + survey : `str`, optional + Name of the survey. This will be used as the name of the dataset if the `dataset` kwarg + is not provided. This kwarg is provided for convenience in cases where the Pitt-Google + naming syntax is used to name resources. + testid : `str`, optional + Pipeline identifier. If this is not `None`, `False`, or `"False"` it will be appended to + the dataset name. This is used in cases where the Pitt-Google naming syntax is used to name + resources. This allows pipeline modules to find the correct resources without interfering + with other pipelines that may have deployed resources with the same base names + (e.g., for development and testing purposes). + """ + if dataset is None: + # [TODO] update the elasticc broker to name the dataset using the survey name only + dataset = survey + # if testid is not False, "False", or None, append it to the dataset + if testid and testid != "False": + dataset = f"{dataset}_{testid}" + client = bigquery.Client() + table = cls(name, dataset=dataset, projectid=client.project, client=client) + # make the get request now to create a connection to the table + _ = table.table + return table + + @property + def auth(self) -> Auth: + """Credentials for the Google Cloud project that owns this topic. + + This will be created from environment variables if `self._auth` is None. + """ + if self._auth is None: + self._auth = Auth() + + if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") + self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + + return self._auth + + @property + def id(self) -> str: + """Fully qualified table ID.""" + # make sure we have a projectid. if it needs to be set, call auth + if self.projectid is None: + self.auth + return f"{self.projectid}.{self.dataset}.{self.name}" + + @property + def table(self) -> bigquery.Table: + """Return a BigQuery Table object that's connected to the table. Makes a get request if necessary.""" + if self._table is None: + self._table = self.client.get_table(self.id) + return self._table + + @property + def client(self) -> bigquery.Client: + """BigQuery client for table access. + + Will be created using `self.auth.credentials` if necessary. + """ + if self._client is None: + self._client = bigquery.Client(credentials=self.auth.credentials) + return self._client + + def insert_rows(self, alerts: Union["Alert", list["Alert"]]) -> list[dict]: + rows = [alert.dict for alert in list(alerts)] + errors = self.client.insert_rows(self.table, rows) + if len(errors) > 0: + LOGGER.warning(f"BigQuery insert error: {errors}") + return errors From ca495ac46715dab210129a84dfe5bc54b2d05b42 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:24:44 -0700 Subject: [PATCH 39/55] update __init__ imports --- pittgoogle/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 0a4b9ab..94d388a 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -11,8 +11,10 @@ from .auth import Auth from .alert import Alert +from .bigquery import Table from .pubsub import Topic, Subscription, Consumer -from . import auth, alert, bigquery, exceptions, pubsub, utils +from .registry import ProjectIds, Schemas +from . 
import exceptions, types_, registry, utils, auth, alert, bigquery, pubsub __version__ = metadata.version("pittgoogle-client") From cf65479354b230af3323d862c77c3a75bc922d8c Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:32:04 -0700 Subject: [PATCH 40/55] remove obsolete method --- pittgoogle/utils.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index f185135..e605f0e 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -106,24 +106,6 @@ def b64avro_to_dict(bytes_data): return Cast.avro_to_dict(b64decode(bytes_data)) # --- Work with alert dictionaries - @staticmethod - def alert_dict_to_dataframe(alert_dict: dict) -> pd.DataFrame: - """Package a ZTF alert dictionary into a dataframe. - - Adapted from: - https://github.com/ZwickyTransientFacility/ztf-avro-alert/blob/master/notebooks/Filtering_alerts.ipynb - """ - dfc = pd.DataFrame(alert_dict["candidate"], index=[0]) - df_prv = pd.DataFrame(alert_dict["prv_candidates"]) - df = pd.concat([dfc, df_prv], ignore_index=True, sort=True) - df = df[dfc.columns] # return to original column ordering - - # we'll attach some metadata - # note this may not be preserved after all operations - # https://stackoverflow.com/questions/14688306/adding-meta-information-metadata-to-pandas-dataframe - df.objectId = alert_dict["objectId"] - return df - @staticmethod def alert_dict_to_table(alert_dict: dict) -> Table: """Package a ZTF alert dictionary into an Astopy Table.""" From f5997786c2be09be5063ffd715ff6920218add3f Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:33:31 -0700 Subject: [PATCH 41/55] clean up imports --- pittgoogle/types_.py | 7 ------- pittgoogle/utils.py | 1 - 2 files changed, 8 deletions(-) diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index 58f2ada..494267f 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -1,18 +1,11 @@ # -*- coding: UTF-8 -*- """Functions to support working with alerts and related data.""" import importlib.resources -import json import logging -from base64 import b64decode, b64encode -from collections import OrderedDict -from io import BytesIO from typing import TYPE_CHECKING, Optional import fastavro -import pandas as pd import yaml -from astropy.table import Table -from astropy.time import Time from attrs import define, field if TYPE_CHECKING: diff --git a/pittgoogle/utils.py b/pittgoogle/utils.py index e605f0e..cd18980 100644 --- a/pittgoogle/utils.py +++ b/pittgoogle/utils.py @@ -7,7 +7,6 @@ from io import BytesIO import fastavro -import pandas as pd from astropy.table import Table from astropy.time import Time from attrs import define From 681324eefae20fdd8835cabc8b5bec621ba35bb6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 03:44:12 -0700 Subject: [PATCH 42/55] replace ClassVar -> Final --- pittgoogle/registry.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index ba2a62a..02ff63f 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -2,7 +2,7 @@ """Pitt-Google registries.""" import importlib.resources import logging -from typing import ClassVar +from typing import Final from attrs import define @@ -18,16 +18,16 @@ class ProjectIds: """Registry of Google Cloud Project IDs.""" - pittgoogle: ClassVar[str] = "ardent-cycling-243415" + pittgoogle: Final[str] = "ardent-cycling-243415" """Pitt-Google's production project.""" - pittgoogle_dev: ClassVar[str] = "avid-heading-329016" + 
pittgoogle_dev: Final[str] = "avid-heading-329016" """Pitt-Google's development project.""" - # pittgoogle_billing: ClassVar[str] = "light-cycle-328823" + # pittgoogle_billing: Final[str] = "light-cycle-328823" # """Pitt-Google's billing project.""" - elasticc: ClassVar[str] = "elasticc-challenge" + elasticc: Final[str] = "elasticc-challenge" """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" @@ -43,7 +43,7 @@ class Schemas: # - if an avro schema file is being registered with the schema (using the `path` arg), it is # recommended that the file have the same name (path stem) as the schema. the file name # must end with ".avsc". - dict: ClassVar[dict] = { + dict: Final[dict] = { "elasticc.v0_9_1.alert": types_.Schema( name="elasticc.v0_9_1.alert", description="Avro schema of alerts published by ELAsTiCC.", From 54103e15266e821c0332616980220d7932ca71fa Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 04:02:52 -0700 Subject: [PATCH 43/55] update descriptions --- pittgoogle/registry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index 02ff63f..a322c7c 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -22,13 +22,13 @@ class ProjectIds: """Pitt-Google's production project.""" pittgoogle_dev: Final[str] = "avid-heading-329016" - """Pitt-Google's development project.""" + """Pitt-Google's testing and development project.""" # pittgoogle_billing: Final[str] = "light-cycle-328823" # """Pitt-Google's billing project.""" elasticc: Final[str] = "elasticc-challenge" - """Project running a classifier for ELAsTiCC alerts and reporting to DESC.""" + """Project running classifiers for ELAsTiCC alerts and reporting to DESC.""" @define(frozen=True) From a2ed5d72f748be17a5e459a5f6279142c50312ea Mon Sep 17 00:00:00 2001 From: troyraen Date: Thu, 28 Sep 2023 04:03:29 -0700 Subject: [PATCH 44/55] make the Schemas dict a class method --- pittgoogle/registry.py | 63 +++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index a322c7c..c8f14d7 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -35,41 +35,42 @@ class ProjectIds: class Schemas: """Registry of schemas used by Pitt-Google.""" - # dict defining the schemas in the registry - # naming conventions: - # - schema names are expected to start with the name of the survey - # - if the survey has more than one schema, the survey name should be followed by a ".", - # followed by schema-specific specifier(s) - # - if an avro schema file is being registered with the schema (using the `path` arg), it is - # recommended that the file have the same name (path stem) as the schema. the file name - # must end with ".avsc". - dict: Final[dict] = { - "elasticc.v0_9_1.alert": types_.Schema( - name="elasticc.v0_9_1.alert", - description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", - ), - "elasticc.v0_9_1.brokerClassification": types_.Schema( - name="elasticc.v0_9_1.brokerClassification", - description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", - ), - "ztf": types_.Schema( - name="ztf", - description=( - "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " - "in the header. 
Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." # [TODO] + @classmethod + def manifest(cls) -> dict: + """Return the dictionary defining the schemas in the registry.""" + # naming conventions: + # - schema names are expected to start with the name of the survey + # - if the survey has more than one schema, the survey name should be followed by a ".", + # followed by schema-specific specifier(s) + # - if an avro schema file is being registered with the schema (using the `path` arg), it is + # recommended that the file have the same name (path stem) as the schema. the file name + # must end with ".avsc". + return { + "elasticc.v0_9_1.alert": types_.Schema( + name="elasticc.v0_9_1.alert", + description="Avro schema of alerts published by ELAsTiCC.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + ), + "elasticc.v0_9_1.brokerClassification": types_.Schema( + name="elasticc.v0_9_1.brokerClassification", + description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", + path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + ), + "ztf": types_.Schema( + name="ztf", + description=( + "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " + "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " + "both cases." # [TODO] + ), + path=None, ), - path=None, - ), - } - """Dict defining the schemas in the registry.""" + } @classmethod def names(cls) -> list[str]: """Return the names of all registered schemas.""" - return list(cls.dict.keys()) + return list(cls.manifest().keys()) @classmethod def get(cls, schema_name: str) -> types_.Schema: @@ -81,7 +82,7 @@ def get(cls, schema_name: str) -> types_.Schema: if a schema called `schema_name` is not found """ # if there is no registered schema with this name, raise an error - schema = cls.dict.get(schema_name) + schema = cls.manifest().get(schema_name) if schema is None: raise SchemaNotFoundError( f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." From cf307f4ccbd2e5cdadd1491fdd95701e03bc190e Mon Sep 17 00:00:00 2001 From: troyraen Date: Fri, 29 Sep 2023 02:31:23 -0700 Subject: [PATCH 45/55] bugfix insert_rows --- pittgoogle/bigquery.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index 68af542..e7bf769 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -26,16 +26,13 @@ """ import logging -from typing import TYPE_CHECKING, Optional, Union +from typing import Optional, Union import google.cloud.bigquery as bigquery from attrs import define, field from attrs.validators import instance_of, optional -from .auth import Auth - -if TYPE_CHECKING: - from . import Alert +from . 
import Alert, Auth LOGGER = logging.getLogger(__name__) @@ -156,9 +153,10 @@ def client(self) -> bigquery.Client: self._client = bigquery.Client(credentials=self.auth.credentials) return self._client - def insert_rows(self, alerts: Union["Alert", list["Alert"]]) -> list[dict]: - rows = [alert.dict for alert in list(alerts)] - errors = self.client.insert_rows(self.table, rows) + def insert_rows(self, rows: Union[list[dict], list[Alert]]) -> list[dict]: + # if elements of rows are Alerts, need to extract the dicts + myrows = [row.dict if isinstance(row, Alert) else row for row in rows] + errors = self.client.insert_rows(self.table, myrows) if len(errors) > 0: LOGGER.warning(f"BigQuery insert error: {errors}") return errors From 2928b5b2baf7828e9c3bff66c161822fa348fa8b Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 00:28:25 -0700 Subject: [PATCH 46/55] publish_time -> datetime --- pittgoogle/alert.py | 11 ++++++++++- pittgoogle/types_.py | 4 ++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 4c94a6d..d8c8560 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -24,6 +24,7 @@ import importlib.resources import io import logging +from datetime import datetime from pathlib import Path from typing import Any, TYPE_CHECKING, Optional, Union @@ -126,13 +127,21 @@ def index(): if not isinstance(envelope, dict) or "message" not in envelope: raise BadRequest("Bad Request: invalid Pub/Sub message format") + # convert the message publish_time string -> datetime + # occasionally the string doesn't include microseconds so we need a try/except + publish_time = envelope["message"]["publish_time"].replace("Z", "+00:00") + try: + publish_time = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S.%f%z") + except ValueError: + publish_time = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S%z") + return cls( msg=types_.PubsubMessageLike( # this class requires data. 
the rest should be present in the message, but let's be lenient data=envelope["message"]["data"], attributes=envelope["message"].get("attributes"), message_id=envelope["message"].get("message_id"), - publish_time=envelope["message"].get("publish_time"), + publish_time=publish_time, ordering_key=envelope["message"].get("ordering_key"), ), schema_name=schema_name, diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index 494267f..a9ebee0 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -9,7 +9,7 @@ from attrs import define, field if TYPE_CHECKING: - import google.protobuf.timestamp_pb2 + import datetime from pathlib import Path @@ -83,5 +83,5 @@ class PubsubMessageLike: data: bytes = field() attributes: dict = field(factory=dict) message_id: Optional[str] = field(default=None) - publish_time: Optional["google.protobuf.timestamp_pb2.Timestamp"] = field(default=None) + publish_time: Optional["datetime.datetime"] = field(default=None) ordering_key: Optional[str] = field(default=None) From f430751ac175ab1855e58bb3303824acb953079f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 00:28:35 -0700 Subject: [PATCH 47/55] cleanup strings --- pittgoogle/registry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index c8f14d7..8260ebe 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -49,19 +49,19 @@ def manifest(cls) -> dict: "elasticc.v0_9_1.alert": types_.Schema( name="elasticc.v0_9_1.alert", description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.alert.avsc", + path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc", ), "elasticc.v0_9_1.brokerClassification": types_.Schema( name="elasticc.v0_9_1.brokerClassification", description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / f"schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", + path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", ), "ztf": types_.Schema( name="ztf", description=( "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." # [TODO] + "both cases." ), path=None, ), From 5f9c0f148b53bf6aa1605598d3181662bbd3b4ce Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 01:45:24 -0700 Subject: [PATCH 48/55] move schemas manifest to yaml --- pittgoogle/registry.py | 62 +++++++---------------- pittgoogle/registry_manifests/schemas.yml | 16 ++++++ 2 files changed, 35 insertions(+), 43 deletions(-) create mode 100644 pittgoogle/registry_manifests/schemas.yml diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index 8260ebe..cdc50ad 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -4,6 +4,7 @@ import logging from typing import Final +import yaml from attrs import define from . 
import types_ @@ -12,6 +13,7 @@ LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) +SCHEMA_MANIFEST = yaml.safe_load((PACKAGE_DIR / "registry_manifests/schemas.yml").read_text()) @define(frozen=True) @@ -35,43 +37,6 @@ class ProjectIds: class Schemas: """Registry of schemas used by Pitt-Google.""" - @classmethod - def manifest(cls) -> dict: - """Return the dictionary defining the schemas in the registry.""" - # naming conventions: - # - schema names are expected to start with the name of the survey - # - if the survey has more than one schema, the survey name should be followed by a ".", - # followed by schema-specific specifier(s) - # - if an avro schema file is being registered with the schema (using the `path` arg), it is - # recommended that the file have the same name (path stem) as the schema. the file name - # must end with ".avsc". - return { - "elasticc.v0_9_1.alert": types_.Schema( - name="elasticc.v0_9_1.alert", - description="Avro schema of alerts published by ELAsTiCC.", - path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc", - ), - "elasticc.v0_9_1.brokerClassification": types_.Schema( - name="elasticc.v0_9_1.brokerClassification", - description="Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts.", - path=PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc", - ), - "ztf": types_.Schema( - name="ztf", - description=( - "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached " - "in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers " - "both cases." - ), - path=None, - ), - } - - @classmethod - def names(cls) -> list[str]: - """Return the names of all registered schemas.""" - return list(cls.manifest().keys()) - @classmethod def get(cls, schema_name: str) -> types_.Schema: """Return the registered schema called `schema_name`. @@ -81,10 +46,21 @@ def get(cls, schema_name: str) -> types_.Schema: :class:`pittgoogle.exceptions.SchemaNotFoundError` if a schema called `schema_name` is not found """ - # if there is no registered schema with this name, raise an error - schema = cls.manifest().get(schema_name) - if schema is None: - raise SchemaNotFoundError( - f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." + for schema in SCHEMA_MANIFEST: + if schema["name"] != schema_name: + continue + + return types_.Schema( + name=schema["name"], + description=schema["description"], + path=PACKAGE_DIR / schema["path"] if schema["path"] is not None else None, ) - return schema + + raise SchemaNotFoundError( + f"{schema_name} not found. for a list of valid names, use `pittgoogle.Schemas.names()`." + ) + + @classmethod + def names(cls) -> list[str]: + """Return the names of all registered schemas.""" + return [schema["name"] for schema in SCHEMA_MANIFEST] diff --git a/pittgoogle/registry_manifests/schemas.yml b/pittgoogle/registry_manifests/schemas.yml new file mode 100644 index 0000000..654dd52 --- /dev/null +++ b/pittgoogle/registry_manifests/schemas.yml @@ -0,0 +1,16 @@ +# Guidelines: +# - Schema names must start with the name of the survey. If the survey has more than one schema +# the survey name should be followed by a "." and then a schema-specific specifier(s). +# - If a schema file is also being registered (path key), it is recommended that the file have the +# same name (path stem) as the schema. 
Avro is the only file type currently implemented, and the file name +# must end with ".avsc". +# - The path must be relative to the package directory or null if no schema file is being registered. +- name: "elasticc.v0_9_1.alert" + description: "Avro schema of alerts published by ELAsTiCC." + path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc" +- name: "elasticc.v0_9_1.brokerClassification" + description: "Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts." + path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" +- name: "ztf" + description: "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers both cases." + path: null From 3da9628b7ad399462f3d46d1d17e10d7b82e10f6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 02:40:12 -0700 Subject: [PATCH 49/55] sort package imports --- pittgoogle/__init__.py | 7 +++---- pittgoogle/alert.py | 10 ++++------ pittgoogle/bigquery.py | 4 ++-- pittgoogle/pubsub.py | 4 ++-- pittgoogle/registry.py | 1 - pittgoogle/types_.py | 1 - 6 files changed, 11 insertions(+), 16 deletions(-) diff --git a/pittgoogle/__init__.py b/pittgoogle/__init__.py index 94d388a..2da4e88 100644 --- a/pittgoogle/__init__.py +++ b/pittgoogle/__init__.py @@ -9,13 +9,12 @@ except ImportError: # for Python<3.8 import importlib_metadata as metadata -from .auth import Auth +from . import alert, auth, bigquery, exceptions, pubsub, registry, types_, utils from .alert import Alert +from .auth import Auth from .bigquery import Table -from .pubsub import Topic, Subscription, Consumer +from .pubsub import Consumer, Subscription, Topic from .registry import ProjectIds, Schemas -from . import exceptions, types_, registry, utils, auth, alert, bigquery, pubsub - __version__ = metadata.version("pittgoogle-client") diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index d8c8560..2771ed5 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -26,21 +26,19 @@ import logging from datetime import datetime from pathlib import Path -from typing import Any, TYPE_CHECKING, Optional, Union +from typing import TYPE_CHECKING, Any, Optional, Union import fastavro from attrs import define, field -from . import registry, types_ +from . import registry, types_, utils from .exceptions import BadRequest, OpenAlertError, SchemaNotFoundError -from .utils import Cast if TYPE_CHECKING: import google._upb._message import google.cloud.pubsub_v1 import pandas as pd # always lazy-load pandas. it hogs memory on cloud functions and run - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) @@ -229,10 +227,10 @@ def dict(self) -> dict: # [TODO] this should be rewritten to catch specific errors # for now, just try avro then json, catching basically all errors in the process try: - self._dict = Cast.avro_to_dict(self.msg.data) + self._dict = utils.Cast.avro_to_dict(self.msg.data) except Exception: try: - self._dict = Cast.json_to_dict(self.msg.data) + self._dict = utils.Cast.json_to_dict(self.msg.data) except Exception: raise OpenAlertError("failed to deserialize the alert bytes") return self._dict diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index e7bf769..04cbf79 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -32,8 +32,8 @@ from attrs import define, field from attrs.validators import instance_of, optional -from . 
import Alert, Auth - +from .alert import Alert +from .auth import Auth LOGGER = logging.getLogger(__name__) diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 5ec7c1a..4f8ecbe 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -94,10 +94,10 @@ def my_batch_callback(results): from attrs.validators import gt, instance_of, is_callable, optional from google.api_core.exceptions import NotFound -from . import Alert, Auth +from .alert import Alert +from .auth import Auth from .exceptions import SchemaNotFoundError - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) diff --git a/pittgoogle/registry.py b/pittgoogle/registry.py index cdc50ad..29cb75f 100644 --- a/pittgoogle/registry.py +++ b/pittgoogle/registry.py @@ -10,7 +10,6 @@ from . import types_ from .exceptions import SchemaNotFoundError - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) SCHEMA_MANIFEST = yaml.safe_load((PACKAGE_DIR / "registry_manifests/schemas.yml").read_text()) diff --git a/pittgoogle/types_.py b/pittgoogle/types_.py index a9ebee0..104a769 100644 --- a/pittgoogle/types_.py +++ b/pittgoogle/types_.py @@ -12,7 +12,6 @@ import datetime from pathlib import Path - LOGGER = logging.getLogger(__name__) PACKAGE_DIR = importlib.resources.files(__package__) From 90dc68c74311f5cef7265d02bdfe361185dc23b6 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 8 Oct 2023 02:41:08 -0700 Subject: [PATCH 50/55] bugfix schema paths --- pittgoogle/registry_manifests/schemas.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pittgoogle/registry_manifests/schemas.yml b/pittgoogle/registry_manifests/schemas.yml index 654dd52..1929b99 100644 --- a/pittgoogle/registry_manifests/schemas.yml +++ b/pittgoogle/registry_manifests/schemas.yml @@ -7,10 +7,10 @@ # - The path must be relative to the package directory or null if no schema file is being registered. - name: "elasticc.v0_9_1.alert" description: "Avro schema of alerts published by ELAsTiCC." - path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.alert.avsc" + path: "schemas/elasticc/elasticc.v0_9_1.alert.avsc" - name: "elasticc.v0_9_1.brokerClassification" description: "Avro schema of alerts to be sent to DESC containing classifications of ELAsTiCC alerts." - path: PACKAGE_DIR / "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" + path: "schemas/elasticc/elasticc.v0_9_1.brokerClassification.avsc" - name: "ztf" description: "ZTF schema. The ZTF survey publishes alerts in Avro format with the schema attached in the header. Pitt-Google publishes ZTF alerts in json format. This schema covers both cases." path: null From 5d82aaaf8741a9a63c30735649a9d8b56689a4b9 Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 02:54:17 -0800 Subject: [PATCH 51/55] add forced sources to alert.dataframe --- pittgoogle/alert.py | 20 +++++++++++++++++--- pittgoogle/schemas/maps/elasticc.yml | 1 + 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 2771ed5..03dd017 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -242,9 +242,23 @@ def dataframe(self) -> "pd.DataFrame": import pandas as pd # always lazy-load pandas. 
it hogs memory on cloud functions and run - src_df = pd.DataFrame(self.get("source"), index=[0]) - prvs_df = pd.DataFrame(self.get("prv_sources")) - self._dataframe = pd.concat([src_df, prvs_df], ignore_index=True) + # sources and previous sources are expected to have the same fields + sources_df = pd.DataFrame([self.get("source")] + self.get("prv_sources")) + # sources and forced sources may have different fields + forced_df = pd.DataFrame(self.get("prv_forced_sources")) + + # use nullable integer data type to avoid converting ints to floats + # for columns in one dataframe but not the other + sources_ints = [c for c, v in sources_df.dtypes.items() if v == int] + sources_df = sources_df.astype( + {c: "Int64" for c in set(sources_ints) - set(forced_df.columns)} + ) + forced_ints = [c for c, v in forced_df.dtypes.items() if v == int] + forced_df = forced_df.astype( + {c: "Int64" for c in set(forced_ints) - set(sources_df.columns)} + ) + + self._dataframe = pd.concat([sources_df, forced_df], ignore_index=True) return self._dataframe @property diff --git a/pittgoogle/schemas/maps/elasticc.yml b/pittgoogle/schemas/maps/elasticc.yml index 50852c1..7087ff4 100644 --- a/pittgoogle/schemas/maps/elasticc.yml +++ b/pittgoogle/schemas/maps/elasticc.yml @@ -8,6 +8,7 @@ objectid: [diaObject, diaObjectId] source: diaSource sourceid: [diaSource, diaSourceId] prv_sources: prvDiaSources +prv_forced_sources: prvDiaForcedSources mjd: midPointTai filter: filterName mag: magpsf From 8498ab19c8e792e3228835006fb17ac15371437f Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 03:18:25 -0800 Subject: [PATCH 52/55] add projectid property --- pittgoogle/bigquery.py | 27 +++++++++++++++------------ pittgoogle/pubsub.py | 16 ++++++++++------ 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/pittgoogle/bigquery.py b/pittgoogle/bigquery.py index 04cbf79..d0e3144 100644 --- a/pittgoogle/bigquery.py +++ b/pittgoogle/bigquery.py @@ -50,20 +50,19 @@ class Table: Name of the BigQuery dataset this table belongs to. projectid : `str`, optional - The topic owner's Google Cloud project ID. Either this or `auth` is required. Use this - if you are connecting to a subscription owned by a different project than this topic. Note: + The table owner's Google Cloud project ID. Either this or `auth` is required. Note: :attr:`pittgoogle.utils.ProjectIds` is a registry containing Pitt-Google's project IDs. auth : :class:`pittgoogle.auth.Auth`, optional - Credentials for the Google Cloud project that owns this topic. If not provided, + Credentials for the Google Cloud project that owns this table. If not provided, it will be created from environment variables when needed. - client : `pubsub_v1.PublisherClient`, optional - Pub/Sub client that will be used to access the topic. If not provided, a new client will + client : `bigquery.Client`, optional + BigQuery client that will be used to access the table. If not provided, a new client will be created (using `auth`) the first time it is requested. """ name: str = field() dataset: str = field() - projectid: str = field(default=None) + _projectid: str = field(default=None) _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) _client: Optional[bigquery.Client] = field( default=None, validator=optional(instance_of(bigquery.Client)) @@ -115,27 +114,31 @@ def from_cloud( @property def auth(self) -> Auth: - """Credentials for the Google Cloud project that owns this topic. + """Credentials for the Google Cloud project that owns this table. 
This will be created from environment variables if `self._auth` is None. """ if self._auth is None: self._auth = Auth() - if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + if (self._projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self._projectid is not None): LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") - self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + self._projectid = self._auth.GOOGLE_CLOUD_PROJECT return self._auth @property def id(self) -> str: """Fully qualified table ID.""" - # make sure we have a projectid. if it needs to be set, call auth - if self.projectid is None: - self.auth return f"{self.projectid}.{self.dataset}.{self.name}" + @property + def projectid(self) -> str: + """The table owner's Google Cloud project ID.""" + if self._projectid is None: + self._projectid = self.auth.GOOGLE_CLOUD_PROJECT + return self._projectid + @property def table(self) -> bigquery.Table: """Return a BigQuery Table object that's connected to the table. Makes a get request if necessary.""" diff --git a/pittgoogle/pubsub.py b/pittgoogle/pubsub.py index 4f8ecbe..dc99a07 100644 --- a/pittgoogle/pubsub.py +++ b/pittgoogle/pubsub.py @@ -174,7 +174,7 @@ class Topic: """ name: str = field() - projectid: str = field(default=None) + _projectid: str = field(default=None) _auth: Auth = field(default=None, validator=optional(instance_of(Auth))) _client: Optional[pubsub_v1.PublisherClient] = field( default=None, validator=optional(instance_of(pubsub_v1.PublisherClient)) @@ -232,20 +232,24 @@ def auth(self) -> Auth: if self._auth is None: self._auth = Auth() - if (self.projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self.projectid is not None): + if (self._projectid != self._auth.GOOGLE_CLOUD_PROJECT) and (self._projectid is not None): LOGGER.warning(f"setting projectid to match auth: {self._auth.GOOGLE_CLOUD_PROJECT}") - self.projectid = self._auth.GOOGLE_CLOUD_PROJECT + self._projectid = self._auth.GOOGLE_CLOUD_PROJECT return self._auth @property def path(self) -> str: """Fully qualified path to the topic.""" - # make sure we have a projectid. if it needs to be set, call auth - if self.projectid is None: - self.auth return f"projects/{self.projectid}/topics/{self.name}" + @property + def projectid(self) -> str: + """The topic owner's Google Cloud project ID.""" + if self._projectid is None: + self._projectid = self.auth.GOOGLE_CLOUD_PROJECT + return self._projectid + @property def client(self) -> pubsub_v1.PublisherClient: """Pub/Sub client for topic access. 
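A note on the lazy-resolution pattern this patch introduces: neither Topic nor Table needs a project ID up front; the first access of .projectid falls through to .auth, which builds an Auth from environment variables and back-fills the ID. A minimal sketch of the intended call pattern (the topic name, project ID, and environment-variable value below are illustrative assumptions, not values taken from the patch):

    import os

    import pittgoogle

    # Assumption: Auth() reads GOOGLE_CLOUD_PROJECT from the environment the
    # first time credentials or the project ID are needed.
    os.environ["GOOGLE_CLOUD_PROJECT"] = "my-project-id"  # hypothetical

    # No projectid is passed at construction time.
    topic = pittgoogle.Topic(name="my-topic")  # hypothetical topic name

    # Accessing .projectid triggers .auth, which instantiates Auth() and caches
    # the resulting project ID on the instance; .path then composes normally.
    print(topic.projectid)  # "my-project-id"
    print(topic.path)       # "projects/my-project-id/topics/my-topic"

Table gets the same projectid property, so under the same assumptions a call like pittgoogle.Table(name="alerts", dataset="ztf").id (hypothetical names) would resolve to "my-project-id.ztf.alerts" the same way.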
From ae38ec5abe2359560c7d1f070da75ed852a1df5a Mon Sep 17 00:00:00 2001 From: troyraen Date: Sun, 14 Jan 2024 04:58:31 -0800 Subject: [PATCH 53/55] fix type hints --- pittgoogle/alert.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pittgoogle/alert.py b/pittgoogle/alert.py index 03dd017..b6c2908 100644 --- a/pittgoogle/alert.py +++ b/pittgoogle/alert.py @@ -26,7 +26,7 @@ import logging from datetime import datetime from pathlib import Path -from typing import TYPE_CHECKING, Any, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Union import fastavro from attrs import define, field @@ -77,10 +77,10 @@ class Alert: Union["google.cloud.pubsub_v1.types.PubsubMessage", types_.PubsubMessageLike] ] = field(default=None) """Incoming Pub/Sub message object.""" - _attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = field( + _attributes: Optional[Union[Dict, "google._upb._message.ScalarMapContainer"]] = field( default=None ) - _dict: Optional[dict] = field(default=None) + _dict: Optional[Dict] = field(default=None) _dataframe: Optional["pd.DataFrame"] = field(default=None) schema_name: Optional[str] = field(default=None) _schema: Optional[types_.Schema] = field(default=None, init=False) @@ -88,7 +88,7 @@ class Alert: # ---- class methods ---- # @classmethod - def from_cloud_run(cls, envelope: dict, schema_name: Optional[str] = None) -> "Alert": + def from_cloud_run(cls, envelope: Dict, schema_name: Optional[str] = None) -> "Alert": """Create an `Alert` from an HTTP request envelope containing a Pub/Sub message, as received by a Cloud Run module. Example code for a Cloud Run module that uses this method to open a ZTF alert: @@ -148,8 +148,8 @@ def index(): @classmethod def from_dict( cls, - payload: dict, - attributes: Optional[Union[dict, "google._upb._message.ScalarMapContainer"]] = None, + payload: Dict, + attributes: Optional[Union[Dict, "google._upb._message.ScalarMapContainer"]] = None, schema_name: Optional[str] = None, ) -> "Alert": # [TODO] update tom_desc to use this """Create an `Alert` from a dictionary (`payload`).""" @@ -173,7 +173,7 @@ def from_path(cls, path: Union[str, Path], schema_name: Optional[str] = None) -> # ---- properties ---- # @property - def attributes(self) -> dict: + def attributes(self) -> Dict: """Custom metadata for the message. Pub/Sub handles this as a dict-like called "attributes". If this was not set when the `Alert` was instantiated, a new dictionary will be created using @@ -187,7 +187,7 @@ def attributes(self) -> dict: return self._attributes @property - def dict(self) -> dict: + def dict(self) -> Dict: """Alert data as a dictionary. Created from `self.msg.data`, if needed. Raises From 5533dbe38fdceeeef533a4c141d8d578b0d6920d Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Thu, 6 Jun 2024 12:21:35 -0700 Subject: [PATCH 54/55] update changelog --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 37b3dd0..078c48e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,10 +12,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Added +- `Alert` and `Table` classes. +- Registry for alert schemas and GCP Project IDs. +- Alert schemas (Avro) and schema maps (yaml). +- Exceptions: `BadRequest` and `SchemaNotFoundError`. +- Types: `PubsubMessageLike` and `Schema`. - ZTF Figures Tutorial ### Changed +- Update PubSub classes. 
- update README.md to point to the new docs - remove setup and requirements files that are no longer needed after switching away from Read The Docs @@ -23,6 +29,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) - `figures` module (content moved to tutorial). This allowed the removal of the following explicit dependencies: `aplpy`, `matplotlib`, `numpy`. +- v0.1 BigQuery functions. ## \[0.2.0\] - 2023-07-02 From 36bf5290ef7ff020412201250f466ad164b0440d Mon Sep 17 00:00:00 2001 From: Troy Raen Date: Thu, 6 Jun 2024 12:27:33 -0700 Subject: [PATCH 55/55] fix .md formatting --- CHANGELOG.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 078c48e..37c78cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,40 +12,40 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) ### Added -- `Alert` and `Table` classes. -- Registry for alert schemas and GCP Project IDs. -- Alert schemas (Avro) and schema maps (yaml). -- Exceptions: `BadRequest` and `SchemaNotFoundError`. -- Types: `PubsubMessageLike` and `Schema`. -- ZTF Figures Tutorial +- `Alert` and `Table` classes. +- Registry for alert schemas and GCP Project IDs. +- Alert schemas (Avro) and schema maps (yaml). +- Exceptions: `BadRequest` and `SchemaNotFoundError`. +- Types: `PubsubMessageLike` and `Schema`. +- ZTF Figures Tutorial ### Changed -- Update PubSub classes. -- update README.md to point to the new docs -- remove setup and requirements files that are no longer needed after switching away from Read The Docs +- Update PubSub classes. +- update README.md to point to the new docs +- remove setup and requirements files that are no longer needed after switching away from Read The Docs ### Removed -- `figures` module (content moved to tutorial). This allowed the removal of the following explicit +- `figures` module (content moved to tutorial). This allowed the removal of the following explicit dependencies: `aplpy`, `matplotlib`, `numpy`. -- v0.1 BigQuery functions. +- v0.1 BigQuery functions. ## \[0.2.0\] - 2023-07-02 ### Added -- `auth` module supporting authentication via a service account or oauth2 -- `exceptions` module with class `OpenAlertError` -- "Overview" section in docs -- classes in `utils` module: `ProjectIds`, `Cast` -- files: `CHANGELOG.md`, `pittgoogle_env.yml` +- `auth` module supporting authentication via a service account or oauth2 +- `exceptions` module with class `OpenAlertError` +- "Overview" section in docs +- classes in `utils` module: `ProjectIds`, `Cast` +- files: `CHANGELOG.md`, `pittgoogle_env.yml` ### Changed -- Overhaul the `pubsub` module. Add classes `Topic`, `Subscription`, `Consumer`, `Alert`, +- Overhaul the `pubsub` module. Add classes `Topic`, `Subscription`, `Consumer`, `Alert`, `Response`. ### Fixed -- cleanup some issues flagged by Codacy +- cleanup some issues flagged by Codacy
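With the yaml manifest in place and the patch-50 path fix applied, the schema registry reworked earlier in this series is expected to behave roughly as sketched below. The printed name list comes from registry_manifests/schemas.yml; the rest is illustrative.

    import fastavro

    import pittgoogle

    # names() reads straight from registry_manifests/schemas.yml.
    print(pittgoogle.Schemas.names())
    # ['elasticc.v0_9_1.alert', 'elasticc.v0_9_1.brokerClassification', 'ztf']

    # get() returns a types_.Schema; an unknown name raises SchemaNotFoundError.
    schema = pittgoogle.Schemas.get("elasticc.v0_9_1.alert")

    # After the path bugfix, schema.path points inside the installed package,
    # so the registered Avro schema loads directly. The "ztf" entry registers
    # no file (path: null), so its path attribute is None.
    avro_schema = fastavro.schema.load_schema(schema.path)

Keeping the manifest in yaml means a new survey schema can be registered by editing schemas.yml alone, without touching the Python in registry.py.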