From dc03395bd6ac35ae277fae0c6718bd2c8560ec1f Mon Sep 17 00:00:00 2001 From: Michael Kranzlein <8162250+mkranzlein@users.noreply.github.com> Date: Wed, 27 Sep 2023 12:07:12 -0400 Subject: [PATCH] Planning chunk+sentence behavior --- src/hipool/models.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/hipool/models.py b/src/hipool/models.py index d33ee2c..0597ea1 100644 --- a/src/hipool/models.py +++ b/src/hipool/models.py @@ -70,6 +70,16 @@ def forward(self, ids: list[Integer[Tensor, "_ d"]], token_type_ids: A list of varied-length tensors token_type_ids. All 0s. """ + + # Get hipool embedding + + # Forward pass happens on one or more documents + # One document is the minimum because hipool needs all of the document's chunks + # Pipeline: send each document through BERT sentence by sentence + + # Chunking approaches: equal number of sentences, equal number of tokens, + # unequal number of sentences that approximates an equal number of tokens + # Pad such that each sequence has the same number of chunks # Padding chunks c-dim vectors, where all the input ids are 0, which is