Skip to content

Commit

Permalink
Initial dataset reader tests
Browse files Browse the repository at this point in the history
  • Loading branch information
mkranzlein committed Aug 21, 2023
1 parent 0107d39 commit 6dec7c9
Show file tree
Hide file tree
Showing 2 changed files with 33 additions and 0 deletions.
1 change: 1 addition & 0 deletions fixtures/data/curiam_sample.json

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions tests/curiam_reader_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import pytest
import torch
from transformers import BertTokenizer

from hipool import curiam_reader


@pytest.fixture
def curiam_sample():
    """Build the small CuriamDataset that the tests in this module share.

    The dataset is read from the sample JSON fixture and tokenized with the
    pretrained ``bert-base-uncased`` tokenizer, using a chunk length of 50
    and an overlap of 10.

    NOTE(review): the JSON path is relative, so it is presumably resolved
    against the current working directory (i.e. the tests are expected to be
    launched from the repository root) -- confirm against CuriamDataset.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    return curiam_reader.CuriamDataset(
        json_file_path="fixtures/data/curiam_sample.json",
        tokenizer=tokenizer,
        chunk_len=50,
        overlap_len=10,
    )


def test_read_json_len(curiam_sample):
    """Check that the sample file yields two documents and two label entries."""
    expected_count = 2
    assert len(curiam_sample.documents) == expected_count
    assert len(curiam_sample.labels) == expected_count


def test_read_json_tokens(curiam_sample):
    """Check the first two tokens of the first document in the sample file."""
    opening_document = curiam_sample.documents[0]
    assert opening_document[0] == "Justice"
    assert opening_document[1] == "GORSUCH"


def test_getitem_labels(curiam_sample):
    """Check that the first token of the first chunk carries an all-zero label vector.

    The first dataset item's ``"targets"`` entry is indexed by chunk and then
    by token; the resulting tensor must equal a 1-D tensor of nine zeros.

    NOTE(review): 9 is presumably the number of label classes -- confirm
    against CuriamDataset.
    """
    first_item = curiam_sample[0]
    token_labels = first_item["targets"][0][0]
    expected_labels = torch.zeros(9)
    assert torch.equal(token_labels, expected_labels)

0 comments on commit 6dec7c9

Please sign in to comment.