diff --git a/README.md b/README.md
index ee4495c..98c1d6e 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ conda activate .venvs/epbd_bert_condavenv_test1
 python setup.py install
 conda install -c conda-forge scikit-learn scipy -y
+pip uninstall triton # We did not utilize triton due to its underlying hardware dependency
 
 # To deactivate and remove the venv
 conda deactivate
diff --git a/epbd_bert/dnabert2_epbd/train_lightning.py b/epbd_bert/dnabert2_epbd/train_lightning.py
index 9b14b1a..17339fc 100644
--- a/epbd_bert/dnabert2_epbd/train_lightning.py
+++ b/epbd_bert/dnabert2_epbd/train_lightning.py
@@ -24,14 +24,14 @@ tokenizer = get_dnabert2_tokenizer(max_num_tokens=512)
 data_collator = SeqLabelEPBDDataCollator(pad_token_id=tokenizer.pad_token_id)
 train_dataset = SequenceEPBDDataset(
-    data_path="data/train_val_test/peaks_with_labels_train.tsv.gz",
+    data_path="resources/train_val_test/peaks_with_labels_train.tsv.gz",
+    pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",  # ../data, resources
     tokenizer=tokenizer,
-    epbd_features_type=configs.epbd_features_type,
 )
 val_dataset = SequenceEPBDDataset(
-    data_path="data/train_val_test/peaks_with_labels_val.tsv.gz",
+    data_path="resources/train_val_test/peaks_with_labels_val.tsv.gz",
+    pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",  # ../data, resources
     tokenizer=tokenizer,
-    epbd_features_type=configs.epbd_features_type,
 )
 print(train_dataset.__len__(), val_dataset.__len__())
 train_dl = DataLoader(
diff --git a/epbd_bert/dnabert2_epbd_crossattn/train_lightning.py b/epbd_bert/dnabert2_epbd_crossattn/train_lightning.py
index f059e2c..35791f5 100644
--- a/epbd_bert/dnabert2_epbd_crossattn/train_lightning.py
+++ b/epbd_bert/dnabert2_epbd_crossattn/train_lightning.py
@@ -36,11 +36,13 @@ tokenizer = get_dnabert2_tokenizer(max_num_tokens=512)
 data_collator = SeqLabelEPBDDataCollator(tokenizer.pad_token_id)
 train_dataset = SequenceRandEPBDMultiModalDataset(
-    data_path="data/train_val_test/peaks_with_labels_train.tsv.gz",
+    data_path="resources/train_val_test/peaks_with_labels_train.tsv.gz",
+    pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",  # ../data, resources
     tokenizer=tokenizer,
 )
 val_dataset = SequenceRandEPBDMultiModalDataset(
-    data_path="data/train_val_test/peaks_with_labels_val.tsv.gz",
+    data_path="resources/train_val_test/peaks_with_labels_val.tsv.gz",
+    pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",  # ../data, resources
     tokenizer=tokenizer,
 )
 print(train_dataset.__len__(), val_dataset.__len__())
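After this patch, both training scripts read the peak labels and the pyDNA-EPBD coordinate-flip features from `resources/` instead of `data/`, and pass the feature directory explicitly via `pydnaepbd_features_path` rather than selecting it through `configs.epbd_features_type`. A minimal sketch of the resulting dataset setup is below; the import paths and `DataLoader` arguments are assumptions for illustration (they are not part of this diff), while the constructor calls mirror `epbd_bert/dnabert2_epbd/train_lightning.py` as patched.

```python
# Sketch of the post-patch dataset setup (dnabert2_epbd variant).
# NOTE: import locations and DataLoader settings are assumptions for
# illustration; only the dataset/collator constructor calls come from the diff.
from torch.utils.data import DataLoader

from epbd_bert.datasets import SequenceEPBDDataset, SeqLabelEPBDDataCollator  # assumed module path
from epbd_bert.utility import get_dnabert2_tokenizer  # assumed module path

tokenizer = get_dnabert2_tokenizer(max_num_tokens=512)
data_collator = SeqLabelEPBDDataCollator(pad_token_id=tokenizer.pad_token_id)

train_dataset = SequenceEPBDDataset(
    data_path="resources/train_val_test/peaks_with_labels_train.tsv.gz",
    pydnaepbd_features_path="resources/pydnaepbd_things/coord_flips/id_seqs/",
    tokenizer=tokenizer,
)

# Illustrative loader configuration (batch size and worker count are placeholders).
train_dl = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=4,
)
```

The cross-attention variant (`epbd_bert/dnabert2_epbd_crossattn/train_lightning.py`) follows the same pattern, using `SequenceRandEPBDMultiModalDataset` and passing `tokenizer.pad_token_id` positionally to `SeqLabelEPBDDataCollator`.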