Skip to content

Commit

Permalink
Update Utils
Browse files Browse the repository at this point in the history
  • Loading branch information
Javclaude committed May 30, 2020
1 parent 738b008 commit 39ffe0c
Showing 1 changed file with 54 additions and 3 deletions.
57 changes: 54 additions & 3 deletions src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,39 @@
from typing import List

import pandas as pd
import numpy as np

## Modelling
from transformers import AutoTokenizer

import torch
from torch.utils.data import (
TensorDataset,
RandomSampler,
SequentialSampler,
DataLoader
)

## Metrics / Utils
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

def set_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``torch`` (and to every CUDA device when CUDA is available).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # `is_available` must be *called*: the bare function object is always
    # truthy, which made this guard a no-op.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

def get_length(df: pd.DataFrame, col_index: int):
def get_length(df: pd.DataFrame, texts_col: int):
'''
Max sequence length for training a NN
Based on heuristic (mean and std over the length distribution of texts)
'''
df[df.columns[col_index]] = df[df.columns[col_index]].astype(str)
df[texts_col] = df[texts_col].astype(str)

sequences_length = df[df.columns[col_index]].apply(lambda x: len(x.split()))
sequences_length = df[texts_col].apply(lambda x: len(x.split()))

max_seq_length = int(round(sequences_length.mean() + sequences_length.std()))

Expand Down Expand Up @@ -53,3 +70,37 @@ def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-u
return_attention_masks=False,
return_token_type_ids=False)['input_ids']
return encoded_sequence, pretrained_tokenizer.vocab_size

def create_TorchLoaders(X: List = None, y: np.ndarray = None, test_size: float = 0.10, batch_size: int = 32, batch_size_eval: int = 64):
    """Split ``(X, y)`` into train/test parts and wrap each in a DataLoader.

    The split is delegated to ``train_test_split`` without a fixed
    ``random_state``, so it is only reproducible if the global NumPy seed
    has been set beforehand.

    Args:
        X: Input sequences; converted to a ``torch.long`` tensor.
            Required in practice even though it defaults to ``None``.
        y: Targets; converted to a ``torch.float`` tensor.
            Required in practice even though it defaults to ``None``.
        test_size: Forwarded unchanged to ``train_test_split``
            (a float is a proportion of the data held out for testing).
        batch_size: Batch size of the training loader.
        batch_size_eval: Batch size of the test loader.

    Returns:
        A ``(train_loader, test_loader)`` tuple. The training loader draws
        samples in random order; the test loader iterates sequentially.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    train_dataset = TensorDataset(
        torch.tensor(X_train, dtype=torch.long),
        torch.tensor(y_train, dtype=torch.float),
    )
    test_dataset = TensorDataset(
        torch.tensor(X_test, dtype=torch.long),
        torch.tensor(y_test, dtype=torch.float),
    )

    # Shuffle only the training data; evaluation order stays deterministic.
    train_loader = DataLoader(
        dataset=train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        sampler=SequentialSampler(test_dataset),
        batch_size=batch_size_eval,
    )

    return train_loader, test_loader

0 comments on commit 39ffe0c

Please sign in to comment.