Add utils function
Javclaude committed May 30, 2020
1 parent 70916f3 commit a022919
Showing 1 changed file with 45 additions and 8 deletions.
53 changes: 45 additions & 8 deletions src/utils.py
@@ -1,18 +1,55 @@
import os
import random
from typing import List

import pandas as pd

## Modelling
from transformers import AutoTokenizer
from torch.utils.data import (
    TensorDataset,
    RandomSampler,
    SequentialSampler,
    DataLoader
)

## Metrics / Utils
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder

def get_length(df: pd.DataFrame, col_index: int) -> int:
    '''
    Heuristic max sequence length for training a NN:
    mean plus one standard deviation of the whitespace-tokenized text lengths.
    '''
    df[df.columns[col_index]] = df[df.columns[col_index]].astype(str)

    sequences_length = df[df.columns[col_index]].apply(lambda x: len(x.split()))

    max_seq_length = int(round(sequences_length.mean() + sequences_length.std()))

    return max_seq_length

def get_labels(df: pd.DataFrame, labels_col: str):
    '''
    Encode the labels column of df
    return np.array of integer-encoded labels
    '''
    label_encoder = LabelEncoder()

    label_encoder.fit(df[labels_col])

    return label_encoder.transform(df[labels_col])

def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-uncased", max_seq_length: int = 512):
    """
    Encode a column of texts using a pretrained tokenizer from Hugging Face
    return list of padded input id sequences
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)
    print(pretrained_tokenizer)
    texts = list(df[texts_col].astype(str))

    encoded_sequence = pretrained_tokenizer.batch_encode_plus(texts,
                                                              add_special_tokens=True,
                                                              pad_to_max_length=True,
                                                              max_length=max_seq_length,
                                                              return_attention_masks=False,
                                                              return_token_type_ids=False)['input_ids']
    return encoded_sequence
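
A minimal usage sketch (not part of the commit) of how these three helpers could feed the DataLoader utilities imported above. The CSV path, the "text" and "label" column names, the batch size, and the `utils` import path are assumptions for illustration only.

import torch
import pandas as pd
from torch.utils.data import TensorDataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.model_selection import train_test_split

from utils import get_length, get_labels, encode_texts  # hypothetical import path

df = pd.read_csv("data.csv")  # hypothetical input file with "text" and "label" columns

max_seq_length = get_length(df, col_index=0)
labels = get_labels(df, labels_col="label")
input_ids = encode_texts(df, texts_col="text", max_seq_length=max_seq_length)

# Split encoded sequences and labels, then wrap them as tensors
X_train, X_val, y_train, y_val = train_test_split(input_ids, labels, test_size=0.1)

train_data = TensorDataset(torch.tensor(X_train), torch.tensor(y_train))
val_data = TensorDataset(torch.tensor(X_val), torch.tensor(y_val))

train_loader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=32)
val_loader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=32)

Random sampling for training and sequential sampling for validation mirrors the samplers the module already imports.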
