forked from JavClaude/Sentiment-Analysis
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Javclaude
committed
May 30, 2020
1 parent
70916f3
commit a022919
Showing
1 changed file
with
45 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,55 @@ | ||
import os | ||
import random | ||
from typing import List | ||
|
||
import pandas as pd | ||
|
||
## Modelling | ||
from transformers import AutoTokenizer | ||
from torch.utils.data import ( | ||
TensorDataset, | ||
RandomSampler, | ||
SequentialSampler, | ||
DataLoader | ||
) | ||
|
||
## Metrics / Utils | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.metrics import f1_score | ||
from sklearn.preprocessing import LabelEncoder | ||
|
||
def get_length(df: pd.DataFrame, col_index: int) -> int:
    '''
    Heuristic maximum sequence length for training a NN.

    Computes mean + one standard deviation over the whitespace-token
    length distribution of the texts in column ``col_index``.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame holding the texts.
    col_index : int
        Positional index of the text column.

    Returns
    -------
    int
        Rounded ``mean + std`` of the per-text word counts.
    '''
    # Work on a local copy of the column: the original assigned the
    # str-converted column back into `df`, mutating the caller's frame
    # as a hidden side effect.
    texts = df[df.columns[col_index]].astype(str)

    # Whitespace-token count per text (same as len(x.split())).
    sequences_length = texts.apply(lambda x: len(x.split()))

    # pandas .std() uses ddof=1 (sample std), matching the original.
    return int(round(sequences_length.mean() + sequences_length.std()))
|
||
def get_labels(df: pd.DataFrame, labels_col):
    '''
    Encode the labels of ``df[labels_col]`` as integer classes.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame holding the labels.
    labels_col : str
        Name of the label column.

    Returns
    -------
    np.ndarray
        Integer-encoded labels (one int per row).
    '''
    encoder = LabelEncoder()

    # fit_transform is the idiomatic single-pass equivalent of
    # fit() followed by transform() on the same data.
    return encoder.fit_transform(df[labels_col])
|
||
def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-uncased", max_seq_length: int = 512):
    """
    Encode a column of texts with a pretrained HuggingFace tokenizer.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame holding the texts.
    texts_col : str
        Name of the text column.
    tokenizer : str
        HuggingFace model name used to load the tokenizer.
    max_seq_length : int
        Sequences are padded/truncated to this length.

    Returns
    -------
    list
        Token-id sequences (``input_ids``), one list per text.
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)

    texts = list(df[texts_col].astype(str))

    # NOTE(review): `pad_to_max_length` and `return_attention_masks`
    # are deprecated in newer transformers releases (replaced by
    # padding="max_length" and return_attention_mask) — confirm the
    # pinned transformers version before changing.
    encoded_sequence = pretrained_tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        pad_to_max_length=True,
        max_length=max_seq_length,
        return_attention_masks=False,
        return_token_type_ids=False,
    )['input_ids']
    return encoded_sequence