
Commit

Merge branch 'f_baseline' of https://github.com/JavClaude/Sentiment-Analysis into f_baseline
HessTaha committed May 30, 2020
2 parents 93dddf4 + 6a12d43 commit be2e1d3
Showing 1 changed file with 3 additions and 3 deletions.
6 changes: 3 additions & 3 deletions src/utils.py
@@ -36,14 +36,14 @@ def get_labels(df: pd.DataFrame, labels_col):

    return LB.transform(df[labels_col])

-def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-uncased", max_seq_length: int = 512):
+def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-uncased", max_seq_length: int = 512, return_vocab_size: bool = True):
""""
    Encode list of texts using pretrained tokenizer from huggingface
    return np.array of encoded sequence
    """
    pretrained_tokenizer = AutoTokenizer.from_pretrained(tokenizer, use_fast=True)
    print(pretrained_tokenizer)

    texts = list(df[texts_col].astype(str))

    encoded_sequence = pretrained_tokenizer.batch_encode_plus(texts,
@@ -52,4 +52,4 @@ def encode_texts(df: pd.DataFrame, texts_col: str, tokenizer: str = "bert-base-u
                                                              max_length=max_seq_length,
                                                              return_attention_masks=False,
                                                              return_token_type_ids=False)['input_ids']
-    return encoded_sequence
+    return encoded_sequence, pretrained_tokenizer.vocab_size
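
With this change, encode_texts returns both the encoded input ids and the tokenizer's vocabulary size. A minimal usage sketch, assuming src/utils.py is importable and pandas/transformers are installed; the DataFrame, its "text" column, and the sample sentences below are hypothetical:

import pandas as pd
from src.utils import encode_texts

# Hypothetical sample data; the column name "text" is an assumption.
df = pd.DataFrame({"text": ["great movie", "terrible plot"]})

# The updated function returns the encoded ids plus the tokenizer vocab size,
# which can later be used, for example, to size an embedding layer.
input_ids, vocab_size = encode_texts(df, texts_col="text", max_seq_length=128)
print(len(input_ids), vocab_size)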
