Merge pull request #14 from boostcampaitech4lv23nlp2/feat/dashboard
Feat/dashboard: add a dashboard to analyze results
FacerAin authored Nov 18, 2022
2 parents f05c323 + 3a05d83 commit b2f8887
Showing 7 changed files with 285 additions and 2 deletions.
23 changes: 23 additions & 0 deletions .github/workflows/pr-reminder.yml
@@ -0,0 +1,23 @@
name: PRs reviews reminder

on:
  schedule:
    - cron: "0 1 * * *" # KST: 10:00
    - cron: "0 3 * * *" # KST: 12:00
    - cron: "0 4 * * *" # KST: 13:00
    - cron: "0 7 * * *" # KST: 16:00
    - cron: "0 9 * * *" # KST: 18:00

jobs:
  pr-reviews-reminder:
    runs-on: ubuntu-latest
    steps:
      - uses: davideviolante/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          webhook-url: 'https://hooks.slack.com/services/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' # Required
          provider: 'slack' # Required (slack or msteams)
          channel: '#ecl-free-talking' # Optional, eg: #general
          github-provider-map: 'wbin0718:U041WE3RDMX,FacerAin:U041WE4P8GZ,ghlrobin:U041HN2FGMR,kyc3492:U041388FBM5,jinmyeongAN:U041HR962M8' # Optional, eg: DavideViolante:UEABCDEFG,foobar:UAABCDEFG
          ignore-label: '' # Optional, eg: no-reminder
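
Note: GitHub Actions evaluates cron schedules in UTC, which is why each entry's comment maps it to KST (UTC+9); "0 1 * * *" fires at 01:00 UTC, i.e. 10:00 KST.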
7 changes: 6 additions & 1 deletion Makefile
@@ -3,6 +3,7 @@ quality: set-style-dep check-quality
style: set-style-dep set-style
setup: set-precommit set-style-dep set-test-dep set-git set-dev set-dataset set-directory
test: set-test-dep set-test
dashboard: set-dashboard


##### basic #####
@@ -58,4 +59,8 @@ set-directory:
	mkdir -p ./src/prediction
	mkdir -p ./src/logs
	mkdir -p ./src/best_model
	mkdir -p ./src/results


set-dashboard:
	streamlit run dashboard/app.py
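
With this target, `make dashboard` launches the Streamlit app. Note that the paths the dashboard relies on (src/best_model, dataset/train/dev.csv, dashboard/dict_num_to_label.pkl) are all relative to the repository root, so the command should be run from there.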
46 changes: 46 additions & 0 deletions dashboard/app.py
@@ -0,0 +1,46 @@
import argparse

import pandas as pd
import streamlit as st

# Absolute import: app.py is executed as a script by `streamlit run`,
# so a relative import (`from .utils import ...`) would fail here.
from utils import get_filtered_result, test


def app(args):
    """Run the Streamlit app."""
    test_df = pd.read_csv(args.valid_data_path)

    st.set_page_config(page_icon="❄️", page_title="Into the RE", layout="wide")

    st.title("Into the RE")

    result_df = test(args)
    filtered_df = get_filtered_result(result_df, test_df)

    st.dataframe(filtered_df)
    st.text(f"Misclassified {len(filtered_df)} of {len(test_df)} examples.")
    st.text("Gold label distribution")
    st.bar_chart(filtered_df["answer"].value_counts())
    st.text("Predicted label distribution")
    st.bar_chart(filtered_df["pred_label"].value_counts())


parser = argparse.ArgumentParser()

parser.add_argument("--model_name", default="klue/bert-base", type=str)
parser.add_argument(
    "--model_dir",
    default="src/best_model",
    type=str,
)
parser.add_argument(
    "--valid_data_path",
    default="dataset/train/dev.csv",
    type=str,
)


args = parser.parse_args()

app(args)
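
Because app.py defines its own argparse flags, they can be forwarded through Streamlit by separating them with `--` (the values shown are just the parser defaults, included for illustration):

streamlit run dashboard/app.py -- --model_name klue/bert-base --model_dir src/best_model --valid_data_path dataset/train/dev.csv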
Binary file added dashboard/dict_label_to_num.pkl
Binary file not shown.
Binary file added dashboard/dict_num_to_label.pkl
Binary file not shown.
208 changes: 208 additions & 0 deletions dashboard/utils.py
@@ -0,0 +1,208 @@
import json
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

DICT_NUM_TO_LABEL_PATH = "dashboard/dict_num_to_label.pkl"
with open(DICT_NUM_TO_LABEL_PATH, "rb") as f:
    dict_num_to_label = pickle.load(f)


def inference(model, tokenized_sent, device):
    """Wraps the test dataset in a DataLoader and lets the model
    predict batch by batch (batch_size=16)."""
    dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
    model.eval()
    output_pred = []
    output_prob = []
    for i, data in enumerate(tqdm(dataloader)):
        with torch.no_grad():
            outputs = model(
                input_ids=data["input_ids"].to(device),
                attention_mask=data["attention_mask"].to(device),
                token_type_ids=data["token_type_ids"].to(device),
            )
        logits = outputs[0]
        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)

        output_pred.append(result)
        output_prob.append(prob)

    return (
        np.concatenate(output_pred).tolist(),
        np.concatenate(output_prob, axis=0).tolist(),
    )


def tokenized_dataset(dataset, tokenizer):
    """Tokenizes each sentence together with its entity pair."""
    concat_entity = []
    for e01, e02 in zip(dataset["subject_entity"], dataset["object_entity"]):
        concat_entity.append(e01 + "[SEP]" + e02)
    tokenized_sentences = tokenizer(
        concat_entity,
        list(dataset["sentence"]),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
    )
    return tokenized_sentences


def preprocessing_dataset(dataset):
    """Converts the raw csv into the desired DataFrame format."""
    subject_entity = []
    object_entity = []
    for i, j in zip(dataset["subject_entity"], dataset["object_entity"]):
        # Each entity column holds a serialized dict; take the value of
        # its first key ('word') by string splitting.
        i = i[1:-1].split(",")[0].split(":")[1]
        j = j[1:-1].split(",")[0].split(":")[1]

        subject_entity.append(i)
        object_entity.append(j)
    out_dataset = pd.DataFrame(
        {
            "id": dataset["id"],
            "sentence": dataset["sentence"],
            "subject_entity": subject_entity,
            "object_entity": object_entity,
            "label": dataset["label"],
        }
    )
    return out_dataset


def load_data(dataset_dir):
    """Loads the csv file from the given path and preprocesses it."""
    pd_dataset = pd.read_csv(dataset_dir)
    dataset = preprocessing_dataset(pd_dataset)

    return dataset


def load_test_dataset(dataset_dir, tokenizer):
    """Loads the test dataset and tokenizes it."""
    test_dataset = load_data(dataset_dir)
    # Dummy label (100): gold labels are not needed at inference time.
    test_label = [100 for i in range(len(test_dataset))]
    # tokenizing dataset
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return test_dataset["id"], tokenized_test, test_label


def num_to_label(label):
    """Converts numeric class ids back to the original string labels."""
    origin_label = []
    for v in label:
        origin_label.append(dict_num_to_label[v])

    return origin_label


class RE_Dataset(torch.utils.data.Dataset):
    """Dataset class for relation extraction pairs."""

    def __init__(self, pair_dataset, labels):
        self.pair_dataset = pair_dataset
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


def get_topn_probs(probs, n=3):
    """Returns the top-n (label, probability) pairs, highest probability first.

    Args:
        probs (list[float]): class probabilities for one example.
        n (int, optional): number of pairs to return. Defaults to 3.

    Returns:
        list[tuple[str, float]]: (label name, probability) pairs.
    """
    pairs = []
    top_n_idxs = list(reversed(np.array(probs).argsort()))[:n]
    for idx in top_n_idxs:
        pairs.append((dict_num_to_label[idx], probs[idx]))
    return pairs


def get_entity_word(row):
    """Extracts the 'word' field from a serialized entity dict.

    Args:
        row (str): entity column value, a dict serialized with single quotes.

    Returns:
        str: the entity word.
    """
    # The column uses single quotes; swap them so json.loads can parse it.
    row = row.replace("'", '"')
    return json.loads(row)["word"]


def get_filtered_result(new_df, test_df):
    """Joins predictions with the gold data and keeps only misclassified rows.

    Args:
        new_df (pd.DataFrame): model output with 'pred_label' and 'probs' columns.
        test_df (pd.DataFrame): gold validation data.

    Returns:
        pd.DataFrame: misclassified rows with sentence, entities, prediction,
        answer, and top-n probabilities.
    """
    new_df["sentence"] = test_df["sentence"]
    new_df["answer"] = test_df["label"]
    new_df["subject"] = test_df["subject_entity"].apply(get_entity_word)
    new_df["object"] = test_df["object_entity"].apply(get_entity_word)
    new_df["probs"] = new_df["probs"].apply(get_topn_probs)
    new_df = new_df.loc[new_df["pred_label"] != new_df["answer"]]
    new_df = new_df[["sentence", "subject", "object", "pred_label", "answer", "probs"]]
    return new_df


def test(args):
    """Runs inference with the model from args.model_dir on the validation data.

    Returns:
        pd.DataFrame: predicted labels and class probabilities.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    model = AutoModelForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    test_id, test_dataset, test_label = load_test_dataset(args.valid_data_path, tokenizer)
    Re_test_dataset = RE_Dataset(test_dataset, test_label)

    pred_answer, output_prob = inference(model, Re_test_dataset, device)
    pred_answer = num_to_label(pred_answer)
    output = pd.DataFrame(
        {
            "pred_label": pred_answer,
            "probs": output_prob,
        }
    )
    return output
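
For reference, a minimal sketch of the helper contracts (the entity string below is a hypothetical example of the serialized-dict format get_entity_word expects, not a row taken from the dataset):

# Hypothetical entity value: a dict serialized with single quotes,
# whose 'word' key holds the entity span text.
row = "{'word': 'Beatles', 'start_idx': 24, 'end_idx': 30, 'type': 'ORG'}"
print(get_entity_word(row))  # -> Beatles

# get_topn_probs maps a probability vector to (label, prob) pairs,
# highest probability first; label names come from dict_num_to_label.
print(get_topn_probs([0.1, 0.7, 0.2], n=2))
# -> [(dict_num_to_label[1], 0.7), (dict_num_to_label[2], 0.2)]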
3 changes: 2 additions & 1 deletion requirements.txt
@@ -3,4 +3,5 @@ scikit-learn~=0.24.1
transformers==4.10.0
pytorch-lightning==1.7.7
pyyaml==6.0
mlflow==2.0.1
streamlit==1.14.1
