Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/dashboard add dashboard to analyze result #14

Merged
merged 6 commits into from
Nov 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions .github/workflows/pr-reminder.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
name: PRs reviews reminder

on:
  schedule:
    # Cron times are UTC; trailing comments give the KST (UTC+9) equivalent.
    - cron: "0 1 * * *" #KST: 10:00
    - cron: "0 3 * * *" #KST: 12:00
    - cron: "0 4 * * *" #KST: 13:00
    - cron: "0 7 * * *" #KST: 16:00
    - cron: "0 9 * * *" #KST: 18:00

jobs:
  pr-reviews-reminder:
    runs-on: ubuntu-latest
    steps:
      - uses: davideviolante/[email protected]
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        with:
          # SECURITY: the Slack webhook URL was previously committed in plain
          # text here; anyone with read access could post to the channel.
          # Store it as a repository secret named SLACK_WEBHOOK_URL and
          # rotate (revoke) the exposed webhook in Slack.
          webhook-url: ${{ secrets.SLACK_WEBHOOK_URL }} # Required
          provider: 'slack' # Required (slack or msteams)
          channel: '#ecl-free-talking' # Optional, eg: #general
          github-provider-map: 'wbin0718:U041WE3RDMX,FacerAin:U041WE4P8GZ,ghlrobin:U041HN2FGMR,kyc3492:U041388FBM5,jinmyeongAN:U041HR962M8' # Optional, eg: DavideViolante:UEABCDEFG,foobar:UAABCDEFG
          ignore-label: '' # Optional, eg: no-reminder
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ quality: set-style-dep check-quality
style: set-style-dep set-style
setup: set-precommit set-style-dep set-test-dep set-git set-dev set-dataset set-directory
test: set-test-dep set-test
dashboard: set-dashboard


##### basic #####
Expand Down Expand Up @@ -58,4 +59,8 @@ set-directory:
mkdir -p ./src/prediction
mkdir -p ./src/logs
mkdir -p ./src/best_model
mkdir -p ./src/results
mkdir -p ./src/results


# Launch the Streamlit error-analysis dashboard (entry point: dashboard/app.py).
set-dashboard:
	streamlit run dashboard/app.py
46 changes: 46 additions & 0 deletions dashboard/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import argparse
import pickle as pickle

import pandas as pd
import streamlit as st

from .utils import get_filtered_result, test


def app(args):
    """Render the Streamlit dashboard showing the model's misclassified examples."""
    valid_df = pd.read_csv(args.valid_data_path)

    st.set_page_config(page_icon="❄️", page_title="Into the RE", layout="wide")
    st.title("Into the Re")

    # Run inference, then keep only the rows the model got wrong.
    predictions_df = test(args)
    wrong_df = get_filtered_result(predictions_df, valid_df)

    st.dataframe(wrong_df)
    st.text(f"전체 {len(valid_df)} 중 {len(wrong_df)}개를 틀렸습니다.")
    st.text("실제 정답 분포")
    st.bar_chart(wrong_df["answer"].value_counts())
    st.text("예측 라벨 분포")
    st.bar_chart(wrong_df["pred_label"].value_counts())


if __name__ == "__main__":
    # Guard keeps importing this module side-effect free; `streamlit run`
    # executes the script with __name__ == "__main__", so the app still starts.
    parser = argparse.ArgumentParser()

    # HF model/tokenizer name used to tokenize the validation data.
    parser.add_argument("--model_name", default="klue/bert-base", type=str)
    # Directory holding the fine-tuned checkpoint to analyze.
    parser.add_argument(
        "--model_dir",
        default="src/best_model",
        type=str,
    )
    # Validation csv whose predictions are displayed in the dashboard.
    parser.add_argument(
        "--valid_data_path",
        default="dataset/train/dev.csv",
        type=str,
    )

    args = parser.parse_args()

    app(args)
Binary file added dashboard/dict_label_to_num.pkl
Binary file not shown.
Binary file added dashboard/dict_num_to_label.pkl
Binary file not shown.
208 changes: 208 additions & 0 deletions dashboard/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
import ast
import json
import pickle as pickle

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Pickled mapping from integer class ids back to the original string labels.
# NOTE(review): path is relative to the process CWD — this assumes the app is
# launched from the repository root (e.g. via `make dashboard`); confirm.
DICT_NUM_TO_LABEL_PATH = "dashboard/dict_num_to_label.pkl"
with open(DICT_NUM_TO_LABEL_PATH, "rb") as f:
    dict_num_to_label = pickle.load(f)


def inference(model, tokenized_sent, device):
    """Run batched prediction of *model* over *tokenized_sent*.

    Args:
        model: sequence-classification model; called with input_ids /
            attention_mask / token_type_ids keyword tensors and expected to
            return logits as the first element of its output.
        tokenized_sent: torch Dataset yielding dicts with those three keys.
        device: torch device the input tensors are moved to.

    Returns:
        tuple: (predicted class ids as a flat list,
                per-class softmax probabilities as a list of lists).
    """
    dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
    model.eval()
    output_pred = []
    output_prob = []
    # Fix: the loop index from enumerate() was never used — iterate directly.
    for data in tqdm(dataloader):
        # no_grad: inference only, skip autograd bookkeeping.
        with torch.no_grad():
            outputs = model(
                input_ids=data["input_ids"].to(device),
                attention_mask=data["attention_mask"].to(device),
                token_type_ids=data["token_type_ids"].to(device),
            )
        logits = outputs[0]
        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
        logits = logits.detach().cpu().numpy()
        result = np.argmax(logits, axis=-1)

        output_pred.append(result)
        output_prob.append(prob)

    return (
        np.concatenate(output_pred).tolist(),
        np.concatenate(output_prob, axis=0).tolist(),
    )


def tokenized_dataset(dataset, tokenizer):
    """Tokenize each example as the pair "<subject>[SEP]<object>" vs. its sentence."""
    concat_entity = [
        subj + "[SEP]" + obj
        for subj, obj in zip(dataset["subject_entity"], dataset["object_entity"])
    ]
    return tokenizer(
        concat_entity,
        list(dataset["sentence"]),
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256,
        add_special_tokens=True,
    )


def preprocessing_dataset(dataset):
    """Reshape the raw csv DataFrame into id/sentence/entities/label columns."""

    def _first_field(cell):
        # Strip the outer braces, then keep the text after the first ':' of
        # the first ','-separated chunk. NOTE(review): this is fragile if the
        # entity word itself contains ',' or ':' — same behavior as before.
        return cell[1:-1].split(",")[0].split(":")[1]

    subject_entity = [_first_field(s) for s in dataset["subject_entity"]]
    object_entity = [_first_field(o) for o in dataset["object_entity"]]
    return pd.DataFrame(
        {
            "id": dataset["id"],
            "sentence": dataset["sentence"],
            "subject_entity": subject_entity,
            "object_entity": object_entity,
            "label": dataset["label"],
        }
    )


def load_data(dataset_dir):
    """Load the csv at *dataset_dir* and return it preprocessed."""
    raw_df = pd.read_csv(dataset_dir)
    return preprocessing_dataset(raw_df)


def load_test_dataset(dataset_dir, tokenizer):
    """Load the evaluation csv and tokenize it.

    Returns (ids, tokenized features, dummy labels); the labels are a
    placeholder value of 100 per example since true labels are unused here.
    """
    test_dataset = load_data(dataset_dir)
    test_label = [100] * len(test_dataset)
    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
    return test_dataset["id"], tokenized_test, test_label


def num_to_label(label):
    """Map integer class ids back to their original string labels."""
    return [dict_num_to_label[class_id] for class_id in label]


class RE_Dataset(torch.utils.data.Dataset):
    """Wraps tokenizer output tensors and labels as a torch Dataset."""

    def __init__(self, pair_dataset, labels):
        # pair_dataset: mapping of feature name -> tensor, indexable per example
        self.pair_dataset = pair_dataset
        self.labels = labels

    def __getitem__(self, idx):
        sample = {
            name: tensor[idx].clone().detach()
            for name, tensor in self.pair_dataset.items()
        }
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

    def __len__(self):
        return len(self.labels)


def get_topn_probs(probs, n=3):
    """Return the top-*n* (label, probability) pairs, highest probability first.

    Args:
        probs: per-class probabilities, indexable by integer class id.
        n (int, optional): number of top entries to return. Defaults to 3.

    Returns:
        list[tuple]: (string label, probability), sorted descending by
        probability.
    """
    # argsort is ascending; reverse to get the highest-probability ids first.
    top_n_idxs = np.argsort(probs)[::-1][:n]
    return [(dict_num_to_label[idx], probs[idx]) for idx in top_n_idxs]


def get_entity_word(row):
    """Extract the 'word' field from an entity column cell.

    The cell holds the repr of a Python dict, e.g.
    "{'word': '비틀즈', 'start_idx': 24, ...}". It is parsed with
    ast.literal_eval, which — unlike the previous replace("'", '"') +
    json.loads approach — also handles entity words that contain an
    apostrophe (e.g. "McDonald's"), where the naive quote swap produced
    invalid JSON.

    Args:
        row (str): entity column cell (Python-dict repr).

    Returns:
        str: the entity surface form ('word' value).
    """
    return ast.literal_eval(row)["word"]


def get_filtered_result(new_df, test_df):
    """Join predictions with ground truth and keep only misclassified rows.

    Args:
        new_df (pd.DataFrame): model output with 'pred_label' and 'probs'
            columns, row-aligned with test_df.
        test_df (pd.DataFrame): ground-truth data with 'sentence', 'label',
            'subject_entity' and 'object_entity' columns.

    Returns:
        pd.DataFrame: the wrong predictions, with columns
        [sentence, subject, object, pred_label, answer, probs].
    """
    # Fix: work on a copy so the caller's DataFrame is not mutated in place
    # (the original added columns to new_df as a side effect).
    out = new_df.copy()
    out["sentence"] = test_df["sentence"]
    out["answer"] = test_df["label"]
    out["subject"] = test_df["subject_entity"].apply(get_entity_word)
    out["object"] = test_df["object_entity"].apply(get_entity_word)
    out["probs"] = out["probs"].apply(get_topn_probs)
    out = out.loc[out["pred_label"] != out["answer"]]
    return out[["sentence", "subject", "object", "pred_label", "answer", "probs"]]


def test(args):
    """Run the fine-tuned model in *args.model_dir* over the validation set.

    Args:
        args: namespace with model_name (HF tokenizer name), model_dir
            (fine-tuned checkpoint directory) and valid_data_path (csv to
            evaluate).

    Returns:
        pd.DataFrame: one row per example with columns 'pred_label'
        (string label) and 'probs' (per-class probabilities).
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    model = AutoModelForSequenceClassification.from_pretrained(args.model_dir)
    model.to(device)

    # The example ids are not needed by the dashboard — discard them.
    _, test_dataset, test_label = load_test_dataset(args.valid_data_path, tokenizer)
    re_test_dataset = RE_Dataset(test_dataset, test_label)

    pred_answer, output_prob = inference(model, re_test_dataset, device)
    pred_answer = num_to_label(pred_answer)
    return pd.DataFrame(
        {
            "pred_label": pred_answer,
            "probs": output_prob,
        }
    )
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ scikit-learn~=0.24.1
transformers==4.10.0
pytorch-lightning==1.7.7
pyyaml==6.0
mlflow==2.0.1
mlflow==2.0.1
streamlit==1.14.1