diff --git a/.github/workflows/pr-reminder.yml b/.github/workflows/pr-reminder.yml
new file mode 100644
index 0000000..f7c7724
--- /dev/null
+++ b/.github/workflows/pr-reminder.yml
@@ -0,0 +1,24 @@
+name: PRs reviews reminder
+
+on:
+  schedule:
+    - cron: "0 1 * * *" #KST: 10:00
+    - cron: "0 3 * * *" #KST: 12:00
+    - cron: "0 4 * * *" #KST: 13:00
+    - cron: "0 7 * * *" #KST: 16:00
+    - cron: "0 9 * * *" #KST: 18:00
+
+jobs:
+  pr-reviews-reminder:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: davideviolante/pr-reviews-reminder-action@v2.1.4
+      env:
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      with:
+        # Never commit the webhook URL: store it as the SLACK_WEBHOOK_URL repository secret (and revoke any URL that was pushed).
+        webhook-url: ${{ secrets.SLACK_WEBHOOK_URL }} # Required
+        provider: 'slack' # Required (slack or msteams)
+        channel: '#ecl-free-talking' # Optional, eg: #general
+        github-provider-map: 'wbin0718:U041WE3RDMX,FacerAin:U041WE4P8GZ,ghlrobin:U041HN2FGMR,kyc3492:U041388FBM5,jinmyeongAN:U041HR962M8' # Optional, eg: DavideViolante:UEABCDEFG,foobar:UAABCDEFG
+        ignore-label: '' # Optional, eg: no-reminder
diff --git a/Makefile b/Makefile
index 1b4e747..db11625 100644
--- a/Makefile
+++ b/Makefile
@@ -3,6 +3,7 @@ quality: set-style-dep check-quality
 style: set-style-dep set-style
 setup: set-precommit set-style-dep set-test-dep set-git set-dev set-dataset set-directory
 test: set-test-dep set-test
+dashboard: set-dashboard
 
 ##### basic #####
 
@@ -58,4 +59,8 @@ set-directory:
 	mkdir -p ./src/prediction
 	mkdir -p ./src/logs
 	mkdir -p ./src/best_model
-	mkdir -p ./src/results
\ No newline at end of file
+	mkdir -p ./src/results
+
+
+set-dashboard:
+	streamlit run dashboard/app.py
diff --git a/dashboard/app.py b/dashboard/app.py
new file mode 100644
index 0000000..4715c79
--- /dev/null
+++ b/dashboard/app.py
@@ -0,0 +1,57 @@
+import argparse
+import pickle as pickle
+
+import pandas as pd
+import streamlit as st
+
+# `streamlit run dashboard/app.py` executes this file as a top-level script, so a
+# relative import (`from .utils import ...`) fails with "no known parent package".
+# Streamlit puts the script's directory on sys.path, which makes `utils` importable.
+from utils import get_filtered_result, test
+
+
+def app(args):
+    """Render the error-analysis dashboard for the validation set.
+
+    Args:
+        args: Parsed CLI namespace providing ``valid_data_path``, ``model_name``
+            and ``model_dir``.
+    """
+    test_df = pd.read_csv(args.valid_data_path)
+
+    st.set_page_config(page_icon="❄️", page_title="Into the RE", layout="wide")
+
+    st.title("Into the Re")
+
+    # Run inference, then keep only the rows the model got wrong.
+    result_df = test(args)
+    filtered_df = get_filtered_result(result_df, test_df)
+
+    st.dataframe(filtered_df)
+    st.text(f"전체 {len(test_df)} 중 {len(filtered_df)}개를 틀렸습니다.")
+    st.text("실제 정답 분포")
+    st.bar_chart(filtered_df["answer"].value_counts())
+    st.text("예측 라벨 분포")
+    st.bar_chart(filtered_df["pred_label"].value_counts())
+
+
+def parse_args():
+    """Parse the dashboard's command-line options."""
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--model_name", default="klue/bert-base", type=str)
+    parser.add_argument(
+        "--model_dir",
+        default="src/best_model",
+        type=str,
+    )
+    parser.add_argument(
+        "--valid_data_path",
+        default="dataset/train/dev.csv",
+        type=str,
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    app(parse_args())
diff --git a/dashboard/dict_label_to_num.pkl b/dashboard/dict_label_to_num.pkl
new file mode 100644
index 0000000..e452189
Binary files /dev/null and b/dashboard/dict_label_to_num.pkl differ
diff --git a/dashboard/dict_num_to_label.pkl b/dashboard/dict_num_to_label.pkl
new file mode 100644
index 0000000..2c6d031
Binary files /dev/null and b/dashboard/dict_num_to_label.pkl differ
diff --git a/dashboard/utils.py b/dashboard/utils.py
new file mode 100644
index 0000000..eb50691
--- /dev/null
+++ b/dashboard/utils.py
@@ -0,0 +1,243 @@
+import ast
+import json
+import pickle as pickle
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn.functional as F
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+
+# Resolve the label map next to this module so that loading it does not depend on
+# the current working directory.
+DICT_NUM_TO_LABEL_PATH = str(Path(__file__).resolve().parent / "dict_num_to_label.pkl")
+# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file here.
+with open(DICT_NUM_TO_LABEL_PATH, "rb") as f:
+    dict_num_to_label = pickle.load(f)
+
+
+def inference(model, tokenized_sent, device):
+    """Run the model over a dataset in batches and collect its predictions.
+
+    Args:
+        model: Sequence-classification model, expected to already be on ``device``.
+        tokenized_sent: Dataset whose items provide ``input_ids``,
+            ``attention_mask`` and ``token_type_ids`` tensors.
+        device: Device the input tensors are moved to before the forward pass.
+
+    Returns:
+        tuple: ``(pred, prob)`` lists holding, per example, the arg-max class index
+        and the softmax probabilities over all classes.
+    """
+    dataloader = DataLoader(tokenized_sent, batch_size=16, shuffle=False)
+    model.eval()
+    output_pred = []
+    output_prob = []
+    for data in tqdm(dataloader):
+        with torch.no_grad():
+            outputs = model(
+                input_ids=data["input_ids"].to(device),
+                attention_mask=data["attention_mask"].to(device),
+                token_type_ids=data["token_type_ids"].to(device),
+            )
+        logits = outputs[0]
+        prob = F.softmax(logits, dim=-1).detach().cpu().numpy()
+        logits = logits.detach().cpu().numpy()
+        result = np.argmax(logits, axis=-1)
+
+        output_pred.append(result)
+        output_prob.append(prob)
+
+    return (
+        np.concatenate(output_pred).tolist(),
+        np.concatenate(output_prob, axis=0).tolist(),
+    )
+
+
+def tokenized_dataset(dataset, tokenizer):
+    """Tokenize each ``subject[SEP]object`` entity pair together with its sentence.
+
+    Args:
+        dataset: Table with ``subject_entity``, ``object_entity`` and ``sentence`` columns.
+        tokenizer: Tokenizer callable applied to the (entity pair, sentence) inputs.
+
+    Returns:
+        The tokenizer output as padded PyTorch tensors, truncated to 256 tokens.
+    """
+    concat_entity = [
+        e01 + "[SEP]" + e02 for e01, e02 in zip(dataset["subject_entity"], dataset["object_entity"])
+    ]
+    tokenized_sentences = tokenizer(
+        concat_entity,
+        list(dataset["sentence"]),
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=256,
+        add_special_tokens=True,
+    )
+    return tokenized_sentences
+
+
+def preprocessing_dataset(dataset):
+    """Convert the raw CSV frame into the DataFrame layout used for tokenizing.
+
+    Args:
+        dataset: Raw frame with ``id``, ``sentence``, ``subject_entity``,
+            ``object_entity`` and ``label`` columns.
+
+    Returns:
+        pd.DataFrame: Same columns, with each entity cell reduced to its first field's value.
+    """
+    subject_entity = []
+    object_entity = []
+    for i, j in zip(dataset["subject_entity"], dataset["object_entity"]):
+        # Strip the outer brackets, keep the first comma-separated field and take the
+        # text after its first colon (surrounding quotes/spaces are kept as-is).
+        # NOTE(review): presumably mirrors the training-time preprocessing - confirm
+        # before changing, since the tokenized input depends on this exact text.
+        i = i[1:-1].split(",")[0].split(":")[1]
+        j = j[1:-1].split(",")[0].split(":")[1]
+
+        subject_entity.append(i)
+        object_entity.append(j)
+    out_dataset = pd.DataFrame(
+        {
+            "id": dataset["id"],
+            "sentence": dataset["sentence"],
+            "subject_entity": subject_entity,
+            "object_entity": object_entity,
+            "label": dataset["label"],
+        }
+    )
+    return out_dataset
+
+
+def load_data(dataset_dir):
+    """Read the CSV at ``dataset_dir`` and return it preprocessed."""
+    pd_dataset = pd.read_csv(dataset_dir)
+    dataset = preprocessing_dataset(pd_dataset)
+
+    return dataset
+
+
+def load_test_dataset(dataset_dir, tokenizer):
+    """Load the evaluation CSV and tokenize it.
+
+    Returns:
+        tuple: ``(ids, tokenized inputs, placeholder labels)``.
+    """
+    test_dataset = load_data(dataset_dir)
+    # Placeholder labels: inference() never passes them to the model.
+    test_label = [100] * len(test_dataset)
+    # tokenizing dataset
+    tokenized_test = tokenized_dataset(test_dataset, tokenizer)
+    return test_dataset["id"], tokenized_test, test_label
+
+
+def num_to_label(label):
+    """Map numeric class indices back to their original string labels."""
+    return [dict_num_to_label[v] for v in label]
+
+
+class RE_Dataset(torch.utils.data.Dataset):
+    """Dataset pairing tokenized inputs with their labels."""
+
+    def __init__(self, pair_dataset, labels):
+        self.pair_dataset = pair_dataset
+        self.labels = labels
+
+    def __getitem__(self, idx):
+        item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
+        item["labels"] = torch.tensor(self.labels[idx])
+        return item
+
+    def __len__(self):
+        return len(self.labels)
+
+
+def get_topn_probs(probs, n=3):
+    """Return the ``n`` most probable labels with their probabilities.
+
+    Args:
+        probs: Per-class probabilities, indexed by class number.
+        n (int, optional): Number of entries to keep. Defaults to 3.
+
+    Returns:
+        list: ``(label, probability)`` tuples, most probable first.
+    """
+    top_n_idxs = np.argsort(probs)[::-1][:n]
+    return [(dict_num_to_label[idx], probs[idx]) for idx in top_n_idxs]
+
+
+def get_entity_word(row):
+    """Extract the ``word`` field from a serialized entity dict.
+
+    Args:
+        row (str): Entity serialized as a Python dict literal or as JSON.
+
+    Returns:
+        The value stored under ``"word"``.
+    """
+    try:
+        # A Python-literal parse copes with quotes inside the word, which the
+        # quote-swapping JSON fallback below cannot.
+        entity = ast.literal_eval(row)
+    except (ValueError, TypeError, SyntaxError):
+        entity = json.loads(row.replace("'", '"'))
+    return entity["word"]
+
+
+def get_filtered_result(new_df, test_df):
+    """Join predictions with the gold data and keep only the misclassified rows.
+
+    Args:
+        new_df (pd.DataFrame): Predictions with ``pred_label`` and ``probs`` columns.
+        test_df (pd.DataFrame): Raw data with ``sentence``, ``label``,
+            ``subject_entity`` and ``object_entity`` columns.
+
+    Returns:
+        pd.DataFrame: Rows where ``pred_label`` differs from ``answer``.
+    """
+    # Work on a copy so the caller's prediction frame is not modified.
+    new_df = new_df.copy()
+    new_df["sentence"] = test_df["sentence"]
+    new_df["answer"] = test_df["label"]
+    new_df["subject"] = test_df["subject_entity"].apply(get_entity_word)
+    new_df["object"] = test_df["object_entity"].apply(get_entity_word)
+    new_df["probs"] = new_df["probs"].apply(get_topn_probs)
+    new_df = new_df.loc[new_df["pred_label"] != new_df["answer"]]
+    new_df = new_df[["sentence", "subject", "object", "pred_label", "answer", "probs"]]
+    return new_df
+
+
+def test(args):
+    """Run inference on the validation data with the model stored in ``args.model_dir``.
+
+    Args:
+        args: Namespace providing ``model_name``, ``model_dir`` and ``valid_data_path``.
+
+    Returns:
+        pd.DataFrame: ``pred_label`` and ``probs`` columns, one row per example.
+    """
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+
+    model = AutoModelForSequenceClassification.from_pretrained(args.model_dir)
+    model.to(device)
+
+    _, test_dataset, test_label = load_test_dataset(args.valid_data_path, tokenizer)
+    re_test_dataset = RE_Dataset(test_dataset, test_label)
+
+    pred_answer, output_prob = inference(model, re_test_dataset, device)
+    pred_answer = num_to_label(pred_answer)
+    output = pd.DataFrame(
+        {
+            "pred_label": pred_answer,
+            "probs": output_prob,
+        }
+    )
+    return output
diff --git a/requirements.txt b/requirements.txt
index ad2ec85..b9aa8e3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ scikit-learn~=0.24.1
 transformers==4.10.0
 pytorch-lightning==1.7.7
 pyyaml==6.0
-mlflow==2.0.1
\ No newline at end of file
+mlflow==2.0.1
+streamlit==1.14.1