forked from boostcampaitech7/level2-nlp-datacentric-nlp-16
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclean.py
100 lines (82 loc) · 2.43 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import argparse
import os
import pandas as pd
import transformers
from utils.clean_text import calculate_ratio, denoise_text
from utils.util import set_seed
# Restrict Hugging Face transformers logging to ERROR level so that
# info/warning messages do not clutter the script output.
transformers.logging.set_verbosity_error()
def main(arg):
    """
    Clean the training dataset using the Korean character ratio and an LM.

    First, a ratio is computed for every text with ``calculate_ratio``.
    Texts whose Korean ratio lies in ``[KR_LB, KR_UB)`` form the cleanable
    group and are rewritten by ``denoise_text`` with the prompt template.
    Texts whose Korean ratio is below ``KR_LB`` are dropped. The result is
    written to ``data/train_cleaned.csv`` under the current working directory.

    Args:
        arg: parsed command-line namespace providing
            seed (int): random seed number
            model_id (str): huggingface model id
            kr_ub (float): Korean ratio upper bound (exclusive) for the
                cleanable noisy texts group
            kr_lb (float): Korean ratio lower bound (inclusive) for the
                cleanable noisy texts group
    """
    ## parameters
    SEED = arg.seed
    MODEL_ID = arg.model_id
    KR_UB = arg.kr_ub
    KR_LB = arg.kr_lb

    ## random seeding
    set_seed(SEED)

    ## data loading (paths are resolved relative to the working directory)
    BASE_DIR = os.getcwd()
    DATA_DIR = os.path.join(BASE_DIR, "data")
    data = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))

    ## text ratio
    res = data["text"].apply(calculate_ratio)
    # NOTE(review): presumably calculate_ratio returns a pd.Series per text,
    # making `res` a DataFrame whose column 1 is the Korean ratio -- confirm
    # against utils.clean_text.
    korean_ratio = res[1]
    cleanable_mask = (korean_ratio >= KR_LB) & (korean_ratio < KR_UB)
    not_cleanable_mask = korean_ratio < KR_LB

    ## denoise
    # Skip the LM entirely when nothing falls into the cleanable band, so
    # no model work is done and no empty assignment is attempted.
    if cleanable_mask.any():
        PROMPT_DIR = os.path.join(BASE_DIR, "prompts")
        # Explicit UTF-8 so reading the prompt does not depend on the
        # platform's default locale encoding.
        with open(
            os.path.join(PROMPT_DIR, "prompt_llama.txt"), "r", encoding="utf-8"
        ) as f:
            template = f.read()

        output_txts = denoise_text(
            texts=data.loc[cleanable_mask, "text"].tolist(),
            model_id=MODEL_ID,
            template=template,
        )
        data.loc[cleanable_mask, "text"] = output_txts

    ## remove not-cleanable text
    data = data[~not_cleanable_mask]
    data.to_csv(os.path.join(DATA_DIR, "train_cleaned.csv"), index=False)
if __name__ == "__main__":
    # Command-line options, one row per flag:
    # (short flag, long flag, default value, value type, help text).
    cli_options = (
        ("-s", "--seed", 456, int, "setting random seed (default: 456)"),
        ("-m", "--model_id", None, str, "hugging face model id (default: None)"),
        (
            "-ku",
            "--kr_ub",
            0.75,
            float,
            "upper bound for korean ratio in text (default: 0.75)",
        ),
        (
            "-kl",
            "--kr_lb",
            0.5,
            float,
            "lower bound for korean ratio in text (default: 0.5)",
        ),
    )

    parser = argparse.ArgumentParser()
    for short_flag, long_flag, default_value, value_type, help_text in cli_options:
        parser.add_argument(
            short_flag,
            long_flag,
            default=default_value,
            type=value_type,
            help=help_text,
        )

    # Parse the command line and run the cleaning pipeline.
    main(parser.parse_args())