-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocess.py
134 lines (97 loc) · 4.32 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
from re import compile
from collections import OrderedDict, namedtuple
# Everything that is neither a keyword character (ASCII letters, "_" and "-")
# nor a weighting bracket: digits, ":", ".", whitespace, ...
_NON_PROMPT_CHARS_REGEX = compile(r"[^a-zA-Z()_\-\[\]{}]*")
# The weighting brackets themselves: (), [] and {}.
_WEIGHT_BRACKETS_REGEX = compile(r"[()\[\]{}]*")
# Everything except the weighting brackets.
_NON_BRACKET_CHARS_REGEX = compile(r"[^()\[\]{}]*")


def remove_exact_keywords(line: str) -> list[str]:
    """Drop comma-separated prompts whose bare keyword was already seen.

    ``line`` is split on ``","`` and every piece is stripped. The *keyword*
    of a prompt is what remains after removing everything except ASCII
    letters, ``_`` and ``-`` (so weights, digits, whitespace and brackets
    are ignored: ``((masterpiece:1.2))`` and ``masterpiece`` share the
    keyword ``masterpiece``). The first prompt carrying a keyword is kept in
    its original spelling; later prompts with the same keyword are dropped.

    When a dropped duplicate carries unbalanced parentheses and its bracket
    skeleton contains at least one ``()`` pair, the leftover brackets (after
    the balanced ``()`` pairs are peeled off) are kept as an entry of their
    own. Duplicates without such a pair are dropped entirely.

    Args:
        line: Raw, comma-separated prompt text.

    Returns:
        A list (in order of first appearance) of the surviving prompts and
        leftover bracket fragments. Prompts with no letters or brackets at
        all (including empty ones) are omitted.
    """
    # Strip each piece and drop verbatim repeats; dict keys keep the order
    # of first appearance.
    prompts = dict.fromkeys(piece.strip() for piece in line.split(","))

    kept_prompts: dict[str, None] = {}  # ordered: prompts in original form
    seen_keywords: set[str] = set()  # unordered: bare keywords met so far

    for prompt in prompts:
        # from -> ((masterpiece:1.2)) | to -> ((masterpiece))
        keyword = _NON_PROMPT_CHARS_REGEX.sub("", prompt)
        if not keyword:
            # Nothing but digits/punctuation/whitespace: not a prompt.
            continue

        # from -> ((masterpiece)) | to -> masterpiece
        keyword = _WEIGHT_BRACKETS_REGEX.sub("", keyword)

        if keyword not in seen_keywords:
            seen_keywords.add(keyword)
            kept_prompts[prompt] = None
            continue

        # Duplicate keyword: the prompt itself is dropped, only its
        # unbalanced parentheses may survive.
        # from -> ((masterpiece:1.2)) | to -> (())
        brackets = _NON_BRACKET_CHARS_REGEX.sub("", prompt)
        if "()" not in brackets:
            continue

        opening_count = brackets.count("(")
        closing_count = brackets.count(")")
        if opening_count == closing_count:
            # Parentheses are balanced, nothing has to be carried over.
            continue

        # Peel off the balanced pairs, innermost first, so that only the
        # dangling parentheses remain; one pass per possible nesting level.
        for _ in range(min(opening_count, closing_count)):
            brackets = brackets.replace("()", "")
        kept_prompts[brackets] = None

    # Return a real list, as the annotation promises (a keys view cannot be
    # indexed or sliced).
    return list(kept_prompts)
def fix_commas(string: str) -> str:
    """Collapse whitespace runs and repeated comma separators.

    Every run of whitespace becomes a single space first; afterwards two or
    more consecutive "comma + whitespace" pairs are merged into one ``", "``.

    Args:
        string: Text to normalize.

    Returns:
        The normalized text.
    """
    single_spaced = compile(r"\s+").sub(" ", string)
    # from -> "a, , , b" | to -> "a, b"
    return compile(r"(,\s){2,}").sub(", ", single_spaced)
def fix_artifacts(string: str) -> str:
    """Remove commas stranded directly inside brackets.

    A comma (with optional whitespace before it) right after an opening
    ``(``, ``[`` or ``{`` is dropped, as is a comma (with optional whitespace
    after it) right before a closing ``)``, ``]`` or ``}``.

    Args:
        string: Text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs, applied one after another in this order:
    # opening brackets first, then closing brackets.
    substitutions = (
        (r"\(\s*,", "("),
        (r"\[\s*,", "["),
        (r"{\s*,", "{"),
        (r",\s*\)", ")"),
        (r",\s*\]", "]"),
        (r",\s*}", "}"),
    )

    cleaned = string
    for pattern, replacement in substitutions:
        cleaned = compile(pattern).sub(replacement, cleaned)
    return cleaned
def preprocess(line: str) -> str:
    """Normalize a raw prompt line into a clean, comma-separated string.

    Steps, in order:

    1. Non-breaking spaces become plain spaces, then any remaining non-ASCII
       character is replaced by its XML numeric character reference.
    2. Newlines become ``", "`` and commas/whitespace are normalized with
       ``fix_commas``.
    3. Stand-alone scalar weights (a comma followed by ``:<number>``) are
       removed.
    4. Prompts repeating an earlier keyword are removed with
       ``remove_exact_keywords`` and the survivors are re-joined with
       ``", "``.
    5. Commas are normalized again and commas stranded inside brackets are
       removed with ``fix_artifacts``.
    6. Entries made only of brackets between two commas are removed.
    7. Whitespace next to non-word characters (brackets, commas) or at the
       ends of the string is removed and every comma is followed by exactly
       one space.

    Args:
        line: Raw prompt text, possibly spanning several lines.

    Returns:
        The cleaned, ASCII-only prompt string.
    """
    remove_scalarweights_regex = compile(r",\s*:[0-9]*\.?[0-9]+")
    remove_emptyprompts_regex = compile(r",\s+[()\[\]{}]+\s*,")
    remove_danglingparantheses_regex = compile(r"\B\s+|\s+\B")

    # The non-breaking space has to be replaced BEFORE the ASCII conversion:
    # afterwards it only exists as the text "&#160;" and a replace on
    # "\xa0" could never match.
    temp_line = line.replace("\xa0", " ")
    temp_line = temp_line.encode("ascii", "xmlcharrefreplace").decode("ascii")
    temp_line = temp_line.replace("\n", ", ")
    temp_line = fix_commas(temp_line)
    temp_line = remove_scalarweights_regex.sub(
        "", temp_line
    )  # from -> , :0.6 | to -> *empty string*
    temp_line = ", ".join(remove_exact_keywords(temp_line))
    temp_line = fix_commas(temp_line)
    temp_line = fix_artifacts(temp_line)
    temp_line = remove_emptyprompts_regex.sub(
        ",", temp_line
    )  # from -> , (((, | to -> ,
    # The regex also eats the space after each comma, so the separator is
    # restored with the final replace.
    temp_line = remove_danglingparantheses_regex.sub("", temp_line).replace(
        ",", ", "
    )  # from -> (( ((prompt)) | to -> ((prompt))
    return temp_line