-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarization_preprocessing.py
79 lines (67 loc) · 2.28 KB
/
summarization_preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import json
def convert_examples_to_features(item):
    """Encode one example into source/target token-id lists.

    Args:
        item: A 5-tuple ``(example, example_index, tokenizer, args, stage)``.
            ``example`` provides ``source`` and, unless ``stage == 'test'``,
            ``target``. ``tokenizer`` must expose ``encode(...)`` and
            ``eos_token_id``. ``args`` provides ``max_source_length`` and
            ``max_target_length``.

    Returns:
        An ``InputFeatures`` holding ``example_index``, the encoded source
        ids, the encoded target ids (an empty list when ``stage == 'test'``)
        and the example's ``url`` (``None`` if the example has none).

    Raises:
        AssertionError: If an encoded sequence does not contain exactly one
            EOS token id.
    """
    example, example_index, tokenizer, args, stage = item

    # Presumably '</s>' is the tokenizer's EOS marker (confirm against the
    # tokenizer in use); a literal occurrence in the raw text is replaced so
    # the single-EOS assertion below can hold.
    source_ids = tokenizer.encode(
        example.source.replace('</s>', '<unk>'),
        max_length=args.max_source_length,
        padding='max_length',
        truncation=True,
    )
    assert source_ids.count(tokenizer.eos_token_id) == 1

    if stage == 'test':
        # The target is not read at all in the test stage.
        target_ids = []
    else:
        target_ids = tokenizer.encode(
            example.target.replace('</s>', '<unk>'),
            max_length=args.max_target_length,
            padding='max_length',
            truncation=True,
        )
        assert target_ids.count(tokenizer.eos_token_id) == 1

    return InputFeatures(
        example_index,
        source_ids,
        target_ids,
        # Not every example object carries a ``url`` attribute; fall back to
        # None instead of raising AttributeError.
        url=getattr(example, 'url', None),
    )
class InputFeatures:
    """Encoded features for one training/test example.

    The constructor stores its arguments unchanged as the attributes
    ``example_id``, ``source_ids``, ``target_ids`` and ``url``; ``url`` is
    optional and defaults to ``None``.
    """

    def __init__(self, example_id, source_ids, target_ids, url=None):
        # Paired assignments; attribute creation order is kept as
        # example_id, source_ids, target_ids, url.
        self.example_id, self.source_ids = example_id, source_ids
        self.target_ids, self.url = target_ids, url
class Example(object):
    """A single training/test example.

    Attributes:
        idx: Identifier of the example, stored as given.
        source: Source text, stored as given.
        target: Target text, stored as given.
        url: Optional origin URL of the example; ``None`` when not supplied.
    """

    def __init__(self,
                 idx,
                 source,
                 target,
                 url=None,
                 ):
        self.idx = idx
        self.source = source
        self.target = target
        # Feature conversion in this module forwards the example's ``url``,
        # so the attribute must always exist, even when no URL is known.
        self.url = url
def read_summarize_examples(filename, data_num):
    """Read summarization examples from a JSON-lines file.

    Every non-blank line must be a JSON object with the keys
    ``code_tokens`` and ``docstring_tokens``, each a list of strings.

    Args:
        filename: Path of the UTF-8 encoded JSON-lines file.
        data_num: Stop after this many examples have been collected. A value
            that is never reached (such as -1 or 0) reads the whole file.

    Returns:
        A list of ``Example`` objects. ``idx`` is the zero-based line number
        in the file, ``source`` is the space-joined code tokens and
        ``target`` the space-joined docstring tokens, both with whitespace
        runs collapsed to single spaces.
    """
    examples = []
    with open(filename, encoding="utf-8") as f:
        for idx, line in enumerate(f):
            line = line.strip()
            if not line:
                # Tolerate blank lines rather than failing in json.loads.
                continue
            js = json.loads(line)

            # split() with no argument treats newlines as whitespace, so
            # split + join collapses them together with every other
            # whitespace run.
            code = ' '.join(' '.join(js['code_tokens']).split())
            # For the docstring, newlines are deleted outright (not turned
            # into spaces) before whitespace is collapsed.
            nl = ' '.join(js['docstring_tokens']).replace('\n', '')
            nl = ' '.join(nl.split())

            examples.append(
                Example(
                    idx=idx,
                    source=code,
                    target=nl,
                )
            )
            # Count collected examples so a skipped blank line cannot make
            # the loop miss its stop condition. On files without blank lines
            # this equals the line-based check ``idx + 1 == data_num``.
            if len(examples) == data_num:
                break
    return examples