-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreprocessing.py
73 lines (62 loc) · 2 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import zipfile
import os
import csv
import json
# --- Unzip step: locations used when unpacking the dataset archive ---
# Archive to read from.
zip_file_path = "/content/LLaVA/open.zip"
# Directory the archive contents are extracted into.
extracted_folder = "/content/dacon-multimodal-vqa"
def extract_zip(zip_file, extract_to):
    """Extract every member of the zip archive into a directory.

    Args:
        zip_file: Path (or file-like object) of the zip archive to read.
        extract_to: Directory that receives the extracted members.

    Returns:
        None. Any exception raised by ``zipfile`` propagates to the caller.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(zip_file, mode='r') as archive:
        archive.extractall(path=extract_to)
# Unpack the dataset archive. Any failure is reported on stdout and the
# script keeps running (best-effort), exactly one message is printed.
try:
    extract_zip(zip_file_path, extracted_folder)
    success_message = f"압축 파일을 성공적으로 해제하였습니다. 경로: {extracted_folder}"
    print(success_message)
except Exception as error:
    failure_message = f"압축 파일 해제 중 오류가 발생하였습니다: {error}"
    print(failure_message)
# ----------------------------------------------------------------
# Build 'output.json' from train.csv.
# Each data row is unpacked as (id, image_id, question, answer) and turned
# into a record holding the image path plus a two-turn conversation: the
# "human" turn is "<image>\n" + question, the "gpt" turn is the answer.
# (Presumably the LLaVA fine-tuning format -- confirm against the trainer.)
#
# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields that contain newlines. The explicit encoding
# removes the dependency on the platform default
# (NOTE(review): assumes the CSV is UTF-8 -- confirm).
with open(
    '/content/dacon-multimodal-vqa/train.csv',
    'r',
    newline='',
    encoding='utf-8',
) as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; an empty file yields no rows
    # csv.reader returns [] for blank lines; drop them so the tuple
    # unpacking below does not fail on a stray empty line.
    data = [row for row in reader if row]

# A row with the wrong number of columns still raises ValueError here,
# which surfaces malformed input instead of silently skipping it.
json_data = [
    {
        "id": sample_id,
        "image": "/content/dacon-multimodal-vqa/image/train/" + image_id + ".jpg",
        "conversations": [
            {
                "from": "human",
                "value": "<image>\n" + question,
            },
            {
                "from": "gpt",
                "value": answer,
            },
        ],
    }
    for sample_id, image_id, question, answer in data
]

# Written relative to the current working directory.
with open('output.json', 'w', encoding='utf-8') as f:
    json.dump(json_data, f, indent=4)
# ----------------------------------------------------------------
# Build 'test.jsonl' from test.csv (JSON Lines: one JSON object per line).
# Each data row is unpacked as (id, image_id, question) and becomes a record
# with the image path and the question under the "text" key.
#
# newline='' lets the csv module do its own line-ending handling, which is
# required for quoted fields that contain newlines. The explicit encoding
# removes the dependency on the platform default
# (NOTE(review): assumes the CSV is UTF-8 -- confirm).
with open(
    '/content/dacon-multimodal-vqa/test.csv',
    'r',
    newline='',
    encoding='utf-8',
) as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; an empty file yields no rows
    # csv.reader returns [] for blank lines; drop them so the tuple
    # unpacking below does not fail on a stray empty line.
    data = [row for row in reader if row]

# A row with the wrong number of columns still raises ValueError here,
# which surfaces malformed input instead of silently skipping it.
json_data = [
    {
        "id": sample_id,
        "image": "/content/dacon-multimodal-vqa/image/test/" + image_id + ".jpg",
        "text": question,
    }
    for sample_id, image_id, question in data
]

# Destination of the JSON Lines file.
jsonl_output_file = "/content/dacon-multimodal-vqa/test.jsonl"

# Serialize each record on its own line, terminated by "\n".
with open(jsonl_output_file, "w", encoding="utf-8") as file:
    for obj in json_data:
        json.dump(obj, file)
        file.write("\n")