update_db.py
from tortoise import Tortoise, run_async
from config import SQL_CONN_URL
from models import Job, Leaderboard
import numpy as np
import json
import gc
# JSON -> SQL CONVERTER SCRIPT -----
# You need a fresh copy of `open.json`, saved as `original.json`, in the jobs folder (don't delete the existing open.json file).
# You need your SQL database set up and configured in config.py.
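# Input files read from the jobs folder: shard_info.json, original.json,
# open.json, closed.json, open_gpu.json and leaderboard.json.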
def _calculate_shard_number(job):
    # Each chunk of 1,000,000 ids is split into two shards; the shard number
    # is twice the chunk position, minus one for the first shard of the pair.
    count = (np.int64(job["end_id"]) / 1000000) * 2
    if job["shard"] == 0:
        count -= 1
    return int(count)
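# Illustrative example (hypothetical values): end_id=2000000 with shard=1
# gives (2000000 / 1000000) * 2 = 4; with shard=0 it gives 4 - 1 = 3.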
async def init():
    # 0. Connect to DB
    print("Connecting to DB using url from config.py...")
    await Tortoise.init(
        db_url=SQL_CONN_URL,
        modules={'models': ['models']}
    )
    await Tortoise.generate_schemas()
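    # Note: generate_schemas() creates any missing tables for the registered
    # models; by default it is safe and won't touch tables that already exist.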
    # 1. Jobs
    print("Processing jobs... (this may take a while)")
    with open("jobs/shard_info.json", "r") as f:
        directory = json.load(f)["directory"]
    with open("jobs/original.json", "r") as f:
        db = json.load(f)
    with open("jobs/open.json", "r") as f:
        opened = [_calculate_shard_number(i) for i in json.load(f)]
    with open("jobs/closed.json", "r") as f:
        closed = [int(i) for i in json.load(f)]
    with open("jobs/open_gpu.json", "r") as f:
        gpu_data = json.load(f)
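    # original.json holds the full, ordered job list; open.json and closed.json
    # index into it by 1-based shard number, and open_gpu.json pairs a shard
    # number with its GPU job data.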
    jobs = []
    # Open CPU jobs.
    for i in opened:
        data = db[i-1]
        job = Job(
            number=i,
            url=directory + data["url"],
            start_id=data["start_id"],
            end_id=data["end_id"],
            shard_of_chunk=data["shard"],
            gpu=False,
            gpu_url=None,
            pending=False,
            closed=False,
            completor=None,
            cpu_completor=None
        )
        jobs.append(job)
    # Closed jobs (the completor is not recorded in the JSON files, so "N/A").
    for i in closed:
        data = db[i-1]
        job = Job(
            number=i,
            url=directory + data["url"],
            start_id=data["start_id"],
            end_id=data["end_id"],
            shard_of_chunk=data["shard"],
            gpu=False,
            gpu_url=None,
            pending=False,
            closed=True,
            completor="N/A",
            cpu_completor=None
        )
        jobs.append(job)
    # Open GPU jobs; each entry is a [number, job_dict] pair.
    for data in gpu_data:
        number = int(data[0])
        job = Job(
            number=number,
            url=directory + db[number-1]["url"],
            start_id=data[1]["start_id"],
            end_id=data[1]["end_id"],
            shard_of_chunk=data[1]["shard"],
            gpu=True,
            gpu_url=data[1]["url"],
            pending=False,
            closed=False,
            completor=None,
            cpu_completor="N/A"
        )
        jobs.append(job)
    # Dedupe: keep the first occurrence of each job number, so open jobs take
    # precedence over closed/GPU entries that share a number.
    seen = set()
    new_jobs = []
    for job in jobs:
        if job.number not in seen:
            new_jobs.append(job)
            seen.add(job.number)
    jobs = new_jobs
    jobs = sorted(jobs, key=lambda x: x.number)  # Sort by job number
print("Bulk creating jobs in database... (this may take a while)")
await Job.bulk_create(jobs)
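    # If a single INSERT is too large for your database, Tortoise's bulk_create
    # also accepts a batch_size argument, e.g. Job.bulk_create(jobs, batch_size=10000).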
    # Free the parsed JSON and job objects before the next step.
    del db, opened, closed, gpu_data, jobs
    gc.collect()
    # 2. Leaderboard
    print("Processing leaderboard...")
    with open("jobs/leaderboard.json", "r") as f:
        lb = json.load(f)
    # leaderboard.json maps nickname -> [jobs_completed, pairs_scraped].
    leaderboard = []
    for user in lb:
        userboard = Leaderboard(
            nickname=user,
            jobs_completed=lb[user][0],
            pairs_scraped=lb[user][1]
        )
        leaderboard.append(userboard)
print("Bulk creating leaderboard in database... (this may take a while)")
await Leaderboard.bulk_create(leaderboard)
del lb, leaderboard
gc.collect()
# We don't need to do Client as they are volatile
# We don't need to do CPU_Leaderboard as it did not exist before v3.0.0
print("Done.")
if __name__ == "__main__":
    run_async(init())
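# Run from the repository root so the relative jobs/ paths resolve:
#   python update_db.py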