This repository has been archived by the owner on Oct 29, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathworker.py
339 lines (284 loc) · 20.9 KB
/
worker.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# Let's remind people who still have this running to shut it down
from os.path import isfile
from json import loads
from os import environ
import requests
from sys import exit
# Work out which name to report: the TRACKER_USERNAME environment variable
# wins, then the "TRACKER_USERNAME" key of config.json, else "Unnamed".
TRACKER_USERNAME = "Unnamed"
if "TRACKER_USERNAME" in environ:
    TRACKER_USERNAME = environ["TRACKER_USERNAME"]
elif isfile("config.json"):
    try:
        with open("config.json") as config_file:
            TRACKER_USERNAME = loads(config_file.read())["TRACKER_USERNAME"]
    except (OSError, ValueError, KeyError, TypeError):
        # Unreadable file, invalid JSON, or no usable TRACKER_USERNAME entry:
        # keep the fallback name instead of failing.
        TRACKER_USERNAME = "Unnamed"

print("=============================")
print("This project is now complete, and we are working on sorting and finalizing the data. Thank you to everyone who contributed!")
print("=============================")
print()
print("Just a heads up, we will send your TRACKER_USERNAME to the script admins just so we can remind you to shut down your worker if you've forgotten.")
# NOTE(review): the webhook URL below embeds its token in source control;
# anyone with the file can post to it.
try:
    requests.post(
        "https://discord.com/api/webhooks/771212810877141032/dj9WCWZ2oE5t_vzdyc_OEdTaGbAP92bJFe8CEfYXlRXKJfPewOHWYAgBrLwx596k0CJC",
        json={"content": str(TRACKER_USERNAME)+" just tried to start a worker."},
        timeout=30,  # never hang forever on a dead connection
    )
except requests.RequestException as error:
    # The notification is a courtesy; a network failure must not prevent the
    # clean exit below.
    print("Could not notify the script admins:", error)
# The project is finished: stop here so nothing below this point runs.
exit(0)
from threading import Thread
import requests
from time import sleep
from os import mkdir, rmdir, listdir, system, environ
from os.path import isdir, isfile, getsize
from json import loads
from youtube_channel import process_channel
import signal
import tracker
from youtube_dl import YoutubeDL
from shutil import rmtree, which
from queue import Queue
from gc import collect
from discovery import getmetadata
from export import subprrun
# Useful Queue example: https://stackoverflow.com/a/54658363
jobs = Queue()

# Working directories used by threadrunner: per-video output is created under
# "out/" and the zip archives that get uploaded are built under "directory/".
for workdir in ("out", "directory"):
    try:
        mkdir(workdir)
    except FileExistsError:
        # Left over from a previous run; nothing to do.
        pass
    except OSError as error:
        # Still best-effort, but no longer silent: later steps will fail if
        # the directory is really unusable, and this says why.
        print("Could not create directory", workdir, "-", error)

# Flag set when a Procfile exists one level above the working directory; it is
# used later to pick the thread count.
HEROKU = isfile("../Procfile")
# Language codes processed for every video with captions enabled: threadrunner
# queues one job per code for each of its three subtitle task types.
langs = ['ab', 'aa', 'af', 'sq', 'ase', 'am', 'ar', 'arc', 'hy', 'as', 'ay', 'az', 'bn', 'ba', 'eu', 'be', 'bh', 'bi', 'bs', 'br',
'bg', 'yue', 'yue-HK', 'ca', 'chr', 'zh-CN', 'zh-HK', 'zh-Hans', 'zh-SG', 'zh-TW', 'zh-Hant', 'cho', 'co', 'hr', 'cs', 'da', 'nl',
'nl-BE', 'nl-NL', 'dz', 'en', 'en-CA', 'en-IN', 'en-IE', 'en-GB', 'en-US', 'eo', 'et', 'fo', 'fj', 'fil', 'fi', 'fr', 'fr-BE',
'fr-CA', 'fr-FR', 'fr-CH', 'ff', 'gl', 'ka', 'de', 'de-AT', 'de-DE', 'de-CH', 'el', 'kl', 'gn', 'gu', 'ht', 'hak', 'hak-TW', 'ha',
'iw', 'hi', 'hi-Latn', 'ho', 'hu', 'is', 'ig', 'id', 'ia', 'ie', 'iu', 'ik', 'ga', 'it', 'ja', 'jv', 'kn', 'ks', 'kk', 'km', 'rw',
'tlh', 'ko', 'ku', 'ky', 'lo', 'la', 'lv', 'ln', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mni', 'mi', 'mr', 'mas', 'nan',
'nan-TW', 'lus', 'mo', 'mn', 'my', 'na', 'nv', 'ne', 'no', 'oc', 'or', 'om', 'ps', 'fa', 'fa-AF', 'fa-IR', 'pl', 'pt', 'pt-BR',
'pt-PT', 'pa', 'qu', 'ro', 'rm', 'rn', 'ru', 'ru-Latn', 'sm', 'sg', 'sa', 'sc', 'gd', 'sr', 'sr-Cyrl', 'sr-Latn', 'sh', 'sdp', 'sn',
'scn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'es-419', 'es-MX', 'es-ES', 'es-US', 'su', 'sw', 'ss', 'sv', 'tl', 'tg', 'ta',
'tt', 'te', 'th', 'bo', 'ti', 'tpi', 'to', 'ts', 'tn', 'tr', 'tk', 'tw', 'uk', 'ur', 'uz', 'vi', 'vo', 'vor', 'cy', 'fy', 'wo',
'xh', 'yi', 'yo', 'zu']
# zip, rsync and curl are shelled out to when results are packaged and
# uploaded, so refuse to start without them. These start-up checks raise
# SystemExit explicitly instead of using `assert`, which is removed when
# Python runs with -O and would silently skip the validation.
if not (which("zip") and which("rsync") and which("curl")):
    raise SystemExit("Please ensure the zip, rsync, and curl commands are installed on your system.")

#HSID, SSID, SID cookies required
_COOKIE_HELP = "HSID, SSID, and SID cookies from youtube.com are required. Specify in config.json or as environment variables."
if "HSID" in environ and "SSID" in environ and "SID" in environ:
    cookies = {"HSID": environ["HSID"], "SSID": environ["SSID"], "SID": environ["SID"]}
elif isfile("config.json"):
    with open("config.json") as config_file:
        cookies = loads(config_file.read())
else:
    print(_COOKIE_HELP)
    raise SystemExit(1)
# A config.json that is not a JSON object, or that lacks (or leaves empty) one
# of the three cookies, gets the same help message rather than a KeyError.
if not (isinstance(cookies, dict) and cookies.get("HSID") and cookies.get("SSID") and cookies.get("SID")):
    print(_COOKIE_HELP)
    raise SystemExit(1)

# Session shared by the worker threads; every request carries the account
# cookies and asks for English pages.
mysession = requests.session()
mysession.headers.update({"cookie": "HSID="+cookies["HSID"]+"; SSID="+cookies["SSID"]+"; SID="+cookies["SID"], "Accept-Language": "en-US",})

# Fetch one caption-editor page to validate the cookies: being redirected to
# accounts.google.com means they were not accepted.
validationtest = mysession.get("https://www.youtube.com/timedtext_editor?action_mde_edit_form=1&v=1iNTtHUwvq4&lang=en&bl=vmp&ui=hd&ref=player&tab=captions&o=U")
if "accounts.google.com" in validationtest.url:
    raise SystemExit("Please ensure you have correctly specified account cookies.")
# The page must contain the footer language picker reading "English".
# NOTE(review): this is an exact substring match, so the whitespace inside the
# literal matters - confirm it against the markup actually served.
if """<button class="yt-uix-button yt-uix-button-size-default yt-uix-button-default yt-uix-button-has-icon" type="button" onclick=";return false;" id="yt-picker-language-button" data-button-action="yt.www.picker.load" data-button-menu-id="arrow-display" data-picker-key="language" data-picker-position="footer" data-button-toggle="true"><span class="yt-uix-button-icon-wrapper"><span class="yt-uix-button-icon yt-uix-button-icon-footer-language yt-sprite"></span></span><span class="yt-uix-button-content"> <span class="yt-picker-button-label">
Language:
</span>
English
</span><span class="yt-uix-button-arrow yt-sprite"></span></button>""" not in validationtest.text:
    raise SystemExit("Please make sure your YouTube and Google account language is set to English (United States)")
del validationtest

# Write the same cookies to cookies.txt, which YoutubeDL is pointed at through
# its "cookiefile" option. Netscape-format cookie files are tab-separated
# (domain, subdomain flag, path, secure flag, expiry, name, value), so the
# tabs are spelled out explicitly; building each row directly also keeps one
# cookie's value from being substituted into another's placeholder.
with open("cookies.txt", "w") as cookie_file:
    cookie_file.write("\n".join((
        "# HTTP Cookie File",
        "\t".join((".youtube.com", "TRUE", "/", "FALSE", "1663793455", "SID", cookies["SID"])),
        "\t".join((".youtube.com", "TRUE", "/", "FALSE", "1663793455", "HSID", cookies["HSID"])),
        "\t".join((".youtube.com", "TRUE", "/", "TRUE", "1663793455", "SSID", cookies["SSID"])),
    )))
# The raw cookie values are not needed past this point.
del cookies
validationtimes = 0
# Graceful shutdown
class GracefulKiller:
    """Turn SIGINT/SIGTERM into a flag that the worker loops can poll.

    Creating an instance installs ``exit_gracefully`` as the handler for both
    signals; once either arrives, ``kill_now`` reads True on that instance.
    """

    # Class-level default; exit_gracefully overrides it on the instance.
    kill_now = False

    def __init__(self):
        handler = self.exit_gracefully
        for sig in (signal.SIGINT, signal.SIGTERM):
            signal.signal(sig, handler)

    def exit_gracefully(self, signum, frame):
        """Signal handler: announce the shutdown and raise the flag."""
        print("Graceful exit process initiated, no longer accepting new tasks but finishing existing ones...")
        self.kill_now = True


# Single shared instance consulted by the worker threads.
gkiller = GracefulKiller()
#microtasks

# Queue task name -> mode string handed to subprrun.
_SUBTITLE_MODES = {
    "subtitles": "default",
    "subtitles-forceedit-captions": "forceedit-captions",
    "subtitles-forceedit-metadata": "forceedit-metadata",
}


def _queue_discoveries(jobs, info):
    """Queue a "submitdiscovery" task for every ID that getmetadata returned.

    info[1] through info[4] are submitted as Video, Channel, MixPlaylist and
    Playlist items respectively, in that order.
    """
    groups = (
        (info[1], tracker.ItemType.Video),
        (info[2], tracker.ItemType.Channel),
        (info[3], tracker.ItemType.MixPlaylist),
        (info[4], tracker.ItemType.Playlist),
    )
    for found_ids, item_type in groups:
        for found_id in found_ids:
            jobs.put(("submitdiscovery", found_id, item_type))


def _package_and_upload(video_id):
    """Zip out/<video_id>, upload the archive, and return its size in bytes.

    The archive is written to directory/<video_id>/<video_id>.zip and sent to
    a target obtained from the tracker: with rsync when the target starts with
    "rsync", with curl when it starts with "http". Zipping and uploading are
    retried until they succeed.

    Raises:
        ValueError: the tracker returned a target that is neither rsync nor
            http. Previously this surfaced as an unbound (or stale)
            ``exitinfo``.
    """
    # Local import so this block does not depend on the file's import header.
    from shlex import quote

    archive_dir = "directory/"+video_id
    archive_path = archive_dir+"/"+video_id+".zip"

    if not isdir(archive_dir):
        mkdir(archive_dir)
    while not isfile(archive_path):
        print("Attempting to zip item...")
        # IDs and targets come from the tracker, so they are shell-quoted
        # before being interpolated into a command line.
        system("zip -9 -r -j "+quote(archive_path)+" "+quote("out/"+video_id))

    #get a target
    targetloc = tracker.request_upload_target()
    while not targetloc:
        print("Waiting 5 minutes...")
        sleep(300)
        targetloc = tracker.request_upload_target()

    while True:
        if targetloc.startswith("rsync"):
            exitinfo = system("rsync -rltv --timeout=300 --contimeout=300 --progress --bwlimit 0 --recursive --partial --partial-dir .rsync-tmp --min-size 1 --no-compress --compress-level 0 "+quote(archive_dir+"/")+" "+quote(targetloc))
        elif targetloc.startswith("http"):
            exitinfo = system("curl -F "+quote(video_id+".zip=@"+archive_path)+" "+quote(targetloc))
        else:
            raise ValueError("Unsupported upload target: "+str(targetloc))
        if exitinfo == 0: # note that on Unix this isn't necessarily the exit code but it's still 0 upon successful exit
            break
        print("Error in sending data to target, waiting 30 seconds and trying again.")
        sleep(30)
    return getsize(archive_path)


def _remove_video_dirs(video_id):
    """Best-effort removal of the per-video working directories."""
    # rmtree removes the directory itself, so no follow-up rmdir is needed;
    # ignore_errors keeps the cleanup best-effort when a directory is absent.
    rmtree("directory/"+video_id+"/", ignore_errors=True)
    rmtree("out/"+video_id+"/", ignore_errors=True)


def threadrunner():
    """Worker loop executed by each thread.

    Each thread keeps its own task queue. When the queue is empty it asks the
    tracker for one item ("video:", "channel:", "playlist:" or
    "mixplaylist:" followed by an ID) and queues the matching task;
    otherwise it takes the next task and processes it. Tasks may queue
    follow-up tasks (per-language subtitle exports, discovery submissions, a
    final "complete"). The loop ends once the queue is empty and
    gkiller.kill_now is set.

    Raises:
        ValueError: propagated from the upload step when the tracker hands
            out an upload target that is neither rsync nor http.
    """
    jobs = Queue()  # private to this thread; shadows the module-level queue
    ydl = YoutubeDL({"extract_flat": "in_playlist", "simulate": True, "skip_download": True, "quiet": True, "cookiefile": "cookies.txt", "source_address": "0.0.0.0", "call_home": False})
    # Per-video dicts (language code -> None) passed to subprrun; rebuilt for
    # every new video item. NOTE(review): presumably subprrun records
    # per-language state in them - confirm against export.subprrun.
    needforcemetadata = {}
    needforcecaptions = {}

    while True:
        if jobs.empty():
            if gkiller.kill_now:
                break
            # get a new task from tracker
            collect() #cleanup
            desit = tracker.request_item_from_tracker()
            print("New task:", desit)
            if desit and ":" in desit:
                item_type, item_id = desit.split(":", 1)
            else:
                item_type = item_id = None
            if item_type == "video":
                needforcemetadata = dict.fromkeys(langs)
                needforcecaptions = dict.fromkeys(langs)
                jobs.put(("discovery", item_id, None))
            elif item_type in ("channel", "playlist", "mixplaylist"):
                jobs.put((item_type, None, item_id))
            else:
                print("Ignoring item for now", desit)
            continue

        task, vid, args = jobs.get()
        if task == "submitdiscovery":
            tracker.add_item_to_tracker(args, vid)
        elif task == "discovery":
            video_id = str(vid).strip()
            # Retry until the metadata request succeeds. Only Exception is
            # caught so interpreter-exit signals are not swallowed.
            while True:
                try:
                    info = getmetadata(mysession, video_id)
                    break
                except Exception as e:
                    print(e)
                    print("Error in retrieving information, waiting 30 seconds and trying again")
                    sleep(30)
            if info[0]: # ccenabled
                if not isdir("out/"+video_id):
                    mkdir("out/"+video_id)
                for subtitle_task in ("subtitles", "subtitles-forceedit-metadata", "subtitles-forceedit-captions"):
                    for langcode in langs:
                        jobs.put((subtitle_task, vid, langcode))
            # Queued after the subtitle jobs so the output is packaged only
            # once they have all run (the queue is FIFO).
            jobs.put(("complete", None, "video:"+vid))
            _queue_discoveries(jobs, info)
        elif task in _SUBTITLE_MODES:
            subprrun(mysession, args, vid, _SUBTITLE_MODES[task], needforcemetadata, needforcecaptions)
        elif task == "channel":
            # args is the channel ID this task was queued with.
            try:
                y = ydl.extract_info("https://www.youtube.com/channel/"+args, download=False)
                for itemyv in y["entries"]:
                    jobs.put(("submitdiscovery", itemyv["id"], tracker.ItemType.Video))
                #channel created playlists
                y = process_channel(args)
                for itemyv in y["playlists"]:
                    jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Playlist))
                for itemyv in y["channels"]:
                    jobs.put(("submitdiscovery", itemyv, tracker.ItemType.Channel))
                jobs.put(("complete", None, "channel:"+args))
            except Exception:
                print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/channel/"+args)
        elif task == "playlist":
            try:
                y = ydl.extract_info("https://www.youtube.com/playlist?list="+args, download=False)
                #TODO: extract owner channel in other projects
                #TODO: handle channels in other projects, not needed here because we will get it from the video
                for itemyvp in y["entries"]:
                    jobs.put(("submitdiscovery", itemyvp["id"], tracker.ItemType.Video))
                jobs.put(("complete", None, "playlist:"+args))
            except Exception:
                print("YouTube-DL error, ignoring but not marking as complete...", "https://www.youtube.com/playlist?list="+args)
        elif task == "mixplaylist":
            try:
                wptext = mysession.get("https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+args).text
                #channel handling not needed here because we will get it from the video
                for line in wptext.splitlines():
                    if line.strip().startswith('window["ytInitialData"] = '):
                        # [:-1] drops the last character after the JSON payload.
                        initdata = loads(line.split('window["ytInitialData"] = ', 1)[1].strip()[:-1])
                        for itemyvp in initdata["contents"]["twoColumnWatchNextResults"]["playlist"]["playlist"]["contents"]:
                            jobs.put(("submitdiscovery", itemyvp["playlistPanelVideoRenderer"]["videoId"], tracker.ItemType.Video))
                jobs.put(("complete", None, "mixplaylist:"+args))
            except Exception:
                print("Mix Playlist error, ignoring but not marking as complete...", "https://www.youtube.com/watch?v=jNQXAC9IVRw&list="+args)
        elif task == "complete":
            size = 0
            if ":" in args:
                done_type, done_id = args.split(":", 1)
                if done_type == "video":
                    #check if dir is empty, make zip if needed
                    if isdir("out/"+done_id):
                        if not listdir("out/"+done_id):
                            rmdir("out/"+done_id)
                        else:
                            size = _package_and_upload(done_id)
                    #cleanup
                    _remove_video_dirs(done_id)
            tracker.mark_item_as_done(args, size)
        jobs.task_done()
# Number of worker threads: 20 when HEROKU is set, 50 otherwise.
THREADCNT = 20 if HEROKU else 50

#now create the rest of the threads
threads = []
for _ in range(THREADCNT):
    runthread = Thread(target=threadrunner)
    runthread.start()
    threads.append(runthread)

#https://stackoverflow.com/a/11968881
# Wait for every worker. The list is not modified while it is being iterated:
# removing the current element inside the loop made the iterator skip the
# next thread, so only every other worker was actually joined.
for runthread in threads:
    runthread.join()
threads.clear()
print("Exiting...")