metal_crawler.py · 1156 lines (951 loc) · 49 KB
"""Defines classes and functions to crawl M-A for band and artist pages."""
import os
import json
import threading
import time
import queue
import logging
import re
from datetime import date, datetime
from pathlib import Path
from dataclasses import dataclass, field
from dataclasses_serialization.json import JSONSerializer
from typing import List, Dict
import certifi
import urllib3
from urllib3.exceptions import ReadTimeoutError, MaxRetryError
import progressbar
from settings import CRAWLER_THREAD_COUNT
from bs4 import BeautifulSoup, NavigableString, Tag
from country_helper import COUNTRY_NAMES, split_locations
from genre import split_genres
from global_helpers import get_dict_key
from graph.choices import *
__author__ = 'Martin Woelke'
__license__ = 'Licensed under the Non-Profit Open Software License version 3.0'
__copyright__ = 'Copyright 2019-2023, Martin Woelke'
em_link_main = 'https://www.metal-archives.com/'
em_link_label = em_link_main + 'labels/'
bands = 'bands/'
ajaxLinks = queue.Queue()
entity_paths = {'bands': 'databases/visited_bands.txt', 'members': 'databases/visited_members.txt'}
lineup_mapping = {"Current lineup": "Current", "Last known lineup": "Last known", "Past members": "past"}
STATUS_ERROR = 'unrecoverable'
STATUS_SKIPPED = 'skipped'
STATUS_ADDED = 'added'
STATUS_INITIAL = 'initial'
stop_crawl_user_input = ""
@dataclass
class DbEntity:
emid: int = -1
link: str = 'not set'
name: str = 'not set'
type: str = 'not set'
@dataclass
class Label(DbEntity):
def __init__(self):
self.type = 'label'
@dataclass
class Band(DbEntity):
lineup: Dict = field(default_factory=dict)
genres: List[str] = field(default_factory=list)
theme: List[str] = field(default_factory=list)
active: List[str] = field(default_factory=list)
releases: Dict = field(default_factory=dict)
label: Label = Label()
visited: str = 'not set'
country: str = 'not set'
locations: str = 'not set'
status: str = 'not set'
formed: str = 'not set'
def __init__(self):
self.genres = []
self.theme = []
self.active = []
self.lineup = {}
self.releases = {}
# Give every band its own Label object; otherwise all bands would share the class-level default instance.
self.label = Label()
self.type = 'band'
@dataclass
class Artist(DbEntity):
age: int = -1
gender: str = 'U'
origin: str = 'ZZ'
visited: str = 'not set'
instruments: List[str] = field(default_factory=list)
def __init__(self):
self.type = 'artist'
self.instruments = []
@dataclass
class Release(DbEntity):
release_type: str = 'not set'
release_date: str = 'not set'
rating: int = -1
review_count: int = 0
def __init__(self):
self.type = 'release'
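# Illustrative sketch of how the dataclasses above nest after a successful crawl (all values are made up,
# not taken from Metal Archives): a Band carries its own id/link/name plus a Label, a lineup dictionary that
# maps a category ('Current', 'Last known', 'past') to a list of Artist objects, and a releases dictionary
# that maps a release id to a Release object, e.g.
#   band.lineup == {'Current': [artist], 'past': [other_artist]}
#   band.releases == {'123': release}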
class VisitBandThread(threading.Thread):
def __init__(self, thread_id, band_links, lock, db_handle, band_errors, visited_entities, progress_bar,
visited_bands, is_detailed=False, is_single_mode=True):
"""Constructs a worker object which is used to get prepared data from a band page.
The only remarkable thing is switching the ``chardet.charsetprober`` logger to INFO.
:param thread_id: An integer number
:param band_links: A queue with short addresses of bands which are consumed one at a time by the workers.
:param lock: Secures concurrent access to ``database`` which is used by all other workers.
:param db_handle: The database handle is used to add all entities directly into the database with the strategy
defined on the outside.
:param band_errors: A shared dictionary with band links as keys and the number of unsuccessful crawl attempts as
the value.
:param visited_entities: A dictionary with keys like 'bands' or 'artists' to quickly check if crawling is
needed. The value is the date the entry was written into the database. The dictionary must be filled on the
outside or everything will be crawled and applied to the database.
:param progress_bar: The progress bar is initialized on the outside with the size of the band_links as the
maximum value.
:param visited_bands: A list shared among the threads so that the progress bar is updated easily.
:param is_detailed: A parameter that is not used and might be useful someday.
:param is_single_mode: Indicates if a single band and its immediate connections are crawled.
"""
super(VisitBandThread, self).__init__()
self.threadID = thread_id
self.name = "BandVisitor_" + thread_id
self.bandLinks = band_links
self.logger = logging.getLogger('chardet.charsetprober')
self.logger.setLevel(logging.INFO)
self.logger = logging.getLogger('Crawler')
self.qsize = band_links.qsize()
self.logger.debug(f"Initializing {self.name}.")
self.logger.debug(f" Init with {self.qsize} bands.")
self.lock = lock
self.db_handle = db_handle
self.visited_entities = visited_entities
self.visited_bands = visited_bands
self.today = date.today()
self.is_detailed = is_detailed
self.is_single_mode = is_single_mode
self.band_errors = band_errors
self.retries_max = 3
self.progress_bar = progress_bar
global stop_crawl_user_input
def update_bar(self, band_link):
self.visited_bands.append(band_link)
self.progress_bar.update(len(self.visited_bands))
def run(self):
"""Runs crawling as long as band links are retrieved from the links queue.
:return: -1 as soon as the queue runs out of links.
"""
self.logger.debug("Running " + self.name)
while stop_crawl_user_input != "Q":
try:
link_band_temp = self.bandLinks.get_nowait()
except queue.Empty:
return -1
# TODO: Implement revisiting mechanism based on date.
# No need to visit if the band is already in the database.
if link_band_temp in self.visited_entities['bands']:
self.logger.debug(f" Skipping {link_band_temp}.")
self.update_bar(link_band_temp)
self.band_errors[STATUS_SKIPPED][link_band_temp] = ""
continue
try:
crawl_result = self.crawl_band(link_band_temp)
except Exception as e:
self.logger.exception('Something bad happened while crawling.')
crawl_result = None
# Error case: putting the link back into circulation.
if crawl_result is None:
if link_band_temp not in self.band_errors[STATUS_ERROR].keys():
self.band_errors[STATUS_ERROR][link_band_temp] = 1
else:
self.band_errors[STATUS_ERROR][link_band_temp] += 1
if self.band_errors[STATUS_ERROR][link_band_temp] < self.retries_max:
self.bandLinks.put(link_band_temp)
else:
self.logger.error(f'Too many retries for {link_band_temp}.')
self.update_bar(link_band_temp)
continue
else:
self.visited_entities['bands'][link_band_temp] = ''
self.lock.acquire()
try:
apply_to_db(crawl_result, self.db_handle, self.is_detailed)
self.band_errors[STATUS_ADDED][link_band_temp] = ''
except Exception as e:
self.logger.exception('Writing artists failed! This is bad. Expect loss of data for the above band.')
self.band_errors[STATUS_ERROR][link_band_temp] = ''
finally:
self.lock.release()
self.update_bar(link_band_temp)
# Saving the data to disk will later enable us to limit getting live data if it is not needed.
actual_band_path = f"databases/{crawl_result.country}"
os.makedirs(actual_band_path, exist_ok=True)
# We take the band link because it always uses escaped sequences. This way we have the highest
# compatibility for writing files in underlying filesystems. The slash must be replaced of course.
db_path = Path(f"{actual_band_path}/{crawl_result.link.replace('/', '_')}.json")
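# For example, a Norwegian band with the short link 'Darkthrone/146' would end up in
# 'databases/NO/Darkthrone_146.json' (country code and link are taken from the crawl result).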
actual_band_file = open(db_path, "w", encoding="utf-8")
# TODO: Add try block for the dump. It crashed once because it found a Tag object.
band_data = JSONSerializer.serialize(crawl_result)
band_data_text = json.dumps(band_data)
actual_band_file.write(band_data_text)
actual_band_file.close()
def crawl_band(self, band_short_link):
"""This is where the magic happens: A short band link is expanded, visited and parsed for data.
It may still throw an exception that must be caught and dealt with, preferably by putting the link back
into circulation.
:param band_short_link: Short form of the band link (e.g. Darkthrone/146).
:return:
A Band instance with band, artist and label data of the visited band or
`None` in an error case.
"""
if len(band_short_link) == 0:
return None
# TODO: Change your environment or this doesn't work!
# The % escaped glyphs in links only work if the client.py in http
# of Python 3.6 is changed in putrequest() right before self._output()
# is called. The line looks like this:
# url = rfc3986.uri_reference(url).unsplit()
# Needs to import rfc3986.
link_band = em_link_main + bands + band_short_link
logger = logging.getLogger('Crawler')
logger.info(f'>>> Crawling [{band_short_link}]')
band_soup = cook_soup(link_band)
if band_soup is None:
return None
logger.debug(" Start scraping from actual band.")
# Finds band name; needs to extract the ID later.
s = band_soup.find_all(attrs={"class": "band_name"})
if len(s) == 0:
logger.fatal(f" Did not find the attribute band_name for {band_short_link}.")
logger.debug(" Band page source for reference:")
logger.debug(band_soup.text)
return None
# All data of a band is collected here. Band members are referenced and collected in their own collection.
band_data_ref = Band()
band_data_ref.name = str(s[0].next_element.text)
band_data_ref.emid = band_short_link[band_short_link.rfind('/') + 1:]
band_data_ref.link = band_short_link
band_data_ref.visited = str(self.today)
s = band_soup.find_all(attrs={"class": "float_left"})
# Take the last two letters of the link.
band_data_ref.country = s[1].contents[3].contents[0].attrs["href"][-2:]
band_data_ref.locations = split_locations(s[1].contents[7].text)
band_data_ref.status = get_dict_key(BAND_STATUS, s[1].contents[11].text)
band_data_ref.formed = s[1].contents[15].text
s = band_soup.find_all(attrs={"class": "clear"})
# Get years into a list. Earlier incarnations of a band are ignored.
years_raw = s[3].contents[3].text.lstrip().rstrip()
years_raw = years_raw.replace('\t', '')
years_raw = years_raw.replace('\n', '')
year_tokens = years_raw.split(',')
for year_token in year_tokens:
# The first condition filters out earlier incarnations. The second handles a more obscure case in which an
# earlier incarnation has a comma in its name (which causes the latter portion to be split off).
if '(as' not in year_token and ')' not in year_token:
band_data_ref.active.append(year_token.lstrip())
s = band_soup.find_all(attrs={"class": "float_right"})
band_data_ref.genres = split_genres(s[3].contents[3].contents[0])
band_data_ref.theme = s[3].contents[7].contents[0].split(', ')
label_node = s[3].contents[11].contents[0]
# The label from band page is only the active one. All others will only be available through the individual
# releases. TODO: Visit all releases and get more detailed info.
if type(label_node) is NavigableString:
label_name = str(s[3].contents[11].contents[0])
if label_name == "Unsigned/independent":
label_id = -1
else:
label_id = label_name
label_link = ""
else:
if len(label_node.contents) > 0:
label_name = label_node.contents[0]
label_link = label_node.attrs["href"][len(em_link_label):]
label_id = label_link[label_link.find('/') + 1:]
else:
logger.error("Label node appears to empty; dumping parent parent node.")
logger.error(str(s[3]))
raise ValueError('Label node must not be empty.')
band_data_ref.label.emid = label_id
band_data_ref.label.name = label_name
band_data_ref.label.link = label_link
logger.debug(" Scraping artists from actual band.")
artists_and_bands = band_soup.find_all(attrs={"class": "ui-tabs-panel-content"})
artists_and_band_element = artists_and_bands[0]
actual_category = artists_and_band_element.contents[1].contents
# This check sets a flag if a band e.g. only has a "last known" lineup. In that case it is not "diverse".
lineup_finder = band_soup.find_all(attrs={"href": "#band_tab_members_all"})
is_lineup_diverse = True
if len(lineup_finder) == 0:
is_lineup_diverse = False
# Needs to be outside because it won't be set in every iteration of the loop.
header_category = ''
# The contents of actual_category start with a LF (`NavigableString`) and have a LF at every even position.
# So we start at index 1 with the actual payload and only take data from odd indexes.
# Data at those odd indexes belongs to one of three categories:
# * lineupHeaders: A header category like "Current" or "Past".
# * lineupRow: An artist including instruments and time spans.
# * lineupBandsRow: Other bands an artist played in. We do not need to parse this as we connect each band
# member with the actual band.
for i in range(1, len(actual_category), 2):
actual_row = actual_category[i]
last_found_header = actual_row.attrs["class"][0]
# Normal case.
if last_found_header == "lineupHeaders":
header_category = actual_row.contents[1].contents[0].rstrip().lstrip().replace('\t', '')
# While crawling Tarot/4339 I found an unusual double space between Past and (Live). Let's remove it.
header_category = header_category.replace('  ', ' ')
logger.debug(f" Found header: {header_category}")
# Special case where a band only has one line-up.
elif last_found_header == "lineupRow":
# If a band has only one lineup (current, last-known or past) the usual headers will be missing on the
# page. For active bands with changing lineup we get 'Current'. For a band with no lineup changes it
# will be empty.
if not is_lineup_diverse:
test_header2 = str(band_soup.find_all(attrs={"href": "#band_tab_members_current"})[0].contents[0])
header_category = lineup_mapping[test_header2]
logger.debug(f" Did not find a header. Digging deeper: {header_category}")
elif last_found_header == "lineupBandsRow":
pass
if header_category not in band_data_ref.lineup.keys() and header_category != '':
# Add an empty lineup list for the found header_category if it is not already present. `header_category` will
# always have a valid value at this point.
band_data_ref.lineup[header_category] = []
elif header_category == '':
# For the unlikely case that the header category is not found.
raise ValueError(f'The header category was empty while crawling {band_data_ref.name}.')
# Five elements for artists.
if len(actual_row) == 5:
temp_artist_soup_link = actual_row.contents[1].contents[1].attrs["href"]
# The leading part is not needed and is stripped (https://www.metal-archives.com/artists/).
# It's always 39 characters long.
temp_artist_link = actual_row.contents[1].contents[1].attrs["href"][39:]
temp_artist_id = temp_artist_link[temp_artist_link.find('/') + 1:]
temp_artist_pseudonym = str(actual_row.contents[1].contents[1].contents[0])
logger.debug(f" Recording artist data for {temp_artist_pseudonym}.")
# Don't visit known band members.
if temp_artist_link in self.visited_entities['artists']:
logger.debug(f" Skipping band member {temp_artist_link}.")
artist_soup = None
artist_exists = True
else:
artist_soup = cook_soup(temp_artist_soup_link)
artist_exists = False
name = ""
gender = "U"
age = -1
origin = 'ZZ'
if artist_soup is not None and artist_soup != 0:
member_info = None
# The artist soup is not always reliably cooked. I'll leave the debug print logs in for now.
try:
member_info = artist_soup.find('div', attrs={'id': 'member_info'})
except Exception as e:
print('e >>>>>>>>>>>>>>>>>>>>')
print(f'artist_soup: {artist_soup}')
print()
if member_info is None:
print('n >>>>>>>>>>>>>>>>>>>>')
print(f'artist_soup: {artist_soup}')
logger.error(f'Crawling artist failed (member info is None): {temp_artist_soup_link}')
print('<<<<<<<<<<<<<<<<<<<<')
continue
name = str(member_info.contents[7].contents[3].contents[0]).lstrip().rstrip()
gender = str(member_info.contents[9].contents[7].contents[0])
if gender in GENDER.values():
gender = get_dict_key(GENDER, gender)
else:
logger.error(f'Encountered unrecognized gender: {gender}')
gender = "U"
temp_age = str(member_info.contents[7].contents[7].contents[0]).lstrip().rstrip()
# Age strings either contain 'N/A' or have the form 'YY (born ...)'.
if 'N/A' not in temp_age:
age = temp_age[:temp_age.find(" ")]
if 'N/A' not in member_info.contents[9].contents[3].text:
origin = member_info.contents[9].contents[3].contents[1].attrs['href'][-2:]
else:
logger.error(f'Crawling artist failed: {temp_artist_soup_link}')
# Error case: artist_soup is invalid and the artist does not exist.
if not artist_exists:
return None
# If the band member does not have a name in the database we simply use the pseudonym. This
# unfortunately overwrites the name with whatever pseudonym we found last.
if 'N/A' in name:
name = temp_artist_pseudonym
artist = Artist()
band_data_ref.lineup[header_category].append(artist)
artist.emid = temp_artist_id
artist.link = temp_artist_link
artist.name = name
artist.gender = gender
artist.age = age
artist.origin = origin
artist.pseudonym = temp_artist_pseudonym
artist.instruments = cut_instruments_alt(actual_row.contents[3].contents[0])
artist.visited = str(self.today)
number_added_bands = 0
# Happens only for the first band if -s was used as the command line switch.
if self.is_single_mode:
number_added_bands = self.add_connected_bands_to_queue(band_soup)
# Crawl discography.
link_disco = f"https://www.metal-archives.com/band/discography/id/{band_data_ref.emid}/tab/all"
disco_soup = cook_soup(link_disco)
if disco_soup is None:
logger.error(f" Unable to get the discography for {band_short_link}.")
# We have to throw everything away and start anew.
return None
table = disco_soup.find('table', attrs={'class': 'display discog'})
table_body = table.find('tbody')
rows = table_body.find_all('tr')
for row in rows:
cells = row.find_all("td")
# Guard clause for the unlikely case that a band has no releases.
if len(cells) == 1:
logger.debug(f" No releases found for {band_data_ref.name}.")
continue
# TODO: Visit release page to get details like the actual release date instead of only the year.
album_id = cells[0].contents[0].attrs['href']
# We don't need the fixed part of the link (https://www.metal-archives.com/albums/).
album_link = album_id[38:]
# Get the ID just to be sure.
album_id = album_id[album_id.rfind('/') + 1:]
release = Release()
band_data_ref.releases[album_id] = release
release.emid = album_id
release.name = cells[0].text
release.release_type = get_dict_key(RELEASE_TYPES, cells[1].text)
release.release_date = cells[2].text
release.link = album_link
album_rating_raw = cells[3].text.rstrip().strip()
parenthesis_open = album_rating_raw.find('(')
# Initialize with values representing an invalid rating and review count.
album_rating = -1
review_count = 0
if parenthesis_open != -1:
split_rating = album_rating_raw.split('(')
# Get the average rating and review count from a string looking like this: '8 (64%)'
if len(split_rating) == 2:
review_count = int(split_rating[0].rstrip())
album_rating = int(split_rating[1][:-2])
release.rating = album_rating
release.review_count = review_count
logger.debug(f'<<< Crawling [{band_short_link}]')
return band_data_ref
def add_connected_bands_to_queue(self, band_soup):
"""Extracts all band links from the given band soup and adds them to the queue, resets the single mode flag and
updates the progressbar to the new band amount.
:param band_soup: The band soup of the band that's crawled right now. A band soup is cooked with
`cook_soup(link_band)` (which expects the _full_ address of a band page).
:return The number of connected bands.
"""
band_rows = band_soup.find_all('tr', attrs={'class': 'lineupBandsRow'})
linked_bands = []
for band_row in band_rows:
actual_bands = band_row.contents[1].contents
for i in range(1, len(actual_bands), 2):
band_link = actual_bands[i].attrs['href'][37:]
if band_link not in linked_bands:
linked_bands.append(band_link)
self.bandLinks.put(band_link)
# Most of the time the crawler log is off, so this goes to the screen and the log file.
if len(linked_bands) == 0:
log_message = 'The chosen band does not have any outward connections.'
else:
log_message = f'Added {len(linked_bands)} connected bands to the crawl.'
self.band_errors[STATUS_INITIAL] = len(linked_bands) + 1
# The logger named Crawler normally is not used for console output because it interferes with the progress bar.
# That's why we use a different logger to notify the user about connected bands instead of the object's logger.
temp_logger = logging.getLogger('Connector')
temp_logger.info(log_message)
# Switch off the single mode after the first call. At least for now. Maybe we'll do two levels (or more) later.
self.is_single_mode = False
# The additional band is the actual one because it is not in the queue right now.
self.progress_bar.max_value = self.bandLinks.qsize() + 1
return len(linked_bands)
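# Illustration of the slicing above (the markup shape is inferred from the code, not verified against a live
# page): an href such as 'https://www.metal-archives.com/bands/Other_Band/4711' has a fixed 37-character
# prefix ('https://www.metal-archives.com/bands/'), so [37:] leaves the short link 'Other_Band/4711'.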
def make_band_list(country_links):
logger = logging.getLogger('Crawler')
logger.debug('Started Band List Visitor')
skipped_links = 0
band_links = []
# Used as a very crude way to see if duplicate data is sent by MA.
json_strings = []
while country_links.qsize() != 0:
link_country_temp = country_links.get_nowait()
logger.debug(f' Working on: {link_country_temp}')
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
country_json = http.request('GET', link_country_temp)
json_data_string = country_json.data.decode('utf-8')
# Accept the response only if it is new and not the literal '0' error marker.
if json_data_string not in json_strings and json_data_string != '0':
json_strings.append(json_data_string)
else:
logger.error(f' Invalid data for [{link_country_temp}]. Putting it back in circulation...')
country_links.put(link_country_temp)
# The data string might contain an incomplete data definition which prevents conversion to the dict below.
json_data_string = json_data_string.replace('"sEcho": ,', '')
json_data = None
try:
json_data = json.loads(json_data_string)
except Exception as e:
logger.exception(f' JSON error for [{link_country_temp}]. Putting it back in circulation...')
if json_data is None or json_data == 0:
country_links.put(link_country_temp)
logger.debug(f' json_data was {json_data}.')
continue
for band in json_data["aaData"]:
# We do not need the leading "'<a href=\'https://www.metal-archives.com/bands/".
partial_link = band[0][46:band[0].rfind("'>")]
if partial_link not in band_links:
band_links.append(partial_link)
else:
logger.error(f' Found a duplicate band link in MA [{partial_link}].')
skipped_links += 1
# Not waiting is much faster but generates more errors.
time.sleep(1.0)
return band_links
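# Hedged sketch of an aaData row as this function expects it (band name and id are invented, the markup
# shape is inferred from the slicing above):
#   band[0] == "<a href='https://www.metal-archives.com/bands/Some_Band/42'>Some Band</a>"
#   band[0][46:band[0].rfind("'>")] == 'Some_Band/42'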
def make_time_spans(raw_spans):
"""Helper function to convert time span tuples to a list of data objects.
"""
time_spans = []
for time_span_tuple in raw_spans:
if len(time_span_tuple) != 0 and time_span_tuple[0] != '?':
d0 = date(time_span_tuple[0], 1, 1)
else:
continue
t1 = time_span_tuple[1]
if t1 == 'present':
d1 = date.today()
elif t1 == '?':
continue
else:
d1 = date(time_span_tuple[1], 12, 31)
time_spans.append(d0)
time_spans.append(d1)
return time_spans
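# Hedged usage sketch (the tuples mirror what cut_instruments/cut_instruments_alt below produce):
#   >>> make_time_spans([(1999, 2003), (2008, 'present'), ('?', 2001)])
#   [date(1999, 1, 1), date(2003, 12, 31), date(2008, 1, 1), date.today()]
# The ('?', 2001) tuple is dropped because its start year is unknown.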
def make_active_list(raw_activity):
"""Converts a list of string dates to date objects.
Will only convert pairs of string dates. Any 'N/A' or '?' in the strings will be ignored.
:param raw_activity: A list of ISO dates as strings. A date pair must be delimited by a dash.
:return: A list of converted date objects. Since M-A only provides the year, we set the start always as the first
day of the year and the end as the last day of the year.
"""
active_list = []
for time_slot in raw_activity:
if '?' in time_slot or 'N/A' in time_slot:
continue
temp_slots = time_slot.split('-')
try:
time_slot_1 = date(int(temp_slots[0]), 1, 1)
except ValueError:
continue
if len(temp_slots) == 1:
time_slot_2 = time_slot_1
else:
if 'present' in temp_slots[1] or '?' in temp_slots[1]:
temp_year = date.today().year
else:
try:
temp_year = int(temp_slots[1])
except ValueError:
continue
time_slot_2 = date(temp_year, 12, 31)
active_list.append(time_slot_1)
active_list.append(time_slot_2)
return active_list
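# Hedged usage sketch (activity strings as they appear on M-A band pages; a 'present' span ends in the
# current year):
#   >>> make_active_list(['1991-1995', '1997-present', 'N/A'])
#   [date(1991, 1, 1), date(1995, 12, 31), date(1997, 1, 1), date(date.today().year, 12, 31)]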
def apply_to_db(band: Band, db_handle, is_detailed):
logger = logging.getLogger('Crawler')
logger.debug("Apply to DB...")
# Serialize a Band object and massage it so that the DB model understands it.
temp_band_dict = JSONSerializer.serialize(band)
# TODO: Find out if these are necessary.
del temp_band_dict['lineup']
del temp_band_dict['releases']
# DB expects date objects instead of strings.
temp_band_dict['active'] = make_active_list(band.active)
temp_band_dict['visited'] = datetime.strptime(band.visited, "%Y-%m-%d").date()
if band.formed != 'N/A':
temp_band_dict['formed'] = date(int(band.formed), 1, 1)
else:
temp_band_dict['formed'] = None
logger.debug(f' Writing data for band {band.link}.')
db_handle.add_band(temp_band_dict)
for emid, release in band.releases.items():
# We need to copy the dict first because we need to make a date object for the release date.
release_copy = JSONSerializer.serialize(release)
# This is not the accurate date, only the year.
date_sanitized = release_copy['release_date']
# If the date is unknown, it sometimes comes as the string "0000". In this case we use 1900 as a default.
if date_sanitized == "0000":
date_sanitized = 1900
date_sanitized = int(date_sanitized)
date_sanitized = date(date_sanitized, 1, 1)
release_copy['release_date'] = date_sanitized
logger.debug(f' Writing data for release {release_copy["name"]}.')
db_handle.add_release(release_copy)
db_handle.band_recorded_release(band.emid, emid)
for status, members in band.lineup.items():
for member in members:
temp_member_dict = JSONSerializer.serialize(member)
temp_member_dict['visited'] = datetime.strptime(member.visited, "%Y-%m-%d").date()
logger.debug(f" Writing data for artist {temp_member_dict['link']}.")
try:
db_handle.add_member(temp_member_dict)
except Exception as e:
logger.exception(f'Adding the band member was unsuccessful: {member.link}')
for instrument in member.instruments:
try:
db_handle.member_played_in_band(
member.emid,
band.emid,
instrument[0],
member.pseudonym,
make_time_spans(instrument[1]),
get_dict_key(MEMBER_STATUS, status)
)
except Exception as e:
logger.exception("Making member connection failed.", e, exc_info=True)
logger.error(member)
logger.error(band.emid)
logger.error(instrument)
logger.error(member.pseudonym)
logger.error(get_dict_key(MEMBER_STATUS, status))
# Add labels if mode is detailed.
if is_detailed:
pass
def cook_soup(link, retry_count=5):
"""Wraps getting a web page for further parsing.
Retries several times to get the page if the request results in a *Forbidden*.
:param link: URL to get the web page from.
:param retry_count: Set to any number greater than 0 (will be set internally to 1 if smaller than 1).
:return: Either a BeautifulSoup object of the requested page or `None` if the request failed.
"""
logger = logging.getLogger('Crawler')
# Set to 1 if value is invalid.
if retry_count < 1:
retry_count = 1
logger.debug(f"Cooking soup for {link}")
while retry_count > 0:
# Initialize the pool manager with certificates. There will be nasty warnings for every call if you don't.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
web_page = None
retry_count -= 1
try:
web_page = http.request('GET', link, timeout=10.0)
except ReadTimeoutError as e:
logger.exception(e, exc_info=True)
except MaxRetryError as e:
logger.exception(e, exc_info=True)
if web_page is None:
logger.error("Received no data.")
retry_count = 0
elif len(web_page.data) != 0:
web_page_string = web_page.data.decode("utf-8")
if "Forbidden.\n" == web_page_string:
time.sleep(.5)
logger.debug(f" Trying again... ({retry_count} to go)")
elif "Error 404 -" in web_page_string:
retry_count = 0
logger.error(f'404 for link: {link}')
elif "not found" in web_page_string:
logger.info(f'Not found: {link}')
# Ignore use case for now but log the page string.
retry_count = -1
logger.debug(web_page_string)
elif 'may refer to' in web_page_string:
retry_count = 0
logger.error(f'May refer: {link}')
else:
# Breaks out of the loop.
retry_count = -1
else:
logger.error(f'Data Length: {str(len(web_page.data))}\n{web_page.data}')
logger.debug(f" Trying again... ({retry_count} to go)")
# Error case: No web page data after n retries.
if retry_count == 0:
return None
soup = BeautifulSoup(web_page.data.decode('utf-8', 'ignore'), "html.parser")
# Extra safeguard for extremely rare cases.
if soup.text == '':
logger.error(f'Soup text is {soup.text}. Data Length: {str(len(web_page.data))}\n{web_page.data}')
soup = None
return soup
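# Minimal usage sketch, using the short band link from the crawl_band docstring above:
#   soup = cook_soup(em_link_main + bands + 'Darkthrone/146')
#   if soup is not None:
#       band_name_tag = soup.find(attrs={'class': 'band_name'})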
def cut_instruments_alt(instrument_string):
instruments = []
instrument_string = instrument_string.rstrip().lstrip().replace('\t', '').replace('  ', '')
# First split along the '),'.
temp_instruments = instrument_string.split('),')
# Put the closing parenthesis back into every element but the last one. It's needed to preserve parts like
# "(earlier)".
for index in range(0, len(temp_instruments) - 1):
temp_instruments[index] += ')'
for temp_instrument in temp_instruments:
temp_instrument = temp_instrument.lstrip()
# Test if there are any numerals in instrument_string.
if not bool(re.search(r'\d', temp_instrument)):
instruments.append([temp_instrument, []])
# We have at least one year.
else:
split_more = temp_instrument.split('(')
back_together = split_more[0]
# The nastiest thing is users forgetting to use commas as delimiters. If `split_more` contains e.g. three
# elements, we might have such a case.
# There is currently no handling for this case; it is easier to use the "report error" function on M-A.
ready_spans = []
for inner in range(1, len(split_more)):
if bool(re.search(r'\d', split_more[inner])):
# First split by commas.
time_spans = split_more[inner].split(',')
# Nasty special case for just one band (The_Flesh_Trading_Company/3540340934).
if 'Live' in time_spans or 'live' in time_spans:
continue
# Then we have one of four types of strings. (1) two years separated by a '-' but the hyphen must be
# in the middle (if it is not we have e.g. a 10-string bass: ARGH!) , (2) a single
# year, (3) a year followed by a '-' and 'present' or (4) at least one '?'. (5) The nastiest special
# case so far: inside the parenthesis is a string we cannot interpret (e.g. 'on EP 1').
for time_span in time_spans:
time_span = time_span.lstrip().rstrip()
# Safeguard against sloppy instruments where the time span starts with a comma.
if time_span == '':
continue
# There still is a trailing ')' in the end.
if time_span[len(time_span) - 1] == ')':
time_span = time_span[:-1]
# (2)
if len(time_span) == 4:
years = [int(time_span), int(time_span)]
# (1)
elif len(time_span) == 9 and time_span[0] != '?' and time_span[4] == '-':
years = [int(time_span[0:4]), int(time_span[5:])]
# (4) Nasty special case.
elif '?' in time_span:
# '?-?' after removing a trailing ')'.
if time_span[0] == '?' and time_span[-1:] == '?':
years = ['?', '?']
elif time_span[0] == '?':
if re.search('[Pp]resent', time_span):
years = ['?', 'present']
else:
years = ['?', int(time_span[2:])]
elif time_span[-1:] == '?':
years = [int(time_span[0:4]), '?']
else:
years = []
# (5)
elif not time_span.isdigit() and not re.search('[Pp]resent', time_span):
continue
# (3)
else:
years = [int(time_span[0:4]), 'present']
ready_spans.append(years)
# Strings in brackets, part of the instrument we're looking for.
else:
back_together += '(' + split_more[inner]
instruments.append([back_together.rstrip(), ready_spans])
return instruments
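# Hedged input/output sketch, traced from the parsing above (the instrument string is invented but follows
# the lineup format this function expects):
#   >>> cut_instruments_alt('Guitars (1999-2005, 2008-present), Bass (2005)')
#   [['Guitars', [[1999, 2005], [2008, 'present']]], ['Bass', [[2005, 2005]]]]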
def cut_instruments(instrument_string):
collection = []
# First split along the '),'.
instrument_string = instrument_string.rstrip().lstrip().replace('\t', '').replace('  ', '')
temp_instruments = instrument_string.split('),')
# Put the closing parenthesis back into every element but the last one. It's needed to preserve parts like
# "(earlier)".
for index in range(0, len(temp_instruments) - 1):
temp_instruments[index] += ')'
for temp_instrument in temp_instruments:
temp_instrument = temp_instrument.lstrip()
# Test if there are any numerals in instrument_string.
if not bool(re.search(r'\d', temp_instrument)):
collection.append((temp_instrument, []))
# We have at least one year.
else:
split_more = temp_instrument.split('(')
back_together = split_more[0]
ready_spans = []
for inner in range(1, len(split_more)):
if bool(re.search(r'\d', split_more[inner])):
# First split by commas.
time_spans = split_more[inner].split(',')
# Then we have one of four types of strings. (1) two years separated by a '-' but the hyphen must be
# in the middle (if it is not we have e.g. a 10-string bass: ARGH!) , (2) a single
# year, (3) a year followed by a '-' and 'present' or (4) at least one '?'. (5) The nastiest special
# case so far: inside the parenthesis is a string we cannot interpret (e.g. 'on EP 1').
for time_span in time_spans:
time_span = time_span.lstrip().rstrip()
# Safeguard against sloppy instruments where the time span starts with a comma.
if time_span == '':
continue
# There still is a trailing ')' in the end.
if time_span[len(time_span) - 1] == ')':
time_span = time_span[:-1]
# (2)
if len(time_span) == 4:
years = (int(time_span), int(time_span))
# (1)
elif len(time_span) == 9 and time_span[0] != '?' and time_span[4] == '-':
years = (int(time_span[0:4]), int(time_span[5:]))
# (4) Nasty special case.
elif '?' in time_span:
# '?-?' after removing a trailing ')'.
if time_span[0] == '?' and time_span[-1:] == '?':
years = ('?', '?')
elif time_span[0] == '?':
if re.search('[Pp]resent', time_span):
years = ('?', 'present')
else:
years = ('?', int(time_span[2:]))
elif time_span[-1:] == '?':
years = (int(time_span[0:4]), '?')
else:
years = ()
# (5)
elif not time_span.isdigit() and not re.search('[Pp]resent', time_span):
continue
# (3)
else:
years = (int(time_span[0:4]), 'present')
ready_spans.append(years)
# Strings in brackets, part of the instrument we're looking for.
else:
back_together += '(' + split_more[inner]
collection.append((back_together.rstrip(), ready_spans))
return collection
def crawl_country(country_short):
"""Crawls the given country page for band links and returns the list of short band links.
Depending on the total amount of bands in the given country, the pages will be fetched through
MA's AJAX API in packages of up to 500 bands.
:param country_short: A country's ISO code to parse band links from.
:return An unsorted list of short band links.
"""
logger = logging.getLogger('Crawler')
logger.debug(f">>> Crawling Country: {COUNTRY_NAMES[country_short]}")
link_country = "https://www.metal-archives.com/browse/ajax-country/c/" + country_short
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())
while True:
country_json = http.request('GET', link_country)
json_data_string = country_json.data.decode("utf-8")
if "Forbidden." in json_data_string or json_data_string == "0":
logger.debug(" trying again...")
time.sleep(.5)
else:
break
json_data_string = json_data_string.replace("\"sEcho\": ,", '')
json_data = json.loads(json_data_string)
# The total amount of entries for this country is the only data we need for now.
amount_entries = json_data["iTotalRecords"]
logger.debug(f' Country has [{amount_entries}] entries.')
# Limit imposed by MA.
display_constant = 500
link_suffix = "/json/1?sEcho=1&iDisplayStart="
band_links = None
alt_link = "https://www.metal-archives.com/browse/ajax-country/c/{0}/json/1?sEcho={1:.0f}&iDisplayStart={2}"
# I reworked this section multiple times but kept running into trouble infrequently. Running twice (and slowly)