# get lyrics from http://www.songlyrics.com/
# eventually, this will only be used for training on a server; lyrics will not be stored anywhere
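# requires Python 2, with the requests, beautifulsoup4, and MySQL-python (MySQLdb) packages installed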
import requests
from bs4 import BeautifulSoup
import string
from collections import defaultdict
import MySQLdb
import re
# global constants
MIN_SONGS = 500 # minimum number of songs for artist to be considered
MIN_LYRICS_CHARCOUNT = 200 # minimum character count for a song's lyrics to be considered
# Now connect to the database - store credentials in ~/.my.cnf
print 'will establish connection to db...',
db = MySQLdb.connect(host="localhost", db="cs221_nlp", read_default_file='~/.my.cnf')
db_cursor = db.cursor()
print 'done!'
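# For reference: MySQL reads credentials from the standard option-file format,
# so ~/.my.cnf would look something like this:
#   [client]
#   user = your_user
#   password = your_password
# The inserts below also assume a `song` table already exists. A minimal sketch
# of a compatible schema (the column types are guesses; the actual schema is
# not part of this file):
#   CREATE TABLE song (
#       lyrics      TEXT,
#       genre       VARCHAR(64),
#       url         VARCHAR(255),
#       artist_name VARCHAR(128),
#       song_name   VARCHAR(255)
#   );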
# scrape the whole website slowly
# storing results in MySQL keeps this efficient, since MySQL can handle a huge number of transactions per second
genres = defaultdict(int)  # genre name -> number of artists scraped in that genre
def extract_artist(alph, artist_page=0, genres_considered=['Rock', 'Pop', 'Hip Hop/Rap', 'R&B;', 'Electronic', 'Country', 'Jazz', 'Blues', 'Christian', 'Folk']):
    """
    @param alph: single char, should be one of string.lowercase
    @param artist_page: the artist pagination index to start from (default 0)
    @param genres_considered: the list of genres to be considered for scraping (default all 10)
    Scrapes songlyrics.com for artists whose names start with alph.
    """
    assert (alph in string.lowercase), "Only a single lowercase English letter is allowed as the first argument to extract_artist"
    artist_count = 0
    songs_total_alph = 0
    artist_links = [alph]
    # start by exploring the alphabetical index URL
    uri_alph = 'http://www.songlyrics.com/' + alph
    response = requests.get(uri_alph)
    html_alph = response.text
    soup_html_alph = BeautifulSoup(html_alph, "html.parser")
    alph_pagination = soup_html_alph.findAll("li", { "class" : "li_pagination" })
    # build a list of all pagination pages for artists starting with alph
    # (skipping the trailing pagination entry), then iterate through it
    for p in alph_pagination[0:-1]:
        current_link = str(p.find('a'))
        curr = BeautifulSoup(current_link, 'html.parser')
        link = curr.find('a')
        if link is not None:
            artist_links.append(link['href'])
    artist_count += len(artist_links)
    # now visit each of the pages in artist_links and retrieve songs
    for i, al in enumerate(artist_links[artist_page:]):
        song_count = 0
        uri = 'http://www.songlyrics.com/' + al
        response = requests.get(uri)
        x = response.text
        y = BeautifulSoup(x, "html.parser")
        print 'exploring', al
        # and now we go down a further level to get the actual songs of each artist
        artist_pages = []
        z = y.findAll("table", { "class" : "tracklist" })  # the table of artist entries
        soup = BeautifulSoup(str(z[0]), 'html.parser')
        for at in soup.findAll('tr'):  # loop through all the rows in that table
            current_artist_url = str(at.a['href'])  # start exploring this artist
            songs_count_text = at.find("td", {"class": "td-item"}).text
            num_songs_for_artist = int(songs_count_text.split()[0])
            artist_name = at.a.text
            artist_name = artist_name.encode('utf-8')
            if num_songs_for_artist <= MIN_SONGS:
                print 'skipping', artist_name, 'due to very few songs'
            else:
                print 'exploring', artist_name
                artist_pages.append(current_artist_url)
                # now go to this artist's page
                try:
                    # protecting against too many redirects
                    uri = current_artist_url
                    response = requests.get(uri)  # go to the artist's page
                except requests.exceptions.TooManyRedirects:
                    continue  # skip this one
                artist_page_html = response.text
                artist_page_html_soup = BeautifulSoup(artist_page_html, "html.parser")
                # on the artist page, get the genre
                artist_title = artist_page_html_soup.findAll("div", { "class" : "pagetitle" })  # the title div, which holds the genre
                artist_songs = artist_page_html_soup.findAll("table", { "class" : "tracklist" })  # get all songs
                songs_seen = 1
                # check that the title div is not empty; if it is, we skip this
                # artist's lyrics, since they are not tagged with a genre
                if len(artist_title) != 0:
                    artist_title_soup = BeautifulSoup(str(artist_title[0]), 'html.parser')  # make soup
                    genre = artist_title_soup.a.text  # <- the required genre, tallied in the genres hash table below
                    genre = genre.encode('utf-8')
                    # check that this genre is in the list, otherwise skip
                    if genre not in genres_considered:
                        print 'not considering genre', genre
                        continue
                    genres[genre] += 1  # add to the genre tally
                    song_count += 1
                    # print current_artist_url, genre
                    # now, finally, get the lyrics and put them into the DB
                    song_soup = BeautifulSoup(str(artist_songs), 'html.parser')
                    for song in song_soup.findAll('a'):
                        if songs_seen <= num_songs_for_artist:
                            # deep-dive into each link one by one and retrieve the lyrics
                            song_url = song['href']
                            song_url = song_url.encode('utf-8')
                            song_name = song.text
                            song_name = song_name.encode('utf-8')
                            song_request = requests.get(song_url)
                            song_lyrics = BeautifulSoup(song_request.text, "html.parser")
                            if song_lyrics.img is None:
                                continue
                            img = song_lyrics.img.extract()
                            lyrics_div = song_lyrics.find("p", { "id" : "songLyricsDiv" })
                            if lyrics_div is not None:
                                lyric = lyrics_div.get_text()
                                lyric = lyric.encode('utf-8')
                                # normalize the lyrics: lowercase, flatten newlines, and strip everything except whitespace and word characters
                                lyric = re.sub(r"[^\s\w_]+", '', lyric.lower().replace('\n', ' '))
                                # note: the placeholder check must be lowercase, since the lyric was lowercased above
                                if (len(lyric) >= MIN_LYRICS_CHARCOUNT) and (lyric[:25] != 'we do not have the lyrics'):
                                    # insert the lyrics into the DB
                                    db_cursor.execute("""insert into song (lyrics, genre, url, artist_name, song_name) values (%s, %s, %s, %s, %s)""", (lyric, genre, song_url, artist_name, song_name))
                                    db.commit()
                            songs_seen += 1
        print 'page', i, 'of', alph, 'songs:', song_count
        print dict(genres)
        songs_total_alph += song_count
    print 'RESULTS FOR CASE', alph
    print alph, ':', len(artist_links), ', num_songs:', songs_total_alph
    print 'total no of artists:', artist_count
    # print the number of songs for each genre
    print 'total songs per genre:'
    for g in genres:
        print g, ':', genres[g]
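# A possible throttling helper (a sketch; not wired into the functions above).
# The comment at the top says to scrape slowly, and wrapping requests.get with
# a fixed delay is one simple way to do that.
import time
def polite_get(url, delay=1.0):
    # sleep before each request so the site is not hammered
    time.sleep(delay)
    return requests.get(url)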
def grab_lyrics_artist(artist_link, num_songs_for_artist, artist_name, genre):
    print 'Processing music for', artist_name
    song_count = 0
    try:
        # protecting against too many redirects
        uri = artist_link
        response = requests.get(uri)  # go to the artist's page
    except requests.exceptions.TooManyRedirects:
        print "too many redirects. quitting."
        return
    artist_page_html = response.text
    artist_page_html_soup = BeautifulSoup(artist_page_html, "html.parser")
    # on the artist page, get the title div and the song tables
    artist_title = artist_page_html_soup.findAll("div", { "class" : "pagetitle" })  # the title div, which holds the genre
    artist_songs = artist_page_html_soup.findAll("table", { "class" : "tracklist" })  # get all songs
    songs_seen = 1
    # check that the title div is not empty; if it is, we skip this artist's
    # lyrics, since they are not tagged with a genre
    if len(artist_title) != 0:
        artist_title_soup = BeautifulSoup(str(artist_title[0]), 'html.parser')  # make soup
        # the genre is passed in as a parameter here, so the page's own genre tag is not used
        song_count += 1
        # now, finally, get the lyrics and put them into the DB
        song_soup = BeautifulSoup(str(artist_songs), 'html.parser')
        for song in song_soup.findAll('a'):
            if songs_seen <= num_songs_for_artist:
                # deep-dive into each link one by one and retrieve the lyrics
                song_url = song['href']
                song_url = song_url.encode('utf-8')
                song_name = song.text
                song_name = song_name.encode('utf-8')
                song_request = requests.get(song_url)
                song_lyrics = BeautifulSoup(song_request.text, "html.parser")
                if song_lyrics.img is not None:
                    img = song_lyrics.img.extract()
                lyrics_div = song_lyrics.find("p", { "id" : "songLyricsDiv" })
                if lyrics_div is not None:
                    lyric = lyrics_div.get_text()
                    lyric = lyric.encode('utf-8')
                    # normalize the lyrics: lowercase and strip everything except whitespace and word characters
                    lyric = re.sub(r"[^\s\w_]+", '', lyric.lower())
                    # note: the placeholder check must be lowercase, since the lyric was lowercased above
                    if (len(lyric) >= MIN_LYRICS_CHARCOUNT) and (lyric[:25] != 'we do not have the lyrics'):
                        # insert the lyrics into the DB
                        db_cursor.execute("""insert into song (lyrics, genre, url, artist_name, song_name) values (%s, %s, %s, %s, %s)""", (lyric, genre, song_url, artist_name, song_name))
                        db.commit()
                songs_seen += 1
            else:
                print "skipped one"
    print "DONE!"
def grab_lyrics(artist_list, genre):
    for artist in artist_list:
        artist_link = 'http://www.songlyrics.com/' + '-'.join(artist.split()) + '-lyrics/'
        grab_lyrics_artist(artist_link, 220, artist, genre)
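# For example, grab_lyrics(['dolly parton'], 'Country') builds the URL
# http://www.songlyrics.com/dolly-parton-lyrics/, fetches up to 220 songs,
# and tags each row inserted into the DB with the genre 'Country'.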
if __name__ == "__main__":
    # alphabets_list = string.lowercase
    # alphabets_list = ['t']
    # for alph in alphabets_list:
    #     # TODO: spawn a new thread for each call to extract_artist
    #     extract_artist(alph, artist_page=2, genres_considered=['Rock', 'Pop', 'Hip Hop/Rap', 'R&B;', 'Country', 'Jazz', 'Blues', 'Christian'])
    artist_list = ['dolly parton', 'garth brooks', 'johnny cash', 'willie nelson', 'merle haggard', 'shania twain', 'kenny rogers', 'conway twitty', 'carrie underwood', 'faith hill']
    # grab_lyrics_artist('http://www.songlyrics.com/michael-jackson-lyrics/', 200, 'Michael Jackson', 'Pop')
    grab_lyrics(artist_list, 'Country')