#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from urlparse import urlparse
import requests, csv, sys

# UTF-8 for all (Python 2 workaround)
reload(sys)
sys.setdefaultencoding('utf8')

# Helpful terminal colors
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
## Downloader
def download(url):
    print(bcolors.WARNING + bcolors.BOLD + 'Downloading: ' + bcolors.FAIL + url + bcolors.ENDC)
    response = requests.get(url)
    if response.status_code == 200:
        return response.text
    else:
        sys.exit(bcolors.FAIL + bcolors.BOLD + 'Nothing here (HTTP %d)' % response.status_code + bcolors.ENDC)
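# Example call (URL shape matches the artist URL built below):
# download('http://genius.com/artists/East') returns the page HTML,
# or exits the whole script on any non-200 status.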
## First Crawl: collect every [title, url] song link for the artist
def extract_url(html):
    soup = BeautifulSoup(html, 'lxml')
    next_link = soup.find('a', {'class': 'next_page'})
    listLink = soup.find('section', {'class': 'all_songs'})
    link = listLink.find_all('a', {'class': 'song_name'})
    all_link = []
    # Links from the first page
    for x in link:
        title_Song = x.get('title')
        url_Song = x.get('href')
        all_link.append([title_Song, url_Song])
    if next_link is None:
        print('ONLY ONE PAGE')
    else:
        print('MULTIPLE PAGES')
        next_link_URL = host + next_link.get('href')
        while next_link is not None:
            next_to_list = download(next_link_URL)
            soup = BeautifulSoup(next_to_list, 'lxml')
            linkList = soup.find_all('a', {'class': 'song_name'})
            for x in linkList:
                title_Song = x.get('title')
                url_Song = x.get('href')
                all_link.append([title_Song, url_Song])
            # Stop once the current page has no next_page link
            next_link = soup.find('a', {'class': 'next_page'})
            if next_link is None:
                print('End of loop!')
                break
            next_link_URL = host + next_link.get('href')
    return all_link
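# extract_url() returns a list of [title, url] pairs, e.g.
# [['Some Song', 'http://genius.com/...-lyrics'], ...] (values illustrative).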
## Second Crawl
def extract_lyrics(html):
    soup = BeautifulSoup(html, 'lxml')
    divLyrics = soup.find('div', {'class': 'lyrics'})
    divLyrics = divLyrics.find_all('p')
    return divLyrics
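# extract_lyrics() returns the <p> Tag objects inside the lyrics div;
# the main loop below calls .getText() on each one.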
# Extract CSV
def write_csv(filename, data):
    with open('corpus/' + filename + '.csv', 'a') as csvfile:
        dataWriter = csv.writer(csvfile, delimiter=';')
        dataWriter.writerows(data)
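# Resulting CSV layout, one row per lyric paragraph (values illustrative):
# Some Song;http://genius.com/...-lyrics;East;first paragraph of lyrics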
if len(sys.argv) > 1:
    artist_name = str(sys.argv[1])
else:
    sys.exit(bcolors.FAIL + bcolors.BOLD + 'Please pass one argument: an artist name as seen on RapGenius' + bcolors.ENDC)

## Used for debugging
# artist_name = 'East'
# artist_name = urlparse(ArtistURL).path.split('/artists/',1)[-1]
host = 'http://genius.com'
ArtistURL = host + '/artists/' + artist_name
get = download(ArtistURL)
songListURL = extract_url(get)
all_songs = []
if songListURL is None:
    sys.exit(bcolors.FAIL + bcolors.BOLD + 'Nothing here' + bcolors.ENDC)
else:
    for x in songListURL:
        title_Song = x[0]
        url_Song = x[1]
        print(url_Song)
        getTracks = download(url_Song)
        lyricsData = extract_lyrics(getTracks)
        for y in lyricsData:
            text = y.getText().encode('utf8')
            all_songs.append([title_Song, url_Song, artist_name, text])
    print('Number of tracks: ' + str(len(all_songs)))
    for song in all_songs:
        print(song[0])
    # Write the whole corpus in one pass
    write_csv(artist_name, all_songs)
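# Usage example ('East' is the artist name used above for debugging):
#   python2.7 main.py East
# appends every lyric paragraph to corpus/East.csv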