-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_ground_truth.py
executable file
·89 lines (72 loc) · 2.89 KB
/
generate_ground_truth.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
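"""
Generate the ground-truth CSV files used in the learning process.

Reads tracks.csv (and genres.csv, to resolve a missing top genre) and writes
training_set_medium.csv with one "track_id,genre" row per track. That file is
then filtered down to the tracks whose mp3 is actually present under the
fma_medium directory, and the result is written to dataset_medium.csv.
"""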
import csv
import glob
def getTopgenre(id_subgenre):
    """Return the title of the top-level genre that a genre id belongs to.

    The row of ``genres.csv`` (read from the current working directory) whose
    first column equals ``id_subgenre`` is located; its last column is taken
    as the id of the top-level genre, and the second-to-last column of the
    row carrying that id is returned.
    NOTE(review): presumably the FMA ``genres.csv`` layout
    (genre_id, ..., title, top_level) -- confirm against the data file.

    Args:
        id_subgenre: Genre id as a string; compared verbatim with the first
            column of each row.

    Returns:
        The top-level genre title, or ``None`` when the genre id or the row
        of its top-level genre cannot be found.
    """
    # Read the table once, inside a context manager, so the handle is always
    # closed (the previous version opened the file twice and leaked both).
    with open('genres.csv', newline='') as handle:
        # Blank lines parse to [] and would break the row[0] lookups below.
        rows = [row for row in csv.reader(handle, delimiter=",") if row]

    for row in rows:
        if row[0] == id_subgenre:
            top_id = row[-1]
            break
    else:
        return None

    for row in rows:
        if row[0] == top_id:
            return row[-2]
    return None
def extract_trackid_topgenres():
    """Write ``training_set_medium.csv`` with one ``track_id,genre`` row per track.

    Reads ``tracks.csv`` (UTF-8) from the current working directory, skipping
    its first three rows. For every following row the track id is the first
    column and the genre is the 13th column from the end. When that genre is
    empty, the first id listed in the 12th column from the end (a bracketed,
    comma-separated list) is resolved through ``getTopgenre``. Rows that
    still have no genre are dropped.
    NOTE(review): column positions presumably follow the FMA ``tracks.csv``
    layout with three header rows -- confirm against the data file.

    After the file is written, the per-genre row counts are printed, one
    ``name count`` line per genre followed by the whole mapping.
    """
    genre_counts = {
        'Hip-Hop': 0, 'Pop': 0, 'Rock': 0, 'Experimental': 0, 'Folk': 0,
        'Jazz': 0, 'Electronic': 0, 'Spoken': 0, 'International': 0,
        'Soul-RnB': 0, 'Blues': 0, 'Country': 0, 'Classical': 0,
        'Old-Time_Historic': 0, 'Instrumental': 0, 'Easy Listening': 0,
    }

    # Context managers guarantee both handles are closed even if a row fails.
    with open('tracks.csv', encoding="utf8", newline='') as source, \
            open('training_set_medium.csv', 'w') as out:
        reader = csv.reader(source, delimiter=",")

        # Discard the three leading header rows.
        for _ in range(3):
            next(reader, None)

        for row in reader:
            # Blank or truncated rows cannot supply the genre columns.
            if len(row) < 13:
                continue

            track_id = row[0]
            genre = row[-13]
            if genre == '':
                # No top genre recorded: derive it from the first sub-genre id
                # of the "[a, b, ...]" list.
                first_sub = row[-12][1:-1].split(',')[0]
                genre = getTopgenre(first_sub)

            if not genre:
                continue

            # Normalise the only label that contains spaces and a slash.
            if genre == 'Old-Time / Historic':
                genre = 'Old-Time_Historic'

            out.write(track_id + ',' + genre + '\n')
            # .get() keeps an unlisted genre from raising KeyError after its
            # row has already been written.
            genre_counts[genre] = genre_counts.get(genre, 0) + 1

    for name, total in genre_counts.items():
        print(name, total)
    print(genre_counts)
def generate_true_dataset():
    """Generate the effective training set as ``dataset_medium.csv``.

    Not every track listed in ``training_set_medium.csv`` has an audio file
    in the local dataset, so each ``track_id,genre`` row is kept only when a
    file named ``<track_id padded to 6 digits>.mp3`` exists in some direct
    sub-folder of ``fma_medium``. Kept rows are written with the zero-padded
    id. All paths are relative to the current working directory.
    """
    # Context managers close both files even when a row raises.
    with open('training_set_medium.csv', newline='') as source, \
            open('dataset_medium.csv', 'w') as out:
        for row in csv.reader(source, delimiter=","):
            # Skip blank or incomplete rows instead of failing on row[1].
            if len(row) < 2:
                continue

            # mp3 file names are zero-padded to six digits.
            track_id = row[0].zfill(6)

            # The glob is only an existence check, so write a single row even
            # if several sub-folders match. ``recursive=True`` was dropped:
            # it has no effect on a pattern that contains no ``**``.
            if glob.glob('fma_medium/*/' + track_id + '.mp3'):
                out.write(track_id + ',' + row[1] + '\n')
def count_genres(file):
    """Print how many rows of a ``track_id,genre`` CSV fall under each genre.

    Args:
        file: Path of a comma-separated file whose second column is the
            genre label.

    One ``name count`` line is printed per genre. The sixteen predefined
    genres are always listed, in a fixed order and possibly with a count of
    0; any other label found in the file is listed after them.
    """
    genre_counts = {
        'Hip-Hop': 0, 'Pop': 0, 'Rock': 0, 'Experimental': 0, 'Folk': 0,
        'Jazz': 0, 'Electronic': 0, 'Spoken': 0, 'International': 0,
        'Soul-RnB': 0, 'Blues': 0, 'Country': 0, 'Classical': 0,
        'Old-Time_Historic': 0, 'Instrumental': 0, 'Easy Listening': 0,
    }

    # Close the handle deterministically instead of leaking it.
    with open(file, newline='') as handle:
        for row in csv.reader(handle, delimiter=","):
            # Blank or one-column rows carry no genre.
            if len(row) < 2:
                continue
            # .get() tolerates labels outside the predefined set.
            genre_counts[row[1]] = genre_counts.get(row[1], 0) + 1

    for name, total in genre_counts.items():
        print(name + ' ' + str(total))
def main():
    """Run the two generation steps in order.

    First the ``track_id,genre`` list is extracted from the metadata, then it
    is filtered down to the tracks whose audio file is available locally.
    """
    # Optional sanity check, disabled by default:
    # count_genres('dataset_medium.csv')
    extract_trackid_topgenres()
    generate_true_dataset()


main()