-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmetadata.py
284 lines (244 loc) · 9.22 KB
/
metadata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
from __future__ import annotations
import os
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from pytube import YouTube, extract
from shazamio import Shazam
import pandas as pd
from fastapi import HTTPException
from pathlib import Path
# The YouTube Data API key must be provided through the API_KEY environment
# variable (both for local runs and inside docker).
API_KEY = os.environ.get("API_KEY")
# An empty value is just as unusable as a missing one, so both are rejected at
# import time instead of failing later on the first API request.
if not API_KEY:
    raise RuntimeError("Error: API_KEY for youtube api must be set as an environmental variable")
async def shazam_recognize(video: Video, audio_file: str) -> None:
    """
    NAME: shazam_recognize
    DESC: Sends the audio file to the shazam api and stores the recognized track's
          title and subtitle on the given video
    PRMS: video: Video - object that receives audio_title and audio_subtitle
          audio_file: str - value forwarded to Shazam.recognize_song
    RTRN: None
    RAISES: HTTPException (status 200) when the shazam call fails or nothing is recognized
    """
    try:
        out = await Shazam().recognize_song(audio_file)
    except Exception as e:
        # NOTE(review): failures are reported with status 200; kept unchanged
        # because API clients may rely on it - confirm whether this is intended.
        raise HTTPException(status_code=200, detail=str(e)) from e

    # A response without matches or without a track block is handled as
    # "not recognized" instead of escaping as an unhandled KeyError.
    track = out.get('track')
    if not out.get('matches') or not track:
        raise HTTPException(status_code=200, detail="Audio not recognized in shazam")

    # get track data
    video.audio_title = track['title']
    video.audio_subtitle = track['subtitle']
class Database():
    """
    NAME: Database
    DESC: Class which handles the storage and the interaction with the csv file. It loads
          the stored videos into a pandas dataframe on construction and rewrites the csv
          file after every insert or deletion
    """

    # column layout used whenever the store starts out without any data
    _COLUMNS = ['id', 'url', 'title', 'viewCount', 'publishedAt', 'audio_title', 'audio_subtitle']

    # in-memory copy of the csv content
    videos: pd.DataFrame

    def __init__(self, csv_filename: str) -> None:
        """
        NAME: __init__
        DESC: Remember the csv path, create the YouTube api client and load the stored videos
        PRMS: csv_filename: str
        RTRN: None
        """
        self.csv_filename = csv_filename
        # create the YouTube application using the API key and keep it on the instance
        self.yt_api = build("youtube", "v3", developerKey=API_KEY)
        self._load_videos(csv_filename)

    def _empty_frame(self) -> pd.DataFrame:
        """
        NAME: _empty_frame
        DESC: Build a dataframe without rows that already has the expected columns
        PRMS: -
        RTRN: pd.DataFrame
        """
        return pd.DataFrame(columns=self._COLUMNS)

    def _load_videos(self, csv_filename: str) -> None:
        """
        NAME: _load_videos
        DESC: Fill self.videos from the csv file; the file is created when it does not exist
        PRMS: csv_filename: str
        RTRN: None
        """
        if not os.path.exists(csv_filename):
            try:
                Path(csv_filename).touch(exist_ok=True)
            except OSError as error:
                raise Exception("Error: cannot create csv file") from error
            # A freshly created store needs its columns too, otherwise the
            # first lookup on the 'url' column raises a KeyError.
            self.videos = self._empty_frame()
        elif os.stat(csv_filename).st_size > 0:
            self.read_from_csv(csv_filename)
        else:
            self.videos = self._empty_frame()

    def insert(self, video: Video) -> dict:
        """
        NAME: insert
        DESC: Get a Video class instance and save its metadata in the dataframe and in the csv
        PRMS: video: Video
        RTRN: dict with the keys "success" and "id"
        """
        # pd.concat is the public replacement for the private DataFrame._append
        new_row = pd.DataFrame([video.get_metadata()])
        self.videos = pd.concat([self.videos, new_row], ignore_index=True)
        # after insert we have to update csv file
        self.save_in_csv(self.csv_filename)
        return {"success": True, "id": video.id}

    def get_video(self, video_url: str) -> Video:
        """
        NAME: get_video
        DESC: Create a Video class instance for the given url, unless the url is already stored
        PRMS: video_url: str
        RTRN: Video
        RAISES: HTTPException (status 404) when the url is already stored
        """
        already_stored = (
            'url' in self.videos.columns
            and bool((self.videos['url'] == video_url).any())
        )
        if already_stored:
            # NOTE(review): 409 would describe a duplicate better than 404; the
            # status code is kept because API clients may depend on it.
            raise HTTPException(status_code=404, detail="Video already exists")
        return Video(video_url, self.yt_api)

    def read_from_csv(self, filename: str) -> None:
        """
        NAME: read_from_csv
        DESC: Read already saved videos from the csv and load them into memory as a pandas dataframe
        PRMS: filename: str
        RTRN: None
        """
        try:
            self.videos = pd.read_csv(filename)
        except pd.errors.EmptyDataError:
            # the file has bytes but no header or rows (e.g. only blank lines)
            self.videos = self._empty_frame()
        except OSError as error:
            raise Exception(str(error)) from error

    def save_in_csv(self, filename: str) -> None:
        """
        NAME: save_in_csv
        DESC: Save the records of the dataframe in the csv file
        PRMS: filename: str
        RTRN: None
        """
        # nothing to write when the dataframe was never initialised
        if self.videos is None:
            return
        try:
            self.videos.to_csv(filename, index=False)
        except OSError as error:
            raise Exception(str(error)) from error

    def drop(self, video_id: str) -> dict:
        """
        NAME: drop
        DESC: Drop a record from the local db / dataframe and update the csv file
        PRMS: video_id: str
        RTRN: dict with the keys "success" and "msg" for the api response
        RAISES: HTTPException - status 404 when the id is not stored,
                status 500 when the csv file cannot be rewritten
        """
        videos = self.videos
        matches = videos['id'] == video_id if 'id' in videos.columns else None
        if matches is None or not matches.any():
            raise HTTPException(status_code=404, detail=f"Error: Video with id {video_id} not found")

        # drop record from the dataframe
        videos.drop(videos[matches].index, inplace=True)
        # A failed csv update is reported as what it is; it used to be
        # returned as a misleading "not found" error.
        try:
            self.save_in_csv(self.csv_filename)
        except Exception as error:
            raise HTTPException(status_code=500, detail=f"Error: could not update csv file: {error}") from error
        return {"success": True, "msg": "Video dropped"}
class Video():
    """
    NAME: Video
    DESC: Class to download, hold and handle data of an individual video from youtube
    """

    # user-facing messages for the error reasons of the YouTube Data API that are handled explicitly
    _API_ERROR_DETAILS = {
        "rateLimitExceeded": "Too many requests. Rate limit exceeded",
        "quotaExceeded": "Quota have been exceeded",
        "forbidden": "Access is forbidden. Please check your google developers page or your API KEY",
    }

    def __init__(self, video_url: str, yt_api) -> None:
        """
        NAME: __init__
        DESC: Class constructor method, retrieves the metadata of the video and downloads
              its video and audio streams
        PRMS: video_url: str, yt_api: YouTube api client used for the metadata request
        RTRN: None
        """
        # Recognition results stay None until recognize() fills them in, so
        # get_metadata() also works for videos that were not recognized.
        self.audio_title = None
        self.audio_subtitle = None
        self.audio_file = None
        self._retrieve_data(video_url, yt_api)

    @staticmethod
    def _http_error_reason(error):
        """
        NAME: _http_error_reason
        DESC: Read the 'reason' of the first entry in error.error_details; returns None
              when the details are missing, empty or not a list of dicts
        ARGS: error
        RETN: str or None
        """
        details = getattr(error, 'error_details', None)
        if isinstance(details, (list, tuple)) and details and isinstance(details[0], dict):
            return details[0].get('reason')
        return None

    def _retrieve_data(self, video_url, yt_api):
        """
        NAME: _retrieve_data
        DESC: Gets the video url as an input, fills the instance variables with the data
              retrieved from the api and downloads the video and audio streams
        ARGS: video_url, yt_api
        RETN: None
        RAISES: HTTPException (status 404) when the api request fails or finds no video
        """
        # extract video id from the url
        video_id = extract.video_id(video_url)

        # use yt api to get video statistics and snippet
        try:
            response = yt_api.videos().list(
                part='statistics,snippet',
                id=video_id
            ).execute()
        except HttpError as e:
            detail = self._API_ERROR_DETAILS.get(
                self._http_error_reason(e),
                "Error: could not retrieve data from YouTube Data API"
            )
            raise HTTPException(status_code=404, detail=detail) from e

        # if there are no items found, then the video url was not found
        if not response.get('items'):
            raise HTTPException(status_code=404, detail="Error: Could not retrieve data from YouTube Data API for this url")

        # get response data
        info = response['items'][0]
        statistics = info['statistics']
        snippet = info['snippet']

        # fill instance variables with the data retrieved
        self.id = video_id
        self.url = video_url
        self.title = snippet['title']
        # a missing view count is stored as None instead of raising a KeyError
        self.viewCount = statistics.get('viewCount')
        self.publishedAt = snippet['publishedAt']

        # download video and audio
        self.download_video()
        self.download_audio()

    def get_metadata(self) -> dict:
        """
        NAME: get_metadata
        DESC: Return video info in a dictionary; the audio fields are None when they
              have not been set on the instance
        ARGS: -
        RETN: dict
        """
        return {
            "id": self.id,
            "url": self.url,
            "title": self.title,
            "viewCount": self.viewCount,
            "publishedAt": self.publishedAt,
            "audio_title": getattr(self, "audio_title", None),
            "audio_subtitle": getattr(self, "audio_subtitle", None),
        }

    def download_video(self) -> None:
        """
        NAME: download_video
        DESC: Download the progressive mp4 stream that sorts first by descending resolution
              into the "streams" directory using pytube library
        ARGS: -
        RETN: None
        RAISES: HTTPException (status 404) when the download fails
        """
        try:
            stream = (
                YouTube(self.url).streams
                .filter(progressive=True, file_extension='mp4')
                .order_by('resolution')
                .desc()
                .first()
            )
            if stream is None:
                raise LookupError("no progressive mp4 stream available")
            stream.download(output_path="streams")
        except Exception as error:
            raise HTTPException(status_code=404, detail="Error: Cannot download video") from error

    def download_audio(self) -> None:
        """
        NAME: download_audio
        DESC: Download the audio-only webm stream that sorts first by descending 'abr'
              into the "streams" directory using pytube library and remember the
              value returned by the download in self.audio_file
        ARGS: -
        RETN: None
        RAISES: HTTPException (status 404) when the download fails
        """
        try:
            stream = (
                YouTube(self.url).streams
                .filter(only_audio=True, file_extension='webm')
                .order_by('abr')
                .desc()
                .first()
            )
            if stream is None:
                raise LookupError("no webm audio stream available")
            audio_file = stream.download(output_path="streams")
        except Exception as error:
            raise HTTPException(status_code=404, detail="Error: Cannot download audio") from error
        self.audio_file = audio_file

    async def recognize(self, audio_file) -> None:
        """
        NAME: recognize
        DESC: Finds the song by the audio using the shazam api
        PRMS: audio_file
        RTRN: None
        """
        # recognize song
        await shazam_recognize(self, audio_file)