"""
Python script for scrapping YouTube video data given url of the videos.
Input: Query and number of videos for which you want to scrap data.
Output: csv file containing url, Timestamp, Title, Views, upload_date, Likes, Dislikes and Comments.
"""
# !pip install selenium  # uncomment to install selenium (e.g. in a notebook)
# import the necessary libraries
import time
import pandas as pd
from query import *
from datetime import datetime
from selenium import webdriver
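# The `query` module imported above is not included in this file; the script assumes
# it defines two names. A minimal sketch of what such a `query.py` might look like
# (the values below are illustrative, not from the original):
#     query = 'machine learning'  # search string for the YouTube search
#     no_of_videos = 50           # number of videos to scrape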
drivepath = "C:\chromedriver\chromedriver.exe" #path of chromedriver
# Chrome options to run Chrome headless (without opening a browser window)
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.add_argument('--incognito')
driver = webdriver.Chrome(drivepath, options=chrome_options)
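# Note: this script uses the Selenium 3 API (find_element_by_xpath and a positional
# driver path). Selenium 4+ removed both; an equivalent setup there would look like:
#     from selenium.webdriver.chrome.service import Service
#     from selenium.webdriver.common.by import By
#     driver = webdriver.Chrome(service=Service(drivepath), options=chrome_options)
#     element = driver.find_element(By.XPATH, '//*[@id="thumbnail"]')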
driver.get('https://www.youtube.com/results?search_query='+query)
print('You queried for:', driver.title)
# loop to extract the first n video links returned by the YouTube search
while True:
    driver.execute_script("window.scrollTo(0, window.scrollY + 5000);")
    tn = driver.find_elements_by_xpath('//*[@id="thumbnail"]')
    links = []
    for i in tn:
        links.append(i.get_attribute('href'))  # href can be None for non-video tiles
    if len(links) > no_of_videos + 10:  # collect 10 more than required, for safety
        break
Links = links[1:no_of_videos+1]
print('Total scraped links:', len(Links))
# function to return video data except comments (comment extraction is slow, so it gets a separate function)
def Scrap(url):
    """ Pass the url of the YouTube video you want to scrape data for (except comments).
    input: url of the video
    return: dictionary containing url,
            timestamp at which it was extracted,
            title of the video,
            views on the video,
            upload date,
            likes and dislikes.
    """
    dct = {}
    dct['url'] = url
    dct['Timestamp'] = datetime.now().strftime("%d/%m/%Y %H:%M:%S")  # time at which the url is scraped
    driver.get(url)
    time.sleep(2)  # wait 2 sec for the html to load
    try:
        title = driver.find_element_by_xpath('//*[@id="container"]/h1/yt-formatted-string')
        dct['Title'] = title.text
    except Exception:
        dct['Title'] = ''
    try:
        views = driver.find_element_by_xpath('//*[@id="count"]/yt-view-count-renderer')
        dct['Views'] = views.text
    except Exception:
        dct['Views'] = ''
    try:
        date = driver.find_element_by_xpath('//*[@id="date"]/yt-formatted-string')
        dct['upload_date'] = date.text
    except Exception:
        dct['upload_date'] = ''
    try:
        likes = driver.find_element_by_xpath('//*[@id="top-level-buttons"]/ytd-toggle-button-renderer[1]/a')
        dct['Likes'] = likes.text
    except Exception:
        dct['Likes'] = ''
    try:
        dislikes = driver.find_element_by_xpath('//*[@id="top-level-buttons"]/ytd-toggle-button-renderer[2]/a')
        dct['Dislikes'] = dislikes.text
    except Exception:
        dct['Dislikes'] = ''
    return dct
# print(Scrap('https://youtu.be/XhZ1w3saRiI'))  # uncomment to test and see a sample output
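# For reference, the dictionary returned by Scrap() has this shape (values below are
# illustrative only, not real scraped data):
#     {'url': '...', 'Timestamp': '01/01/2021 12:00:00', 'Title': '...',
#      'Views': '1,234 views', 'upload_date': 'Jan 1, 2021', 'Likes': '1K', 'Dislikes': '10'}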
# function to return the comments on a video (extraction is slow, hence a separate function)
def comments(url):
    """ Pass the url of the YouTube video you want to scrape comments for.
    input: url of the video.
    return: dictionary containing url and the comments on the video, separated by ' || '
    """
    dct = {}
    driver.get(url)
    time.sleep(2)  # wait 2 sec for the html to load
    dct['url'] = url
    driver.execute_script("window.scrollTo(0, window.scrollY + 50000);")
    comment_section = driver.find_element_by_xpath('//*[@id="comments"]')
    driver.execute_script("arguments[0].scrollIntoView();", comment_section)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")
    # loop until all comments are loaded on the page
    while True:
        driver.execute_script("window.scrollTo(0, document.documentElement.scrollHeight);")
        time.sleep(2)  # wait 2 sec for the html to load
        new_height = driver.execute_script("return document.documentElement.scrollHeight")  # compare new scroll height with the last one
        if new_height == last_height:
            break
        last_height = new_height
    try:
        usernames = driver.find_elements_by_xpath('//*[@id="author-text"]')
        comment_texts = driver.find_elements_by_xpath('//*[@id="content-text"]')
    except Exception:
        usernames = []
        comment_texts = []
    Comments = []
    for user, comment_text in zip(usernames, comment_texts):
        Comments.append("{} - {}".format(user.text, comment_text.text))
    dct['Comments'] = ' || '.join(Comments)  # join the list of comments into one string separated by ' || '
    return dct
# print(comments('https://www.youtube.com/watch?v=ytCWVUmb0d0'))  # uncomment to test and see a sample output
# loop to scrape data (except comments) for every collected link
print('Starting data-scraping loop, will run', len(Links), 'times.')
data = []
for count, link in enumerate(Links, start=1):
    try:
        dt = Scrap(link)
        print(dt)
        data.append(dt)
        print('completed -', count)
    except Exception:
        print('error at -', count)
data_without_comments = pd.DataFrame(data)  # convert to a dataframe
print('Dataframe containing data')
print(data_without_comments)
# loop to scrape comments for every collected link
print('Starting comment-scraping loop, will run', len(Links), 'times.')
com = []
for count, link in enumerate(Links, start=1):
    try:
        dt = comments(link)
        com.append(dt)
        print(dt)
        print('comments completed -', count)
    except Exception:
        print('comments error -', count)
data_comments = pd.DataFrame(com)  # convert to a dataframe
print('Dataframe containing comments')
print(data_comments)
# merge the two dataframes on the common url column (outer join keeps rows that failed in one of the loops)
df = data_without_comments.merge(data_comments, on='url', how='outer')
print('Final data')
print(df)
# write the combined data to a csv file
df.to_csv('YouTube_data.csv', index=False)
driver.quit()  # close the browser and release the driver
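# The saved csv can later be reloaded for analysis with pandas, e.g.:
#     df = pd.read_csv('YouTube_data.csv')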
""" Thanks have a great life ahead."""