-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrapper.py
77 lines (67 loc) · 2.44 KB
/
scrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import requests
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
def get_all_titles(soup):
    """Extract the movie titles from an IMDb "lister" results page.

    Parameters:
        soup: BeautifulSoup document of an IMDb advanced-search/list page.

    Returns:
        list[str]: the anchor text of each <h3 class="lister-item-header">.
    """
    result_topics = []
    for header in soup.find_all('h3', {"class": "lister-item-header"}):
        link = header.find('a')
        # Read the anchor text directly. The original stringify/replace/split
        # trick broke for any title containing '<', '>' or '=' and appended
        # the literal string "None" when the header had no <a> tag.
        if link is not None:
            result_topics.append(link.get_text())
    return result_topics
def get_all_genres(soup):
    """Extract the raw genre string for each result on an IMDb list page.

    Parameters:
        soup: BeautifulSoup document of an IMDb advanced-search/list page.

    Returns:
        list[str]: the inner text of <span class="genre"> for each
        <p class="text-muted"> block that has one, e.g.
        '\\nAction, Adventure, Sci-Fi            '. Blocks without a
        genre span are skipped.
    """
    result_genres = []
    for block in soup.find_all("p", {"class": 'text-muted'}):
        span = block.find("span", {"class": "genre"})
        # The original guard `if genre == []` compared a str to a list and
        # never fired, so genre-less rows leaked through as the string '[]'.
        if span is None:
            continue
        result_genres.append(span.get_text())
    return result_genres
def post_process(genres):
    """Normalize scraped genre strings by removing newlines and all spaces.

    Parameters:
        genres: iterable of raw genre strings (e.g. '\\nAction, Drama ').

    Returns:
        list[str]: the same strings with every '\\n' and ' ' removed,
        e.g. 'Action,Drama'.
    """
    return [entry.replace("\n", "").replace(" ", "") for entry in genres]
def check_repeated_comma(x):
    """Keep only strings that list exactly three comma-separated genres.

    Parameters:
        x: a comma-joined genre string, e.g. 'Action,Drama,Thriller'.

    Returns:
        x unchanged when it splits into exactly three fields, otherwise
        NaN so the row can later be removed with DataFrame.dropna().
    """
    if len(x.split(',')) == 3:
        return x
    # np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0.
    return np.nan
def data_set(url):
    """Scrape one IMDb list page and append its rows to Dataset.csv.

    Builds a DataFrame with columns Movie / Primary / Secondary / Tertiary
    Genre, keeps only movies with exactly three genres, splits them into
    the three genre columns, appends to Dataset.csv (no header), and
    prints the resulting frame.

    Parameters:
        url: URL of an IMDb advanced-search/list page.
    """
    # Local renamed from `data_set` to `df`: the original shadowed the
    # function's own name with its local variable.
    df = pd.DataFrame(columns=["Movie", "Primary Genre",
                               "Secondary Genre", "Tertiary Genre"])
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    df["Movie"] = pd.Series(get_all_titles(soup))
    df["Primary Genre"] = pd.Series(post_process(get_all_genres(soup)))
    # NaN-out entries without exactly three genres so dropna() removes them.
    df["Primary Genre"] = df["Primary Genre"].apply(check_repeated_comma)
    df["Secondary Genre"] = df["Secondary Genre"].fillna('To be filled')
    df["Tertiary Genre"] = df["Tertiary Genre"].fillna('To be filled')
    # The original also did `df.loc[df['Primary Genre'] != np.NaN]`, which
    # is a no-op (NaN != NaN is always True, and np.NaN is gone in NumPy
    # 2.0); dropna() below is what actually filters the rows.
    df = df.dropna(how="any")
    df[['Primary Genre', 'Secondary Genre', 'Tertiary Genre']] = \
        df["Primary Genre"].str.split(',', expand=True)
    df.to_csv("Dataset.csv", mode='a', header=False, index=False)
    print(df)
import os

# Clear the console in a cross-platform way: the original ran only the
# Windows-specific 'cls' command, which fails on Linux/macOS.
os.system('cls' if os.name == 'nt' else 'clear')
print("IMDB Scrapper")
# Prompt once for the page count, then once per page for its URL,
# appending each page's rows to Dataset.csv via data_set().
number_of_pages = int(input("Number of pages to scrap: "))
for _ in range(number_of_pages):
    url = input("Enter the url: ")
    data_set(url)