1
1
import math
2
2
import re
3
+ import datefinder
3
4
import pandas as pd
4
5
from bs4 import BeautifulSoup
5
6
from p_tqdm import p_map
@@ -34,9 +35,11 @@ def extractPage(url: str) -> str:
34
35
pageNotLoaded = False
35
36
reviewers = []
36
37
ratings = []
38
+ ratingsDate = []
37
39
reviewDescriptions = []
38
40
reviewTitles = []
39
41
reviewrsSpan = productPage .findAll ("span" , {"class" : "a-profile-name" })
42
+ reviewDate = productPage .findAll ("span" , {"class" : "review-date" })
40
43
ratingsSpan = productPage .findAll ("i" , {"class" : "review-rating" })
41
44
reviewTitlesSpan = productPage .findAll ("a" , {"class" : "review-title-content" })
42
45
reviewDescriptionSpan = productPage .findAll (
@@ -48,6 +51,8 @@ def extractPage(url: str) -> str:
48
51
for i in range (2 , len (reviewrsSpan )):
49
52
reviewers .append (reviewrsSpan [i ].get_text ())
50
53
ratings .append (int (ratingsSpan [i ].get_text ()[0 ]))
54
+ matches = datefinder .find_dates (reviewDate [i ].get_text ())
55
+ ratingsDate .append (list (matches )[0 ].strftime ("%m/%d/%Y" ))
51
56
52
57
for i in range (0 , len (reviewTitlesSpan )):
53
58
reviewTitles .append (reviewTitlesSpan [i ].get_text ())
@@ -63,6 +68,7 @@ def extractPage(url: str) -> str:
63
68
"ratings" : ratings ,
64
69
"reviewTitles" : reviewTitles ,
65
70
"reviewDescriptions" : reviewDescriptions ,
71
+ "date" : ratingsDate ,
66
72
}
67
73
68
74
@@ -107,6 +113,7 @@ def scrape_reviews(url):
107
113
productReviewsData ["Rating" ] = res ["ratings" ]
108
114
productReviewsData ["Title" ] = res ["reviewTitles" ]
109
115
productReviewsData ["Description" ] = res ["reviewDescriptions" ]
116
+ productReviewsData ["Date" ] = res ["date" ]
110
117
# productReviewsData["link"] = url
111
118
# productReviewsData["Product Title"] = pageTitle
112
119
0 commit comments