Merge pull request #7 from officialpm/v0.1.8

🎉 Added Date
officialpm · Jun 10, 2021 · 9c23a38
2 parents ded193e + 7c64cfd
commit 9c23a38
Show file tree

Hide file tree

Showing 4 changed files with 16 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -27,7 +27,7 @@
 from scrape_amazon import get_reviews
 
 reviews = get_reviews('in','B078BNQ318') #returns dataframe
-#Reviewer, Rating, Title, Description
+#Reviewer, Rating, Title, Description, Date
 ```
 
 ### CLI
@@ -52,11 +52,11 @@ output_path  output_path for saving (B078BNQ318.csv)
 ## Output
 
 ```shell
-        Reviewer       Rating       Title       Description
-0     Parth Maniar       4      Great but ...     I change ... 
-1     Manpreet Singh     3      Delivers  ...     Great ph ... 
-2     Aparna Uniyal      1      Battery/H ...     I have   ... 
-3     Rahul              5      Great but ...     On the f ... 
+        Reviewer       Rating       Title       Description          Date
+0     Parth Maniar       4      Great but ...     I change ...     05/24/2021
+1     Manpreet Singh     3      Delivers  ...     Great ph ...     05/24/2021
+2     Aparna Uniyal      1      Battery/H ...     I have   ...     05/24/2021
+3     Rahul              5      Great but ...     On the f ...     05/24/2021
 ```
 ## Want to contribute?
 To get more information on contributing, go to the 

diff --git a/scrape_amazon/util/scrape.py b/scrape_amazon/util/scrape.py
@@ -1,5 +1,6 @@
 import math
 import re
+import datefinder
 import pandas as pd
 from bs4 import BeautifulSoup
 from p_tqdm import p_map
@@ -34,9 +35,11 @@ def extractPage(url: str) -> str:
             pageNotLoaded = False
     reviewers = []
     ratings = []
+    ratingsDate = []
     reviewDescriptions = []
     reviewTitles = []
     reviewrsSpan = productPage.findAll("span", {"class": "a-profile-name"})
+    reviewDate = productPage.findAll("span", {"class": "review-date"})
     ratingsSpan = productPage.findAll("i", {"class": "review-rating"})
     reviewTitlesSpan = productPage.findAll("a", {"class": "review-title-content"})
     reviewDescriptionSpan = productPage.findAll(
@@ -48,6 +51,8 @@ def extractPage(url: str) -> str:
     for i in range(2, len(reviewrsSpan)):
         reviewers.append(reviewrsSpan[i].get_text())
         ratings.append(int(ratingsSpan[i].get_text()[0]))
+        matches = datefinder.find_dates(reviewDate[i].get_text())
+        ratingsDate.append(list(matches)[0].strftime("%m/%d/%Y"))
 
     for i in range(0, len(reviewTitlesSpan)):
         reviewTitles.append(reviewTitlesSpan[i].get_text())
@@ -63,6 +68,7 @@ def extractPage(url: str) -> str:
         "ratings": ratings,
         "reviewTitles": reviewTitles,
         "reviewDescriptions": reviewDescriptions,
+        "date": ratingsDate,
     }
 
 
@@ -107,6 +113,7 @@ def scrape_reviews(url):
     productReviewsData["Rating"] = res["ratings"]
     productReviewsData["Title"] = res["reviewTitles"]
     productReviewsData["Description"] = res["reviewDescriptions"]
+    productReviewsData["Date"] = res["date"]
     # productReviewsData["link"] = url
     # productReviewsData["Product Title"] = pageTitle
 

diff --git a/setup.cfg b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = scrape_amazon
-version = 0.1.7
+version = 0.1.8
 description = Scrape Amazon Reviews smoothly.
 license = MIT
 author = Parth Maniar

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="scrape_amazon",
-    version="0.1.7",
+    version="0.1.8",
     description="Scrape Amazon Reviews",
     url="http://github.com/officialpm/scrape-amazon",
     author="Parth Maniar",
@@ -34,6 +34,7 @@
         "p_tqdm",
         "my_fake_useragent",
         "requests",
+        "datefinder"
     ],
     entry_points={
         "console_scripts": ["scrape-amazon=scrape_amazon.cli:get_reviews_cli"],