Skip to content

Commit

Permalink
Merge pull request #7 from officialpm/v0.1.8
Browse files Browse the repository at this point in the history
🎉 Added Date
  • Loading branch information
officialpm authored Jun 10, 2021
2 parents ded193e + 7c64cfd commit 9c23a38
Show file tree
Hide file tree
Showing 4 changed files with 16 additions and 8 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
from scrape_amazon import get_reviews

reviews = get_reviews('in','B078BNQ318') #returns dataframe
#Reviewer, Rating, Title, Description
#Reviewer, Rating, Title, Description, Date
```

### CLI
Expand All @@ -52,11 +52,11 @@ output_path output_path for saving (B078BNQ318.csv)
## Output

```shell
Reviewer Rating Title Description
0 Parth Maniar 4 Great but ... I change ...
1 Manpreet Singh 3 Delivers ... Great ph ...
2 Aparna Uniyal 1 Battery/H ... I have ...
3 Rahul 5 Great but ... On the f ...
Reviewer Rating Title Description Date
0 Parth Maniar 4 Great but ... I change ... 05/24/2021
1 Manpreet Singh 3 Delivers ... Great ph ... 05/24/2021
2 Aparna Uniyal 1 Battery/H ... I have ... 05/24/2021
3 Rahul 5 Great but ... On the f ... 05/24/2021
```
## Want to contribute?
To get more information on contributing, go to the
Expand Down
7 changes: 7 additions & 0 deletions scrape_amazon/util/scrape.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import math
import re
import datefinder
import pandas as pd
from bs4 import BeautifulSoup
from p_tqdm import p_map
Expand Down Expand Up @@ -34,9 +35,11 @@ def extractPage(url: str) -> str:
pageNotLoaded = False
reviewers = []
ratings = []
ratingsDate = []
reviewDescriptions = []
reviewTitles = []
reviewrsSpan = productPage.findAll("span", {"class": "a-profile-name"})
reviewDate = productPage.findAll("span", {"class": "review-date"})
ratingsSpan = productPage.findAll("i", {"class": "review-rating"})
reviewTitlesSpan = productPage.findAll("a", {"class": "review-title-content"})
reviewDescriptionSpan = productPage.findAll(
Expand All @@ -48,6 +51,8 @@ def extractPage(url: str) -> str:
for i in range(2, len(reviewrsSpan)):
reviewers.append(reviewrsSpan[i].get_text())
ratings.append(int(ratingsSpan[i].get_text()[0]))
matches = datefinder.find_dates(reviewDate[i].get_text())
ratingsDate.append(list(matches)[0].strftime("%m/%d/%Y"))

for i in range(0, len(reviewTitlesSpan)):
reviewTitles.append(reviewTitlesSpan[i].get_text())
Expand All @@ -63,6 +68,7 @@ def extractPage(url: str) -> str:
"ratings": ratings,
"reviewTitles": reviewTitles,
"reviewDescriptions": reviewDescriptions,
"date": ratingsDate,
}


Expand Down Expand Up @@ -107,6 +113,7 @@ def scrape_reviews(url):
productReviewsData["Rating"] = res["ratings"]
productReviewsData["Title"] = res["reviewTitles"]
productReviewsData["Description"] = res["reviewDescriptions"]
productReviewsData["Date"] = res["date"]
# productReviewsData["link"] = url
# productReviewsData["Product Title"] = pageTitle

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scrape_amazon
version = 0.1.7
version = 0.1.8
description = Scrape Amazon Reviews smoothly.
license = MIT
author = Parth Maniar
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="scrape_amazon",
version="0.1.7",
version="0.1.8",
description="Scrape Amazon Reviews",
url="http://github.com/officialpm/scrape-amazon",
author="Parth Maniar",
Expand Down Expand Up @@ -34,6 +34,7 @@
"p_tqdm",
"my_fake_useragent",
"requests",
"datefinder"
],
entry_points={
"console_scripts": ["scrape-amazon=scrape_amazon.cli:get_reviews_cli"],
Expand Down

0 comments on commit 9c23a38

Please sign in to comment.