# Continuous integration workflow: checks formatting (black), lints (flake8)
# and runs the test suite (pytest) for pushes and pull requests targeting
# master. Changes that only touch Markdown files do not trigger a run.
name: Build

on:
  push:
    branches: [ master ]
    paths-ignore:
      - '**.md'
  pull_request:
    branches: [ master ]
    paths-ignore:
      - '**.md'

jobs:
  build:
    # Take the runner from the matrix. With a hard-coded runner every
    # os/python combination below would execute on the same Ubuntu image,
    # so macOS and Windows would never actually be exercised.
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        python-version: [3.6, 3.7, 3.8]
    steps:
      - name: Checkout
        uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install black flake8 pytest
          pip install -r requirements.txt
      - name: Format with black
        run: |
          black . --check
      - name: Lint with flake8
        run: |
          flake8 . --ignore=E203 --count --show-source --statistics --max-line-length=90
      - name: Test with pytest
        run: |
          pytest
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..bf0afe9 --- /dev/null +++ b/README.md @@ -0,0 +1,100 @@ +![Build](https://github.com/cowboy-bebug/app-store-scraper/workflows/Build/badge.svg) +[![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg?style=flat-square)](https://github.com/cowboy-bebug/app-store-scraper/pulls) +Code style: black + +``` + ___ _____ _ _____ + / _ \ / ___| | / ___| + / /_\ \_ __ _ __ \ `--.| |_ ___ _ __ ___ \ `--. ___ _ __ __ _ _ __ ___ _ __ + | _ | '_ \| '_ \ `--. \ __/ _ \| '__/ _ \ `--. \/ __| '__/ _` | '_ \ / _ \ '__| + | | | | |_) | |_) | /\__/ / || (_) | | | __/ /\__/ / (__| | | (_| | |_) | __/ | + \_| |_/ .__/| .__/ \____/ \__\___/|_| \___| \____/ \___|_| \__,_| .__/ \___|_| + | | | | | | + |_| |_| |_| +``` + +# Quickstart + +```console +pip3 install app-store-scraper +``` + +```python +from app_store_scraper import AppStore +from pprint import pprint + +fortnite = AppStore(country="nz", app_name="fortnite", app_id=1261357853) +fortnite.review(how_many=20) + +pprint(fortnite.reviews) +pprint(fortnite.reviews_count) +``` + +# Extra Details + +Let's continue from the code example used in [Quickstart](#quickstart). + + +## Instantiation + +There are three required arguments, `country, app_name, app_id`. 
+ +```pycon +>>> fortnite +AppStore(country=nz, app_name=fortnite, app_id=1261357853) +``` + +These are required to create a URL for the App Store landing page, which can be displayed by the private field, `landing_url` like below: + +```pycon +>>> fortnite.landing_url +'https://apps.apple.com/nz/app/fortnite/id1261357853' +``` + +There are optional arguments used to override log settings: + +- `log_format` + - passed directly to `logging.basicConfig(format=log_format)` + - default is `"%(asctime)s [%(levelname)s] %(name)s - %(message)s"` +- `log_level` + - passed directly to `logging.basicConfig(level=log_level)` + - default is `"INFO"` +- `log_interval` + - log is produced every 10 seconds (by default) as a "heartbeat" (useful for a long scraping session) + - default is `10` + + +## Fetching Review + +The maximum number of reviews fetched per request is 20. To minimise the number of calls, the limit of 20 is hardcoded. This means the `review()` method will always grab more than the `how_many` argument supplied with an increment of 20. + +```pycon +>>> fortnite.review(how_many=33) +>>> fortnite.reviews_count +40 +``` + +If `how_many` is not provided, `review()` will terminate after *all* reviews are fetched. + +**NOTE** the review count seen on the landing page differs from the actual number of reviews fetched. This is simply because only *some* users who rated the app also leave reviews. + + +## Review Data + +The fetched review data are loaded in memory and live inside `reviews` attribute as a list of dict. +```pycon +>>> fortnite.reviews +[{'userName': 'someone', 'rating': 5, 'date': datetime.datetime(... 
+``` + +Each review dictionary has the following schema: +```python +{ + "date": datetime.datetime, + "isEdited": bool, + "rating": int, + "review": str, + "title": str, + "userName": str + } +``` diff --git a/app_store_scraper/__init__.py b/app_store_scraper/__init__.py new file mode 100644 index 0000000..8ad7ff5 --- /dev/null +++ b/app_store_scraper/__init__.py @@ -0,0 +1,11 @@ +from .app_store import AppStore +from .__version__ import ( # noqa: F401 + __title__, + __version__, + __description__, + __author__, + __url__, + __license__, +) + +__all__ = ["AppStore"] diff --git a/app_store_scraper/__version__.py b/app_store_scraper/__version__.py new file mode 100644 index 0000000..22e6d51 --- /dev/null +++ b/app_store_scraper/__version__.py @@ -0,0 +1,7 @@ +__title__ = "app-store-scraper" +__version__ = "0.1.1" +__description__ = "Single API โ˜ App Store Review Scraper ๐Ÿงน" +__author__ = "Eric Lim" +__url__ = "https://github.com/cowboy-bebug/app-store-scraper" +__license__ = "MIT" +__keywords__ = ["app store", "ios", "review", "scraping", "scraper"] diff --git a/app_store_scraper/app_store.py b/app_store_scraper/app_store.py new file mode 100644 index 0000000..041fd85 --- /dev/null +++ b/app_store_scraper/app_store.py @@ -0,0 +1,117 @@ +import logging +import re +import requests +import sys +import time +from datetime import datetime +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +from .base import Base + +logger = logging.getLogger("AppStore") + + +class AppStore(Base): + def __init__( + self, + country, + app_name, + app_id, + log_format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", + log_level="INFO", + log_interval=10, + ): + super().__init__(country, app_name, app_id) + self.request_headers.update({"Authorization": self.__token()}) + + logging.basicConfig(format=log_format, level=log_level.upper()) + self.log_interval = log_interval + + def __repr__(self): + return "{object}(country={country}, 
class Base:
    """Hold the normalised app identifiers, URLs and request defaults.

    The constructor only prepares state (strings, dicts and counters); it
    performs no network access itself.
    """

    __scheme = "https"

    __landing_host = "apps.apple.com"
    __request_host = "amp-api.apps.apple.com"

    __landing_path = "{country}/app/{app_name}/id{app_id}"
    __request_path = "v1/catalog/{country}/apps/{app_id}/reviews"

    def __init__(self, country, app_name, app_id):
        """Normalise the identifiers and build URLs, headers and counters.

        Args:
            country: Country code; converted to a lower-case string.
            app_name: App name; lower-cased, with every run of non-word
                characters or underscores collapsed into a single "-".
            app_id: App identifier; stored as a string.
        """
        self.country = str(country).lower()
        lowered_name = str(app_name).lower()
        self.app_name = re.sub(r"[\W_]+", "-", lowered_name)
        self.app_id = str(app_id)

        self.base_landing_url = "://".join((self.__scheme, self.__landing_host))
        self.base_request_url = "://".join((self.__scheme, self.__request_host))

        self.landing_url = self.__landing_url()
        self.request_url = self.__request_url()

        self.user_agents = [
            # NOTE: grab from https://bit.ly/2zu0cmU
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        ]

        self.request_offset = 0

        # One user agent is picked at random per instance.
        headers = {}
        headers["Accept"] = "application/json"
        headers["Connection"] = "keep-alive"
        headers["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
        headers["Origin"] = self.base_landing_url
        headers["Referer"] = self.landing_url
        headers["User-Agent"] = random.choice(self.user_agents)
        self.request_headers = headers

        self.request_params = dict(
            l="en-GB",
            offset=self.request_offset,
            limit=20,
            platform="web",
            additionalPlatforms="appletv,ipad,iphone,mac",
        )

        # Accumulated results and bookkeeping, all starting out empty / zero.
        self.reviews = []
        self.reviews_count = 0

        self.fetched_count = 0

        self.log_timer = 0.0

    def __landing_url(self):
        """Return the landing page URL for this country, app name and app id."""
        template = "{}://{}/{}".format(
            self.__scheme, self.__landing_host, self.__landing_path
        )
        fields = {
            "country": self.country,
            "app_name": self.app_name,
            "app_id": self.app_id,
        }
        return template.format(**fields)

    def __request_url(self):
        """Return the reviews request URL for this country and app id."""
        template = "{}://{}/{}".format(
            self.__scheme, self.__request_host, self.__request_path
        )
        return template.format(country=self.country, app_id=self.app_id)
"""Packaging script for app-store-scraper.

Metadata is read from ``app_store_scraper/__version__.py``, the long
description from ``README.md`` and the runtime dependencies from
``requirements.txt``; all paths are relative to the current directory.
"""
import setuptools

about = dict()

# The metadata module is executed into ``about`` instead of being imported,
# so that building does not import the package itself. Every file is read
# as UTF-8 explicitly: __version__.py holds non-ASCII characters, and
# relying on the platform's locale encoding can fail or garble the metadata.
with open("app_store_scraper/__version__.py", "r", encoding="utf-8") as f:
    exec(f.read(), about)

with open("README.md", "r", encoding="utf-8") as f:
    long_description = f.read()

with open("requirements.txt", "r", encoding="utf-8") as f:
    # Drop trailing newlines, blank lines and comment lines so that only
    # clean requirement specifiers reach ``install_requires``.
    stripped_lines = (line.strip() for line in f)
    install_requires = [
        line for line in stripped_lines if line and not line.startswith("#")
    ]

setuptools.setup(
    name=about["__title__"],
    version=about["__version__"],
    description=about["__description__"],
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=about["__author__"],
    url=about["__url__"],
    license=about["__license__"],
    keywords=about["__keywords__"],
    packages=setuptools.find_packages(".", exclude=["*.tests"]),
    install_requires=install_requires,
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
    ],
    python_requires=">=3.6",
    project_urls={"Source": about["__url__"]},
)