From 9ad19fb985b881accd8b56bf85a9bc2d3bc01462 Mon Sep 17 00:00:00 2001 From: Eric Date: Mon, 14 Sep 2020 10:41:05 +1200 Subject: [PATCH] Chore/tidy up (#15) * Update PR template * Make a base class * Bump 0.3.1 -> 0.3.2 --- .github/PULL_REQUEST_TEMPLATE.md | 4 +- app_store_scraper/__version__.py | 2 +- app_store_scraper/app_store.py | 183 ++--------------------------- app_store_scraper/base.py | 193 +++++++++++++++++++++++++++++++ app_store_scraper/podcast.py | 5 +- 5 files changed, 209 insertions(+), 178 deletions(-) create mode 100644 app_store_scraper/base.py diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7981038..1a7cfc1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,9 +1,7 @@ -This PR is about ... / Fixes #[issue number]. +This PR is about ... Changes: - Added ... - Removed ... - Changed ... - Fixed ... - -@cowboy-bebug diff --git a/app_store_scraper/__version__.py b/app_store_scraper/__version__.py index 0fb168c..716d068 100644 --- a/app_store_scraper/__version__.py +++ b/app_store_scraper/__version__.py @@ -1,5 +1,5 @@ __title__ = "app-store-scraper" -__version__ = "0.3.1" +__version__ = "0.3.2" __description__ = "Single API ☝ App Store Review Scraper 🧹" __author__ = "Eric Lim" __url__ = "https://github.com/cowboy-bebug/app-store-scraper" diff --git a/app_store_scraper/app_store.py b/app_store_scraper/app_store.py index 67926b1..4c4b092 100644 --- a/app_store_scraper/app_store.py +++ b/app_store_scraper/app_store.py @@ -1,32 +1,16 @@ import logging -import random -import re -import requests -import sys -import time -from datetime import datetime -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry +from .base import Base logger = logging.getLogger("AppStore") -class AppStore: - _scheme = "https" - +class AppStore(Base): _landing_host = "apps.apple.com" _request_host = "amp-api.apps.apple.com" _landing_path = "{country}/app/{app_name}/id{app_id}" _request_path = "v1/catalog/{country}/apps/{app_id}/reviews" - _user_agents = [ - # NOTE: grab from https://bit.ly/2zu0cmU - "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", - "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", - "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", - ] - def __init__( self, country, @@ -36,38 +20,16 @@ def __init__( log_level="INFO", log_interval=5, ): - logging.basicConfig(format=log_format, level=log_level.upper()) - self._base_landing_url = f"{self._scheme}://{self._landing_host}" - self._base_request_url = f"{self._scheme}://{self._request_host}" - - self.country = str(country).lower() - self.app_name = re.sub(r"[\W_]+", "-", str(app_name).lower()) - if app_id is None: - logger.info("Searching for app id") - app_id = self.search_id() - self.app_id = int(app_id) - - self.url = self._landing_url() - - self.reviews = list() - self.reviews_count = int() - - self._log_interval = float(log_interval) - self._log_timer = float() - - self._fetched_count = int() + super().__init__( + country=country, + app_name=app_name, + app_id=app_id, + log_format=log_format, + log_level=log_level, + log_interval=log_interval, + ) - self._request_url = self._request_url() - self._request_offset = 0 - self._request_headers = { - "Accept": "application/json", - "Authorization": self._token(), - "Connection": "keep-alive", - "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", - "Origin": self._base_landing_url, - "Referer": self.url, - "User-Agent": random.choice(self._user_agents), - } + # override self._request_params = { "l": "en-GB", "offset": self._request_offset, @@ -75,126 +37,3 @@ def __init__( "platform": "web", "additionalPlatforms": "appletv,ipad,iphone,mac", } - - self._response = requests.Response() - - logger.info( - f"Initialised: {self.__class__.__name__}" - f"('{self.country}', '{self.app_name}', {self.app_id})" - ) - logger.info(f"Ready to fetch reviews from: {self.url}") - - def __repr__(self): - return "{}(country='{}', app_name='{}', app_id={})".format( - self.__class__.__name__, self.country, self.app_name, self.app_id, - ) - - def __str__(self): - width = 12 - return ( - f"{'Country'.rjust(width, ' ')} | {self.country}\n" - f"{'Name'.rjust(width, ' ')} | {self.app_name}\n" - f"{'ID'.rjust(width, ' ')} | {self.app_id}\n" - f"{'URL'.rjust(width, ' ')} | {self.url}\n" - f"{'Review count'.rjust(width, ' ')} | {self.reviews_count}" - ) - - def _landing_url(self): - landing_url = f"{self._base_landing_url}/{self._landing_path}" - return landing_url.format( - country=self.country, app_name=self.app_name, app_id=self.app_id - ) - - def _request_url(self): - request_url = f"{self._base_request_url}/{self._request_path}" - return request_url.format(country=self.country, app_id=self.app_id) - - def _get( - self, - url, - headers=None, - params=None, - total=3, - backoff_factor=3, - status_forcelist=[404], - ) -> requests.Response: - retries = Retry( - total=total, - backoff_factor=backoff_factor, - status_forcelist=status_forcelist, - ) - with requests.Session() as s: - s.mount(self._base_request_url, HTTPAdapter(max_retries=retries)) - logger.debug(f"Making a GET request: {url}") - self._response = s.get(url, headers=headers, params=params) - - def _token(self): - self._get(self.url) - tags = self._response.text.splitlines() - for tag in tags: - if re.match(r" interval: - self._log_status() - self._log_timer = 0 - - def search_id(self): - search_url = "https://www.google.com/search" - self._get(search_url, params={"q": f"app store {self.app_name}"}) - pattern = fr"{self._base_landing_url}/[a-z]{{2}}/.+?/id([0-9]+)" - app_id = re.search(pattern, self._response.text).group(1) - return app_id - - def review(self, how_many=sys.maxsize): - self._log_timer = 0 - try: - while True: - self._heartbeat() - self._get( - self._request_url, - headers=self._request_headers, - params=self._request_params, - ) - self._parse_data() - self._parse_next() - if self._request_offset is None or self._fetched_count >= how_many: - break - except KeyboardInterrupt: - logger.error("Keyboard interrupted") - except Exception as e: - logger.error(f"Something went wrong: {e}") - finally: - self._log_status() - self._fetched_count = 0 diff --git a/app_store_scraper/base.py b/app_store_scraper/base.py new file mode 100644 index 0000000..ece2ee7 --- /dev/null +++ b/app_store_scraper/base.py @@ -0,0 +1,193 @@ +import logging +import random +import re +import requests +import sys +import time +from datetime import datetime +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry + +logger = logging.getLogger("Base") + + +class Base: + _scheme = "https" + + _landing_host = "" + _request_host = "" + + _landing_path = "" + _request_path = "" + + _user_agents = [ + # NOTE: grab from https://bit.ly/2zu0cmU + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)", + "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko", + "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", + ] + + def __init__( + self, + country, + app_name, + app_id=None, + log_format="%(asctime)s [%(levelname)s] %(name)s - %(message)s", + log_level="INFO", + log_interval=5, + ): + logging.basicConfig(format=log_format, level=log_level.upper()) + self._base_landing_url = f"{self._scheme}://{self._landing_host}" + self._base_request_url = f"{self._scheme}://{self._request_host}" + + self.country = str(country).lower() + self.app_name = re.sub(r"[\W_]+", "-", str(app_name).lower()) + if app_id is None: + logger.info("Searching for app id") + app_id = self.search_id() + self.app_id = int(app_id) + + self.url = self._landing_url() + + self.reviews = list() + self.reviews_count = int() + + self._log_interval = float(log_interval) + self._log_timer = float() + + self._fetched_count = int() + + self._request_url = self._request_url() + self._request_offset = 0 + self._request_headers = { + "Accept": "application/json", + "Authorization": self._token(), + "Connection": "keep-alive", + "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8", + "Origin": self._base_landing_url, + "Referer": self.url, + "User-Agent": random.choice(self._user_agents), + } + self._request_params = {} + self._response = requests.Response() + + logger.info( + f"Initialised: {self.__class__.__name__}" + f"('{self.country}', '{self.app_name}', {self.app_id})" + ) + logger.info(f"Ready to fetch reviews from: {self.url}") + + def __repr__(self): + return "{}(country='{}', app_name='{}', app_id={})".format( + self.__class__.__name__, self.country, self.app_name, self.app_id, + ) + + def __str__(self): + width = 12 + return ( + f"{'Country'.rjust(width, ' ')} | {self.country}\n" + f"{'Name'.rjust(width, ' ')} | {self.app_name}\n" + f"{'ID'.rjust(width, ' ')} | {self.app_id}\n" + f"{'URL'.rjust(width, ' ')} | {self.url}\n" + f"{'Review count'.rjust(width, ' ')} | {self.reviews_count}" + ) + + def _landing_url(self): + landing_url = f"{self._base_landing_url}/{self._landing_path}" + return landing_url.format( + country=self.country, app_name=self.app_name, app_id=self.app_id + ) + + def _request_url(self): + request_url = f"{self._base_request_url}/{self._request_path}" + return request_url.format(country=self.country, app_id=self.app_id) + + def _get( + self, + url, + headers=None, + params=None, + total=3, + backoff_factor=3, + status_forcelist=[404], + ) -> requests.Response: + retries = Retry( + total=total, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + with requests.Session() as s: + s.mount(self._base_request_url, HTTPAdapter(max_retries=retries)) + logger.debug(f"Making a GET request: {url}") + self._response = s.get(url, headers=headers, params=params) + + def _token(self): + self._get(self.url) + tags = self._response.text.splitlines() + for tag in tags: + if re.match(r" interval: + self._log_status() + self._log_timer = 0 + + def search_id(self): + search_url = "https://www.google.com/search" + self._get(search_url, params={"q": f"app store {self.app_name}"}) + pattern = fr"{self._base_landing_url}/[a-z]{{2}}/.+?/id([0-9]+)" + app_id = re.search(pattern, self._response.text).group(1) + return app_id + + def review(self, how_many=sys.maxsize): + self._log_timer = 0 + try: + while True: + self._heartbeat() + self._get( + self._request_url, + headers=self._request_headers, + params=self._request_params, + ) + self._parse_data() + self._parse_next() + if self._request_offset is None or self._fetched_count >= how_many: + break + except KeyboardInterrupt: + logger.error("Keyboard interrupted") + except Exception as e: + logger.error(f"Something went wrong: {e}") + finally: + self._log_status() + self._fetched_count = 0 diff --git a/app_store_scraper/podcast.py b/app_store_scraper/podcast.py index 8e0bc53..635beb6 100644 --- a/app_store_scraper/podcast.py +++ b/app_store_scraper/podcast.py @@ -1,10 +1,10 @@ import logging -from .app_store import AppStore +from .base import Base logger = logging.getLogger("Podcast") -class Podcast(AppStore): +class Podcast(Base): _landing_host = "podcasts.apple.com" _request_host = "amp-api.podcasts.apple.com" @@ -29,6 +29,7 @@ def __init__( log_interval=log_interval, ) + # override self._request_params = { "l": "en-GB", "offset": self._request_offset,