From bcb8560d90997ebe36828e7f707df3476e3f7708 Mon Sep 17 00:00:00 2001
From: lukadd
Date: Sun, 7 Mar 2021 18:27:21 -0500
Subject: [PATCH 1/3] feat: Updated RegEx to support Saved User Lists

---
 pypartpicker/regex.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypartpicker/regex.py b/pypartpicker/regex.py
index 2c04664..89da0b6 100644
--- a/pypartpicker/regex.py
+++ b/pypartpicker/regex.py
@@ -2,10 +2,10 @@
 
 
 def get_list_links(string):
-    list_regex = re.compile("((?:http|https)://(?:[a-z]{2}.pcpartpicker|pcpartpicker).com/list/(?:[a-zA-Z0-9]{6}))")
+    list_regex = re.compile("((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/(?:(?:list/(?:[a-zA-Z0-9]{6}))|(?:user/(?:[\\w]+)/saved/(?:[a-zA-Z0-9]{6}))))")
     return re.findall(list_regex, string)
 
 
 def get_product_links(string):
-    product_regex = re.compile("((?:http|https)://(?:[a-z]{2}.pcpartpicker|pcpartpicker).com/product/(?:[a-zA-Z0-9]{6}))")
+    product_regex = re.compile("((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/product/(?:[a-zA-Z0-9]{6}))")
     return re.findall(product_regex, string)
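For reference, a quick sketch of what the widened list pattern now accepts. The pattern is copied verbatim from the hunk above; the list IDs and username in the test string are made up for illustration:

    import re

    list_regex = re.compile("((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/(?:(?:list/(?:[a-zA-Z0-9]{6}))|(?:user/(?:[\\w]+)/saved/(?:[a-zA-Z0-9]{6}))))")

    sample = ("standard list https://pcpartpicker.com/list/abc123, "
              "regional list https://ca.pcpartpicker.com/list/XYZ789, "
              "saved list https://pcpartpicker.com/user/example_user/saved/QwErTy")

    # all three URL shapes are captured by the single outer group
    print(re.findall(list_regex, sample))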
From af8b12455c88fd57c3c102eacb585cf24bd06ac3 Mon Sep 17 00:00:00 2001
From: lukadd
Date: Sun, 7 Mar 2021 18:29:03 -0500
Subject: [PATCH 2/3] fix: Add missing self parameter to async methods

---
 pypartpicker/scraper.py | 41 +++++++++++++++++------------------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/pypartpicker/scraper.py b/pypartpicker/scraper.py
index 7df6f14..8f8d591 100644
--- a/pypartpicker/scraper.py
+++ b/pypartpicker/scraper.py
@@ -26,6 +26,7 @@ def __init__(self, **kwargs):
         self.url = kwargs.get("url")
         self.compatibility = kwargs.get("compatibility")
 
+
 class Product(Part):
 
     def __init__(self, **kwargs):
@@ -36,6 +37,7 @@ def __init__(self, **kwargs):
         self.reviews = kwargs.get("reviews")
         self.compatible_parts = kwargs.get("compatible_parts")
 
+
 class Price:
 
     def __init__(self, **kwargs):
@@ -46,6 +48,7 @@ def __init__(self, **kwargs):
         self.base_value = kwargs.get("base_value")
         self.in_stock = kwargs.get("in_stock")
 
+
 class Review:
 
     def __init__(self, **kwargs):
@@ -57,9 +60,11 @@ def __init__(self, **kwargs):
         self.rating = kwargs.get("rating")
         self.content = kwargs.get("content")
 
+
 class Verification(Exception):
     pass
 
+
 class Scraper:
 
     def __init__(self, **kwargs):
@@ -68,7 +73,6 @@ def __init__(self, **kwargs):
             raise ValueError("Headers kwarg has to be a dict!")
         self.headers = headers_dict
 
-
     def make_soup(self, url) -> BeautifulSoup:
         # sends a request to the URL
         page = requests.get(url, headers=self.headers)
@@ -79,9 +83,7 @@ def make_soup(self, url) -> BeautifulSoup:
         # returns the HTML
         return soup
 
-
     def fetch_list(self, list_url) -> PCPPList:
-
         # checks if its a pcpartpicker list and raises an exception if its not or if the list is empty
         if not "pcpartpicker.com/list/" in list_url or list_url.endswith("/list/"):
             raise Exception(f"'{list_url}' is an invalid PCPartPicker list!")
@@ -139,9 +141,7 @@ def fetch_list(self, list_url) -> PCPPList:
         # returns a PCPPList object containing all the information
         return PCPPList(parts=parts, wattage=wattage, total=total_cost, url=list_url, compatibility=compatibilitynotes)
 
-
     def part_search(self, search_term, **kwargs) -> Part:
-
         search_term = search_term.replace(' ', '+')
 
         limit = kwargs.get("limit", 20)
@@ -179,9 +179,9 @@ def part_search(self, search_term, **kwargs) -> Part:
 
                 # creates a part object with the information from the product page
                 part_object = Part(
-                    name = soup.find(class_="pageTitle").get_text(),
-                    url = search_link,
-                    price = None
+                    name=soup.find(class_="pageTitle").get_text(),
+                    url=search_link,
+                    price=None
                 )
 
                 # searches for the pricing table
@@ -214,9 +214,9 @@ def part_search(self, search_term, **kwargs) -> Part:
             for product in section.find_all("ul", class_="list-unstyled"):
                 # extracts the product data from the HTML code and creates a part object with that information
                 part_object = Part(
-                    name = product.find("p", class_="search_results--link").get_text().strip(),
-                    url = "https://" + urlparse(search_link).netloc + product.find("p", class_="search_results--link").find("a", href=True)["href"],
-                    image = ("https://" + product.find("img")["src"].strip('/')).replace("https://https://", "https://")
+                    name=product.find("p", class_="search_results--link").get_text().strip(),
+                    url="https://" + urlparse(search_link).netloc + product.find("p", class_="search_results--link").find("a", href=True)["href"],
+                    image=("https://" + product.find("img")["src"].strip('/')).replace("https://https://", "https://")
                 )
                 try:
                     part_object.price = product.find(class_="product__link product__link--price").get_text()
@@ -229,9 +229,7 @@ def part_search(self, search_term, **kwargs) -> Part:
         # returns the part objects
         return parts[:kwargs.get("limit", 20)]
 
-
     def fetch_product(self, part_url) -> Product:
-
         # checks if the URL is invalid
         if not "pcpartpicker.com" in part_url and "/product/" in part_url:
             raise ValueError("Invalid product URL!")
@@ -282,7 +280,7 @@ def fetch_product(self, part_url) -> Product:
         review_box = soup.find(class_="block partReviews")
 
         # skips over this process if the review box does not exist
-        if review_box != None:
+        if review_box is not None:
 
             reviews = []
 
@@ -322,7 +320,7 @@ def fetch_product(self, part_url) -> Product:
         compatible_parts = None
         # fetches section with compatible parts hyperlinks
         compatible_parts_list = soup.find(class_="compatibleParts__list list-unstyled")
-        if compatible_parts_list != None:
+        if compatible_parts_list is not None:
             compatible_parts = []
             # finds every list item in the section
             for item in compatible_parts_list.find_all("li"):
@@ -347,28 +345,23 @@ def fetch_product(self, part_url) -> Product:
 
         image_box = soup.find(class_="single_image_gallery_box")
 
-        if image_box != None:
+        if image_box is not None:
             # adds image to object if it finds one
             product_object.image = image_box.find("img")["src"].replace("https://https://", "https://")
 
         return product_object
 
-
-    async def aio_part_search(search_term, **kwargs):
+    async def aio_part_search(self, search_term, **kwargs):
         with concurrent.futures.ThreadPoolExecutor() as pool:
             result = await asyncio.get_event_loop().run_in_executor(pool, partial(self.part_search, search_term, **kwargs))
        return result
 
-
-    async def aio_fetch_list(list_url):
+    async def aio_fetch_list(self, list_url):
         with concurrent.futures.ThreadPoolExecutor() as pool:
             result = await asyncio.get_event_loop().run_in_executor(pool, self.fetch_list, list_url)
         return result
 
-
-    async def aio_fetch_product(part_url):
+    async def aio_fetch_product(self, part_url):
         with concurrent.futures.ThreadPoolExecutor() as pool:
             result = await asyncio.get_event_loop().run_in_executor(pool, self.fetch_product, part_url)
         return result
-
-
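With the self parameter restored, the async wrappers can be awaited on a Scraper instance again. A minimal usage sketch, assuming the package is importable as pypartpicker and that pcpartpicker.com is reachable; the search term and limit are arbitrary examples:

    import asyncio

    from pypartpicker import Scraper


    async def main():
        scraper = Scraper()
        # aio_part_search runs the blocking part_search in a thread pool executor
        parts = await scraper.aio_part_search("ryzen 5 3600", limit=5)
        for part in parts:
            print(part.name, part.price)


    asyncio.run(main())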
From dbd9711426ed920f5c8bff781456002451e7d231 Mon Sep 17 00:00:00 2001
From: lukadd
Date: Sun, 7 Mar 2021 19:15:40 -0500
Subject: [PATCH 3/3] feat: Added support for Saved User Lists in fetch_list()
 function; Added new private methods

---
 pypartpicker/scraper.py | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/pypartpicker/scraper.py b/pypartpicker/scraper.py
index 8f8d591..66e2966 100644
--- a/pypartpicker/scraper.py
+++ b/pypartpicker/scraper.py
@@ -1,10 +1,12 @@
+import asyncio
+import concurrent.futures
+import math
+import re
 import requests
+
 from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-import concurrent.futures
-import asyncio
 from functools import partial
-import math
+from urllib.parse import urlparse
 
 
 class Part:
@@ -73,7 +75,8 @@ def __init__(self, **kwargs):
             raise ValueError("Headers kwarg has to be a dict!")
         self.headers = headers_dict
 
-    def make_soup(self, url) -> BeautifulSoup:
+    # Private Helper Function
+    def __make_soup(self, url) -> BeautifulSoup:
         # sends a request to the URL
         page = requests.get(url, headers=self.headers)
         # gets the HTML code for the website and parses it using Python's built in HTML parser
@@ -83,14 +86,24 @@ def make_soup(self, url) -> BeautifulSoup:
         # returns the HTML
         return soup
 
+    # Private Helper Function
+    # Uses a RegEx to check if the specified string matches the URL format of a valid PCPP parts list
+    def __check_list_url(self, url_str):
+        return re.search(r"((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/(?:(?:list/(?:[a-zA-Z0-9]{6}))|(?:user/(?:[\w]+)/saved/(?:[a-zA-Z0-9]{6}))))", url_str)
+
+    # Private Helper Function
+    # Uses a RegEx to check if the specified string matches the URL format of a valid product on PCPP
+    def __check_product_url(self, url_str):
+        return re.search(r"((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/product/(?:[a-zA-Z0-9]{6}))", url_str)
+
     def fetch_list(self, list_url) -> PCPPList:
-        # checks if its a pcpartpicker list and raises an exception if its not or if the list is empty
-        if not "pcpartpicker.com/list/" in list_url or list_url.endswith("/list/"):
-            raise Exception(f"'{list_url}' is an invalid PCPartPicker list!")
+        # Ensure a valid pcpartpicker parts list was passed to the function
+        if self.__check_list_url(list_url) is None:
+            raise ValueError(f"'{list_url}' is an invalid PCPartPicker list!")
 
         # fetches the HTML code for the website
         try:
-            soup = self.make_soup(list_url)
+            soup = self.__make_soup(list_url)
         except requests.exceptions.ConnectionError:
             raise ValueError("Invalid list URL! Max retries exceeded with URL.")
 
@@ -170,7 +183,7 @@ def part_search(self, search_term, **kwargs) -> Part:
 
         for i in range(iterations):
             try:
-                soup = self.make_soup(f"{search_link}&page={i + 1}")
+                soup = self.__make_soup(f"{search_link}&page={i + 1}")
             except requests.exceptions.ConnectionError:
                 raise ValueError("Invalid region! Max retries exceeded with URL.")
 
@@ -230,12 +243,12 @@ def part_search(self, search_term, **kwargs) -> Part:
         return parts[:kwargs.get("limit", 20)]
 
     def fetch_product(self, part_url) -> Product:
-        # checks if the URL is invalid
-        if not "pcpartpicker.com" in part_url and "/product/" in part_url:
+        # Ensure a valid product page was passed to the function
+        if self.__check_product_url(part_url) is None:
             raise ValueError("Invalid product URL!")
 
         try:
-            soup = self.make_soup(part_url)
+            soup = self.__make_soup(part_url)
         except requests.exceptions.ConnectionError:
             raise ValueError("Invalid product URL! Max retries exceeded with URL.")
 
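Taken together, the three patches mean fetch_list() now validates its argument with the private regex helper, so saved-list URLs pass the check and malformed URLs raise ValueError instead of a bare Exception. A small sketch of the resulting behavior; the IDs and username are placeholders and a live connection to pcpartpicker.com is assumed:

    from pypartpicker import Scraper

    scraper = Scraper()

    for url in (
        "https://pcpartpicker.com/list/abc123",                     # standard list
        "https://pcpartpicker.com/user/example_user/saved/abc123",  # saved user list
        "https://pcpartpicker.com/notalist",                        # fails the regex check
    ):
        try:
            pcpp_list = scraper.fetch_list(url)
            print(url, "->", len(pcpp_list.parts), "parts, total", pcpp_list.total)
        except ValueError as error:
            # raised for URLs that fail __check_list_url() or for failed requests
            print(url, "->", error)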