
Commit cbf1cf0

Merge pull request #8 from lukadd16/feat/userlists
feat: Support for User Parts Lists
thefakequake committed Mar 8, 2021
2 parents 2fadced + dbd9711 commit cbf1cf0
Showing 2 changed files with 44 additions and 38 deletions.
4 changes: 2 additions & 2 deletions pypartpicker/regex.py
@@ -2,10 +2,10 @@


def get_list_links(string):
list_regex = re.compile("((?:http|https)://(?:[a-z]{2}.pcpartpicker|pcpartpicker).com/list/(?:[a-zA-Z0-9]{6}))")
list_regex = re.compile("((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/(?:(?:list/(?:[a-zA-Z0-9]{6}))|(?:user/(?:[\\w]+)/saved/(?:[a-zA-Z0-9]{6}))))")
return re.findall(list_regex, string)


def get_product_links(string):
product_regex = re.compile("((?:http|https)://(?:[a-z]{2}.pcpartpicker|pcpartpicker).com/product/(?:[a-zA-Z0-9]{6}))")
product_regex = re.compile("((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/product/(?:[a-zA-Z0-9]{6}))")
return re.findall(product_regex, string)
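
For context, a quick sanity check of the widened list regex: it now extracts both classic list URLs and user saved-list URLs. The URLs below are made-up examples, not real lists.

from pypartpicker.regex import get_list_links

text = (
    "Classic list: https://pcpartpicker.com/list/aBc123 and a saved list: "
    "https://uk.pcpartpicker.com/user/example_user/saved/XyZ789"
)

# both URL shapes are now extracted
print(get_list_links(text))
# ['https://pcpartpicker.com/list/aBc123',
#  'https://uk.pcpartpicker.com/user/example_user/saved/XyZ789']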
78 changes: 42 additions & 36 deletions pypartpicker/scraper.py
@@ -1,10 +1,12 @@
import asyncio
import concurrent.futures
import math
import re
import requests

from bs4 import BeautifulSoup
from urllib.parse import urlparse
import concurrent.futures
import asyncio
from functools import partial
import math
from urllib.parse import urlparse


class Part:
@@ -26,6 +28,7 @@ def __init__(self, **kwargs):
self.url = kwargs.get("url")
self.compatibility = kwargs.get("compatibility")


class Product(Part):

def __init__(self, **kwargs):
@@ -36,6 +39,7 @@ def __init__(self, **kwargs):
self.reviews = kwargs.get("reviews")
self.compatible_parts = kwargs.get("compatible_parts")


class Price:

def __init__(self, **kwargs):
@@ -46,6 +50,7 @@ def __init__(self, **kwargs):
self.base_value = kwargs.get("base_value")
self.in_stock = kwargs.get("in_stock")


class Review:

def __init__(self, **kwargs):
@@ -57,9 +62,11 @@ def __init__(self, **kwargs):
self.rating = kwargs.get("rating")
self.content = kwargs.get("content")


class Verification(Exception):
pass


class Scraper:

def __init__(self, **kwargs):
@@ -68,8 +75,8 @@ def __init__(self, **kwargs):
raise ValueError("Headers kwarg has to be a dict!")
self.headers = headers_dict


def make_soup(self, url) -> BeautifulSoup:
# Private Helper Function
def __make_soup(self, url) -> BeautifulSoup:
# sends a request to the URL
page = requests.get(url, headers=self.headers)
# gets the HTML code for the website and parses it using Python's built in HTML parser
@@ -79,16 +86,24 @@ def make_soup(self, url) -> BeautifulSoup:
# returns the HTML
return soup

# Private Helper Function
# Uses a RegEx to check if the specified string matches the URL format of a valid PCPP parts list
def __check_list_url(self, url_str):
return re.search(r"((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/(?:(?:list/(?:[a-zA-Z0-9]{6}))|(?:user/(?:[\w]+)/saved/(?:[a-zA-Z0-9]{6}))))", url_str)

def fetch_list(self, list_url) -> PCPPList:
# Private Helper Function
# Uses a RegEx to check if the specified string matches the URL format of a valid product on PCPP
def __check_product_url(self, url_str):
return re.search(r"((?:http|https)://(?:[a-z]{2}.)?pcpartpicker.com/product/(?:[a-zA-Z0-9]{6}))", url_str)

# checks if its a pcpartpicker list and raises an exception if its not or if the list is empty
if not "pcpartpicker.com/list/" in list_url or list_url.endswith("/list/"):
raise Exception(f"'{list_url}' is an invalid PCPartPicker list!")
def fetch_list(self, list_url) -> PCPPList:
# Ensure a valid pcpartpicker parts list was passed to the function
if self.__check_list_url(list_url) is None:
raise ValueError(f"'{list_url}' is an invalid PCPartPicker list!")

# fetches the HTML code for the website
try:
soup = self.make_soup(list_url)
soup = self.__make_soup(list_url)
except requests.exceptions.ConnectionError:
raise ValueError("Invalid list URL! Max retries exceeded with URL.")

@@ -139,9 +154,7 @@ def fetch_list(self, list_url) -> PCPPList:
# returns a PCPPList object containing all the information
return PCPPList(parts=parts, wattage=wattage, total=total_cost, url=list_url, compatibility=compatibilitynotes)


def part_search(self, search_term, **kwargs) -> Part:

search_term = search_term.replace(' ', '+')
limit = kwargs.get("limit", 20)

@@ -170,7 +183,7 @@ def part_search(self, search_term, **kwargs) -> Part:
for i in range(iterations):

try:
soup = self.make_soup(f"{search_link}&page={i + 1}")
soup = self.__make_soup(f"{search_link}&page={i + 1}")
except requests.exceptions.ConnectionError:
raise ValueError("Invalid region! Max retries exceeded with URL.")

@@ -179,9 +192,9 @@ def part_search(self, search_term, **kwargs) -> Part:

# creates a part object with the information from the product page
part_object = Part(
name = soup.find(class_="pageTitle").get_text(),
url = search_link,
price = None
name=soup.find(class_="pageTitle").get_text(),
url=search_link,
price=None
)

# searches for the pricing table
@@ -214,9 +227,9 @@ def part_search(self, search_term, **kwargs) -> Part:
for product in section.find_all("ul", class_="list-unstyled"):
# extracts the product data from the HTML code and creates a part object with that information
part_object = Part(
name = product.find("p", class_="search_results--link").get_text().strip(),
url = "https://" + urlparse(search_link).netloc + product.find("p", class_="search_results--link").find("a", href=True)["href"],
image = ("https://" + product.find("img")["src"].strip('/')).replace("https://https://", "https://")
name=product.find("p", class_="search_results--link").get_text().strip(),
url="https://" + urlparse(search_link).netloc + product.find("p", class_="search_results--link").find("a", href=True)["href"],
image=("https://" + product.find("img")["src"].strip('/')).replace("https://https://", "https://")
)
try:
part_object.price = product.find(class_="product__link product__link--price").get_text()
@@ -229,15 +242,13 @@ def part_search(self, search_term, **kwargs) -> Part:
# returns the part objects
return parts[:kwargs.get("limit", 20)]


def fetch_product(self, part_url) -> Product:

# checks if the URL is invalid
if not "pcpartpicker.com" in part_url and "/product/" in part_url:
# Ensure a valid product page was passed to the function
if self.__check_product_url(part_url) is None:
raise ValueError("Invalid product URL!")

try:
soup = self.make_soup(part_url)
soup = self.__make_soup(part_url)
except requests.exceptions.ConnectionError:
raise ValueError("Invalid product URL! Max retries exceeded with URL.")

@@ -282,7 +293,7 @@ def fetch_product(self, part_url) -> Product:
review_box = soup.find(class_="block partReviews")

# skips over this process if the review box does not exist
if review_box != None:
if review_box is not None:

reviews = []

@@ -322,7 +333,7 @@ def fetch_product(self, part_url) -> Product:
compatible_parts = None
# fetches section with compatible parts hyperlinks
compatible_parts_list = soup.find(class_="compatibleParts__list list-unstyled")
if compatible_parts_list != None:
if compatible_parts_list is not None:
compatible_parts = []
# finds every list item in the section
for item in compatible_parts_list.find_all("li"):
@@ -347,28 +358,23 @@ def fetch_product(self, part_url) -> Product:

image_box = soup.find(class_="single_image_gallery_box")

if image_box != None:
if image_box is not None:
# adds image to object if it finds one
product_object.image = image_box.find("img")["src"].replace("https://https://", "https://")

return product_object


async def aio_part_search(search_term, **kwargs):
async def aio_part_search(self, search_term, **kwargs):
with concurrent.futures.ThreadPoolExecutor() as pool:
result = await asyncio.get_event_loop().run_in_executor(pool, partial(self.part_search, search_term, **kwargs))
return result


async def aio_fetch_list(list_url):
async def aio_fetch_list(self, list_url):
with concurrent.futures.ThreadPoolExecutor() as pool:
result = await asyncio.get_event_loop().run_in_executor(pool, self.fetch_list, list_url)
return result


async def aio_fetch_product(part_url):
async def aio_fetch_product(self, part_url):
with concurrent.futures.ThreadPoolExecutor() as pool:
result = await asyncio.get_event_loop().run_in_executor(pool, self.fetch_product, part_url)
return result
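
For context, a minimal synchronous usage sketch of the updated scraper. The saved-list URL is a made-up example, and the package-level Scraper import is an assumption about how the package exposes the class.

from pypartpicker import Scraper  # assumed package-level export

scraper = Scraper()

# fetch_list now accepts both /list/ and /user/<name>/saved/ URLs
pcpp_list = scraper.fetch_list("https://pcpartpicker.com/user/example_user/saved/XyZ789")
for part in pcpp_list.parts:
    print(part.name, part.price)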
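
And a sketch of the aio_* wrappers, which now take self and offload the blocking requests calls to a thread pool. The list URL is again a made-up example.

import asyncio

from pypartpicker import Scraper  # assumed package-level export

async def main():
    scraper = Scraper()
    # runs the blocking fetch in a ThreadPoolExecutor without blocking the event loop
    pcpp_list = await scraper.aio_fetch_list("https://pcpartpicker.com/list/aBc123")
    print(pcpp_list.total)

asyncio.run(main())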

