Skip to content

Commit

Permalink
demozoo scraper: add original link as website, add release date
Browse files Browse the repository at this point in the history
  • Loading branch information
avivace committed Dec 18, 2024
1 parent 1a48b52 commit 1e417f8
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 3 deletions.
32 changes: 29 additions & 3 deletions scrapers/py_importers/demozoo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@

import requests
from bs4 import BeautifulSoup

import re
from py_common.Logger import Logger
from py_common.Production import Production
import py_common.utils as utils
from datetime import datetime

########################
### GLOBAL VARIABLES ###
Expand Down Expand Up @@ -86,7 +87,7 @@ def scrape(platform):
# get rows; for each rows, get the name of the prod and the internal link
for link in links:
demozoo_internal_link = baseurl + "/" + link.get("href")

print(demozoo_internal_link)
# building slug: all lowercase, each word separated by hyphen, no special character
slug = utils.build_slug(link.text)

Expand Down Expand Up @@ -115,7 +116,21 @@ def scrape(platform):
elif slug in globalgameslist:
logger.write("[WARN]", " " + slug + " already in entries folder!")

def parse_date(date_string):
    """Extract a release date from free-form text and return it as "YYYY-MM-DD".

    Demozoo shows release dates at three precisions: full ("18 December 2024"),
    month-year ("December 2024"), or year only ("2024"). The regex below accepts
    all three; missing parts default to the first month/day (strptime behavior).

    :param date_string: text containing a date in one of the forms above
    :return: ISO-formatted date string "YYYY-MM-DD"
    :raises ValueError: if no recognizable date is found or it cannot be parsed
    """
    date_part = re.search(r"(\d{1,2} [A-Za-z]+ \d{4})|([A-Za-z]+ \d{4})|(\d{4})", date_string)

    if not date_part:
        raise ValueError(f"No recognizable date found in: {date_string}")

    date_part = date_part.group(0)  # Extract the matched part

    # Try formats from most to least specific, mirroring the regex alternatives.
    # (The original code only tried "%d %B %Y", so the month-year and year-only
    # shapes matched by the regex could never be parsed.)
    for fmt in ("%d %B %Y", "%d %b %Y", "%B %Y", "%b %Y", "%Y"):
        try:
            parsed_date = datetime.strptime(date_part, fmt)
        except ValueError:
            continue
        # Convert to desired format
        return parsed_date.strftime("%Y-%m-%d")

    raise ValueError(f"Unparsable date {date_part!r} in: {date_string}")

def scrape_page(slug, url, platform):
demozoo_url = url
'''
given a slug and demozoo production url, it returns an object containing everything useful
to build a file hierarchy
Expand All @@ -131,6 +146,17 @@ def scrape_page(slug, url, platform):
# getting title
title = str.strip(soup.find('div', {"class": "production_title focus_title"}).findChildren("h2")[0].text)

date_string = str.strip(soup.find('ul', {"class": "attributes"}).findChildren("li")[0].text)

release_date = None

try:
release_date = parse_date(date_string)
print(date_string, "->", parse_date(date_string))
except:
print("nodate")


logger.write("[INFO]", " Adding: " + title + " ...")

# getting developer
Expand Down Expand Up @@ -198,7 +224,7 @@ def scrape_page(slug, url, platform):

files = [f"{slug}.{platform.lower()}"]

return Production(title, slug, developer, platform, typetag, screenshots, files, video, repository=source, url=url)
return Production(title, slug, developer, platform, typetag, screenshots, files, video, date=release_date, repository=source, url=demozoo_url)

def main():
for platform in PLATFORMS.keys():
Expand Down
3 changes: 3 additions & 0 deletions scrapers/py_importers/py_common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ def build(prod, entrypath: str, desired_extensions: list):

# Handle screenshots
if prod.screenshots and prod.screenshots[0] != "None":
print(prod.screenshots)
try:
r = requests.get(prod.screenshots[0], allow_redirects=True, timeout=None)
screen_ext = prod.screenshots[0].split(".")[-1].lower()
Expand Down Expand Up @@ -270,6 +271,8 @@ def makeJSON(prod, entrypath):
"screenshots": [screen for screen in prod.screenshots] if len(prod.screenshots) != 0 else [],
"slug": prod.slug,
"title": prod.title,
"website": [ prod.url ],
"date": prod.date
}

# adding optional fields
Expand Down

0 comments on commit 1e417f8

Please sign in to comment.