This repository has been archived by the owner on Mar 5, 2024. It is now read-only.
Commit f9c6574 (0 parents): showing 67 changed files with 7,356 additions and 0 deletions. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

New file: a GitHub Actions workflow, "Deploy index to Github Pages" (40 lines added):
```yaml
name: Deploy index to Github Pages

on:
  push:
    branches: [ master ]

  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
permissions:
  contents: read
  pages: write
  id-token: write

jobs:
  build:
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout master
        uses: actions/checkout@v2
        with:
          path: master
          ref: master
          fetch-depth: '0'
      - run: |
          cd master
          ./build_site.sh ../_site/
      - uses: actions/upload-pages-artifact@v2

  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-22.04
    needs: build
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v2
```

New file: an ignore file for scraper build artifacts (5 lines added):
```gitignore
# Scraper generated files
*.json

# Index build artifact
/_site
```

New file: `build_site.sh`, the build script invoked by the workflow above (89 lines added):
```bash
#!/bin/bash

# builds a repository of scrapers
# outputs to _site with the following structure:
#   index.yml
#   <scraper_id>.zip
# Each zip file contains the scraper.yml file and any other files in the same directory

outdir="$1"
if [ -z "$outdir" ]; then
  outdir="_site"
fi

rm -rf "$outdir"
mkdir -p "$outdir"

buildScraper()
{
  f=$1
  dir=$(dirname "$f")

  # get the scraper id from the filename
  scraper_id=$(basename "$f" .yml)
  versionFile=$f
  if [ "$scraper_id" == "package" ]; then
    scraper_id=$(basename "$dir")
  fi

  if [ "$dir" != "./scrapers" ]; then
    versionFile="$dir"
  fi

  echo "Processing $scraper_id"

  # the version is the short hash of the last commit that touched the scraper,
  # and the date is that commit's timestamp in UTC
  version=$(git log -n 1 --pretty=format:%h -- "$versionFile")
  updated=$(TZ=UTC0 git log -n 1 --date="format-local:%F %T" --pretty=format:%ad -- "$versionFile")

  # create the zip file
  zipfile=$(realpath "$outdir/$scraper_id.zip")

  name=$(grep "^name:" "$f" | cut -d' ' -f2- | sed -e 's/\r//' -e 's/^"\(.*\)"$/\1/')
  ignore=$(grep "^# ignore:" "$f" | cut -c 10- | sed -e 's/\r//')
  dep=$(grep "^# requires:" "$f" | cut -c 12- | sed -e 's/\r//')

  # always ignore package file
  ignore="-x $ignore package"

  pushd "$dir" > /dev/null
  if [ "$dir" != "./scrapers" ]; then
    # a scraper in its own directory is zipped with all of its sibling files
    zip -r "$zipfile" . ${ignore} > /dev/null
  else
    # a single-file scraper is zipped on its own
    zip "$zipfile" "$scraper_id.yml" > /dev/null
  fi
  popd > /dev/null

  # write the entry to the spec index
  echo "- id: $scraper_id
  name: $name
  version: $version
  date: $updated
  path: $scraper_id.zip
  sha256: $(sha256sum "$zipfile" | cut -d' ' -f1)" >> "$outdir"/index.yml

  # handle dependencies
  if [ -n "$dep" ]; then
    echo "  requires:" >> "$outdir"/index.yml
    for d in ${dep//,/ }; do
      echo "  - $d" >> "$outdir"/index.yml
    done
  fi

  echo "" >> "$outdir"/index.yml
}

# find all yml files in ./scrapers - these are packaged individually
for f in ./scrapers/*.yml; do
  buildScraper "$f"
done

# scrapers in subdirectories are packaged together with their supporting files
find ./scrapers/ -mindepth 2 -name "*.yml" -print0 | while read -r -d $'\0' f; do
  buildScraper "$f"
done

# handle dependency packages
find ./scrapers/ -mindepth 2 -name package -print0 | while read -r -d $'\0' f; do
  buildScraper "$f"
done
```
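
For reference, each entry this script appends to `index.yml` comes out looking roughly like the following (the values below are illustrative, not taken from a real build):

```yaml
- id: Babes
  name: Babes
  version: f9c6574
  date: 2024-03-05 12:00:00
  path: Babes.zip
  sha256: 9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08
  requires:
  - py_common
```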

New file: a README for the Aylo API scraper (34 lines added):
# The Aylo API scraper | ||

This is arguably the biggest scraper in the repo and covers a _lot_ of networks and studios. It is composed of one main file that contains the functions necessary to scrape scenes, movies, galleries and performers from the Aylo API, along with a few supporting files with functions that handle things like constructing URL slugs and caching instance tokens.

Design goals:

- Split scrapers that can handle the individual complexities of subnetworks without overcomplicating the main scraper
- Easy to modify and understand: documentation, examples

These functions are designed to be open for extension but closed to modification. What does this mean? The networks and studios in the Aylo API differ in how they construct their URLs and even in how their parent/child studio relationships are expressed, so these functions could easily take on a lot of complexity if they were to handle every special case. Instead, these scraping functions return their results in a standard format that works for most studios, while also optionally taking a postprocessing function that callers can supply to handle their special requirements.
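
A minimal sketch of that pattern (the function and field names here are illustrative, not the scraper's actual API): the scraping function builds the standard result, then hands it to a caller-supplied hook before returning it.

```python
from typing import Any, Callable

ScrapeResult = dict[str, Any]

def scrape_scene(
    api_scene: dict,
    postprocess: Callable[[ScrapeResult, dict], ScrapeResult] = lambda result, _: result,
) -> ScrapeResult:
    # Build the standard result that is correct for most studios
    result = {
        "title": api_scene["title"],
        "date": api_scene["date"],
    }
    # Callers with special requirements adjust only what they need
    return postprocess(result, api_scene)

# A hypothetical sub-scraper that needs a nonstandard URL scheme
# overrides just that one field and keeps everything else
def fix_url(result: ScrapeResult, api_scene: dict) -> ScrapeResult:
    result["url"] = f"https://www.example.com/video/{api_scene['id']}"
    return result

scene = scrape_scene({"title": "Example", "date": "2024-01-01", "id": 123}, postprocess=fix_url)
```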

The standard URL formats that can vary:

- scenes: `https://www.<brand-domain>.com/scene/<scene-id>/<scene-title-slug>`
- movies: `https://www.<brand-domain>.com/movie/<movie-id>/<movie-title-slug>`
- performers: `https://www.<brand-domain>.com/model/<performer-id>/<performer-name-slug>`

`brand-domain` is based on the parent studio: `bangbros` for Bang Bros, `gaywire` for Gay Wire, `bigstr` for BigStr (which has since consolidated under the Czech Hunter name, so those URLs are wrong!).
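
For illustration, a title slug like the ones above could be produced along these lines (a generic sketch, not necessarily the repo's exact slug routine):

```python
import re

def slugify(title: str) -> str:
    # Lowercase the title and keep only alphanumeric runs,
    # joined by hyphens: "Fun In The Sun!" -> "fun-in-the-sun"
    return "-".join(re.findall(r"[a-z0-9]+", title.lower()))
```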

The scraper uses the `parse_args` helper from [py_common](../py_common/util.py) and was developed to be ergonomic for testing and for integrating into other Python scripts.

The simplest case is exemplified by the Babes network: they use the standard URL formats, and their parent studio domain `www.babes.com` is correct for all substudios. Their scraper does not need to make any changes to the results returned by the API, so it is fully defined in [Babes.yml](../Babes.yml). The only thing it needs to do is specify which domains it should search, which can be done inline.
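
A rough sketch of what such an inline definition might look like, in the style of stash scraper YAML (the paths, arguments, and URL pattern here are assumptions, not the actual contents of Babes.yml):

```yaml
# Hypothetical minimal scraper definition; the real Babes.yml may differ
name: Babes
sceneByURL:
  - action: script
    url:
      - babes.com/scene/
    script:
      - python
      - ../AyloAPI/scrape.py
      - scene
```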