Commit
Merge pull request #11 from miztch/refactor/stepfunctions
Use StepFunctions to scrape multiple pages
miztch authored Feb 1, 2024
2 parents 04c682a + dce2379 commit b521bfb
Showing 6 changed files with 199 additions and 166 deletions.
31 changes: 31 additions & 0 deletions .devcontainer/devcontainer.json
@@ -0,0 +1,31 @@
{
  "name": "Python 3",
  "image": "mcr.microsoft.com/devcontainers/python:1-3.12-bullseye",
  "features": {
    "ghcr.io/devcontainers/features/aws-cli:1": {},
    "ghcr.io/customink/codespaces-features/sam-cli:1": {}
  },
  "customizations": {
    "vscode": {
      "extensions": [
        "amazonwebservices.aws-toolkit-vscode",
        "ms-python.black-formatter",
        "ms-python.isort"
      ],
      "settings": {
        "terminal.integrated.shell.linux": "/usr/bin/zsh",
        "[python]": {
          "editor.formatOnSave": true,
          "editor.defaultFormatter": "ms-python.black-formatter"
        },
        "isort.args": [
          "--profile",
          "black"
        ]
      }
    }
  },
  "mounts": [
    "source=${env:HOME}${env:USERPROFILE}/.aws,target=/home/vscode/.aws,type=bind,consistency=cached"
  ]
}
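The new dev container bakes in the AWS CLI and the SAM CLI, formats Python with Black and isort on save, and bind-mounts the host's ~/.aws directory so credentials configured outside the container are reused inside it. Assuming the stack is defined in a standard SAM template at the repository root, sam build and sam deploy should work inside the container without further setup.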
39 changes: 0 additions & 39 deletions functions/fanout/index.py

This file was deleted.

Empty file removed functions/fanout/requirements.txt
Empty file.
143 changes: 66 additions & 77 deletions functions/sasha/index.py
@@ -16,35 +16,35 @@
 logger = logging.getLogger()
 logger.setLevel(logging.INFO)

-dynamodb = boto3.resource('dynamodb')
-table = dynamodb.Table(os.environ['VLR_MATCHES_TABLE'])
+dynamodb = boto3.resource("dynamodb")
+table = dynamodb.Table(os.environ["VLR_MATCHES_TABLE"])

 # vlr match events cache
 vlr_events_cache = {}


 def insert(table, matches):
-    '''
+    """
     put items into specified DynamoDB table.
-    '''
+    """
     with table.batch_writer() as batch:
         for match in matches:
-            logger.info('put match info into the table: {}'.format(match))
+            logger.info("put match info into the table: {}".format(match))
             batch.put_item({k: v for k, v in match.items()})


 def sleep():
-    '''
+    """
     sleep for 1~10 secs (randomly)
-    '''
+    """
     sec = random.randint(1, 10)
     time.sleep(sec)


 def get_event_from_cache(event_url_path):
     global vlr_events_cache

-    result = ''
+    result = ""

     if event_url_path in vlr_events_cache:
         result = vlr_events_cache[event_url_path]
@@ -53,30 +53,29 @@ def get_event_from_cache(event_url_path):


 def scrape_event(event_url_path):
-    '''
+    """
     scrape event page of url_path
-    '''
+    """
     global vlr_events_cache

-    url = 'https://www.vlr.gg{}'.format(event_url_path)
-    logger.info('get event info: {}'.format(url))
+    url = "https://www.vlr.gg{}".format(event_url_path)
+    logger.info("get event info: {}".format(url))

     resp = requests.get(url, headers=headers)
     html = HTMLParser(resp.text)

-    event_id = int(event_url_path.split('/')[2])
+    event_id = int(event_url_path.split("/")[2])

-    event_name = html.css_first('.wf-title').text().strip()
-    event_name = event_name.replace('\t', '').replace('\n', '')
+    event_name = html.css_first(".wf-title").text().strip()
+    event_name = event_name.replace("\t", "").replace("\n", "")

-    country_flag = html.css_first(
-        '.event-desc-item-value .flag').attributes['class']
-    country_flag = country_flag.replace(' mod-', '_').replace('flag_', '')
+    country_flag = html.css_first(".event-desc-item-value .flag").attributes["class"]
+    country_flag = country_flag.replace(" mod-", "_").replace("flag_", "")

     data = {
-        'event_id': event_id,
-        'event_name': event_name,
-        'country_flag': country_flag
+        "event_id": event_id,
+        "event_name": event_name,
+        "country_flag": country_flag,
     }

     # caching
@@ -86,107 +85,97 @@ def scrape_event(event_url_path):


 def scrape_match(match_url_path):
-    '''
+    """
     scrape match page of url_path
-    '''
+    """
     global vlr_events_cache

-    url = 'https://www.vlr.gg{}'.format(match_url_path)
-    logger.info('get match info: {}'.format(url))
+    url = "https://www.vlr.gg{}".format(match_url_path)
+    logger.info("get match info: {}".format(url))

     resp = requests.get(url, headers=headers)
     html = HTMLParser(resp.text)

-    match_id = int(match_url_path.split('/')[1])
+    match_id = int(match_url_path.split("/")[1])

-    match_name = html.css_first('.match-header-event-series').text()
-    match_name = match_name.replace('\t', '').replace('\n', '')
+    match_name = html.css_first(".match-header-event-series").text()
+    match_name = match_name.replace("\t", "").replace("\n", "")

-    start_time = html.css_first('.moment-tz-convert').attributes['data-utc-ts']
-    with_timezone = ' '.join([start_time, 'EST'])
+    start_time = html.css_first(".moment-tz-convert").attributes["data-utc-ts"]
+    with_timezone = " ".join([start_time, "EST"])

-    tzinfo = {'EST': tz.gettz('America/New_York'),
-              'CST': tz.gettz('America/Chicago')}
+    tzinfo = {"EST": tz.gettz("America/New_York"), "CST": tz.gettz("America/Chicago")}
     start_time_est = parse(with_timezone, tzinfos=tzinfo)
-    start_time_utc = start_time_est.astimezone(tz.gettz('Etc/GMT'))
-    start_time_utc = datetime.strftime(start_time_utc, '%Y-%m-%dT%H:%M:%S%z')
+    start_time_utc = start_time_est.astimezone(tz.gettz("Etc/GMT"))
+    start_time_utc = datetime.strftime(start_time_utc, "%Y-%m-%dT%H:%M:%S%z")

-    teams = html.css('.wf-title-med')
-    teams = [t.text().replace('\t', '').replace('\n', '') for t in teams]
+    teams = html.css(".wf-title-med")
+    teams = [t.text().replace("\t", "").replace("\n", "") for t in teams]

-    best_of = html.css('.match-header-vs-note')[-1].text()
-    best_of = best_of.replace('Bo', '').replace(' Maps', '')
-    best_of = best_of.replace('\t', '').replace('\n', '')
+    best_of = html.css(".match-header-vs-note")[-1].text()
+    best_of = best_of.replace("Bo", "").replace(" Maps", "")
+    best_of = best_of.replace("\t", "").replace("\n", "")
     best_of = int(best_of)

-    event_url_path = html.css_first('a.match-header-event').attributes['href']
+    event_url_path = html.css_first("a.match-header-event").attributes["href"]

     if event_url_path in vlr_events_cache:
-        logger.info('get event info from cache: {}'.format(event_url_path))
+        logger.info("get event info from cache: {}".format(event_url_path))
         event_info = vlr_events_cache[event_url_path]
     else:
-        logger.info('get event info from website: {}'.format(event_url_path))
+        logger.info("get event info from website: {}".format(event_url_path))
         event_info = scrape_event(event_url_path)

     data = {
-        'match_id': match_id,
-        'event_name': event_info['event_name'],
-        'event_country_flag': event_info['country_flag'],
-        'start_time': start_time_utc,
-        'best_of': best_of,
-        'match_name': match_name,
-        'teams': teams
+        "match_id": match_id,
+        "event_name": event_info["event_name"],
+        "event_country_flag": event_info["country_flag"],
+        "start_time": start_time_utc,
+        "best_of": best_of,
+        "match_name": match_name,
+        "teams": teams,
     }
     return data


 def scrape_matches(page: str = 1):
-    '''
+    """
     scrape /matches page
-    '''
-    url = 'https://www.vlr.gg/matches?page={}'.format(page)
-    logger.info('fetch matches list from: {}'.format(url))
+    """
+    url = "https://www.vlr.gg/matches?page={}".format(page)
+    logger.info("fetch matches list from: {}".format(url))

     resp = requests.get(url, headers=headers)
     html = HTMLParser(resp.text)

     matches = []

-    for item in html.css('a.wf-module-item'):
-        match_url_path = item.attributes['href']
+    for item in html.css("a.wf-module-item"):
+        match_url_path = item.attributes["href"]

         sleep()
         match_detail = scrape_match(match_url_path)

         item = {
-            'id': match_detail['match_id'],
-            'eventName': match_detail['event_name'],
-            'eventCountryFlag': match_detail['event_country_flag'],
-            'startTime': match_detail['start_time'],
-            'bestOf': match_detail['best_of'],
-            'matchName': match_detail['match_name'],
-            'teams': [{'title': team} for team in match_detail['teams']],
-            'pagePath': match_url_path
+            "id": match_detail["match_id"],
+            "eventName": match_detail["event_name"],
+            "eventCountryFlag": match_detail["event_country_flag"],
+            "startTime": match_detail["start_time"],
+            "bestOf": match_detail["best_of"],
+            "matchName": match_detail["match_name"],
+            "teams": [{"title": team} for team in match_detail["teams"]],
+            "pagePath": match_url_path,
         }
-        logger.info('add match to the list: {}'.format(item))
+        logger.info("add match to the list: {}".format(item))
         matches.append(item)

     return matches


 def lambda_handler(event, context):
-    records = event['Records']
-    match_list = []
+    page = str(event["page"])

-    for record in records:
-        body = json.loads(record['body'])
-        page = str(body['page'])
+    matches = scrape_matches(page)
+    insert(table, matches)

-        matches = scrape_matches(page)
-        match_list.extend(matches)
-
-    insert(table, match_list)
-
-    return {
-        'matches_count': len(match_list)
-    }
+    return {"matches_count": len(matches)}
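With this change, each Lambda invocation scrapes exactly one listing page: the Step Functions Map state passes a single page number as the event instead of the old SQS-style Records batch. A minimal local sanity check might look like the sketch below (the table name and import path are assumptions, and running it really does hit vlr.gg and write to the DynamoDB table):

import os

# index.py creates the DynamoDB table resource at import time, so the
# environment variable has to exist before the import (table name is hypothetical).
os.environ.setdefault("VLR_MATCHES_TABLE", "vlr-matches")

import index  # assumes the working directory is functions/sasha/

# One page number per invocation, exactly as the Map state's ItemSelector builds it.
result = index.lambda_handler({"page": 1}, None)
print(result)  # e.g. {'matches_count': 50}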
58 changes: 58 additions & 0 deletions statemachine/scraping.asl.json
@@ -0,0 +1,58 @@
{
  "Comment": "Scrape defined vlr.gg match pages",
  "StartAt": "Count pages to scrape",
  "States": {
    "Count pages to scrape": {
      "Type": "Pass",
      "Parameters": {
        "pages.$": "States.ArrayRange(1,${PagesToScrape},1)"
      },
      "Next": "Queue Lambda Functions"
    },
    "Queue Lambda Functions": {
      "Type": "Map",
      "MaxConcurrency": 1,
      "InputPath": "$.pages",
      "ItemSelector": {
        "page.$": "$$.Map.Item.Value"
      },
      "ItemProcessor": {
        "ProcessorConfig": {
          "Mode": "INLINE"
        },
        "StartAt": "Wait",
        "States": {
          "Wait": {
            "Type": "Wait",
            "Seconds": 30,
            "Next": "Scrape Matches"
          },
          "Scrape Matches": {
            "Type": "Task",
            "Resource": "${SashaFunctionArn}:$LATEST",
            "OutputPath": "$.Payload",
            "Parameters": {
              "page.$": "$.page"
            },
            "ResultPath": "$.Payload.Result",
            "Retry": [
              {
                "ErrorEquals": [
                  "Lambda.ServiceException",
                  "Lambda.AWSLambdaException",
                  "Lambda.SdkClientException",
                  "Lambda.TooManyRequestsException"
                ],
                "IntervalSeconds": 1,
                "MaxAttempts": 3,
                "BackoffRate": 2
              }
            ],
            "End": true
          }
        }
      },
      "End": true
    }
  }
}
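The state machine now does the fan-out previously handled by the deleted functions/fanout Lambda: the Pass state expands States.ArrayRange(1,${PagesToScrape},1) into the list [1, 2, ..., PagesToScrape], and the Map state walks that list with MaxConcurrency 1, waiting 30 seconds before each Lambda invocation, so pages are scraped one at a time rather than in parallel. A rough sketch of starting an execution by hand follows (the state machine ARN is a placeholder; the real value comes from the deployed stack):

import json

import boto3

sfn = boto3.client("stepfunctions")

# Placeholder ARN; look up the real value in the CloudFormation/SAM stack outputs.
state_machine_arn = "arn:aws:states:us-east-1:123456789012:stateMachine:vlr-scraping"

# No input is needed: the first Pass state builds the page list itself
# from the PagesToScrape template parameter.
response = sfn.start_execution(
    stateMachineArn=state_machine_arn,
    input=json.dumps({}),
)
print(response["executionArn"])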
