Skip to content

Commit

Permalink
refactor: class and professor spiders refactored to new sigarra schedule format
Browse files Browse the repository at this point in the history
  • Loading branch information
tomaspalma committed Jul 7, 2024
1 parent 43f3d3c commit 4103cb6
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 210 deletions.
5 changes: 3 additions & 2 deletions src/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
.PHONY: all clean
FILES = $(shell ls)

all: info faculties courses course_units course_metadata classes slots slot_professor professors
# all: info faculties courses course_units course_metadata classes slots slot_professor professors
all: info faculties courses course_units course_metadata slots


faculties:
Expand Down Expand Up @@ -48,4 +49,4 @@ convert_mysql:

clean:
@echo "Removing database.db"
@rm ./scrapper/database/dbs/database.db
@rm ./scrapper/database/dbs/database.db
4 changes: 3 additions & 1 deletion src/scrapper/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,13 @@ class CourseMetadata(scrapy.Item):
ects = scrapy.Field()

class Class(scrapy.Item):
    """Scraped record describing one class of a course unit."""

    # Identifier of the class record.
    id = scrapy.Field()

    # Identifier of the course unit this class is attached to.
    course_unit_id = scrapy.Field()

    # Class name, e.g. 1MIEIC01.
    name = scrapy.Field()

    # Timestamp of the scrape that produced this record
    # (presumably always datetime.now() at yield time; confirm against the spiders).
    last_updated = scrapy.Field()

class Slot(scrapy.Item):
id = scrapy.Field()
lesson_type = scrapy.Field() # T, TP, PL, etc.
day = scrapy.Field() # 0 = monday, 1 = tuesday, .., 5 = saturday (no sunday)
start_time = scrapy.Field() # At what time the lesson starts
Expand All @@ -68,4 +70,4 @@ class Professor(scrapy.Item):
id = scrapy.Field()
professor_acronym = scrapy.Field()
professor_name = scrapy.Field()


8 changes: 5 additions & 3 deletions src/scrapper/spiders/class_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,11 @@ def courseUnitRequests(self):
self.course_units = db.cursor.fetchall()
db.connection.close()

print("Course units from database: ", self.course_units);

self.log("Crawling {} course units to fetch classes".format(len(self.course_units)))
for course_unit in self.course_units:
# print("https://sigarra.up.pt/{}/pt/{}".format(course_unit[2], course_unit[1]))
yield scrapy.http.Request(
url="https://sigarra.up.pt/{}/pt/{}".format(course_unit[2], course_unit[1]),
meta={'id': course_unit[0], 'faculty': course_unit[2]},
Expand All @@ -95,14 +98,13 @@ def getClassesUrl(self, response):
response.xpath('//td[@headers="t6"]/a/@href').getall()
))



for url in classesUrl:
if "turmas_view" in url:
className = (
response.xpath('//span[@class="textopequenoc"]/a[@href="'+url+'"]/text()').extract_first()
or response.xpath('//td[@headers="t6"]/a[@href="'+url+'"]/text()').extract_first()
)

yield Class(
name=className.strip(),
course_unit_id=response.meta['id'],
Expand All @@ -128,6 +130,6 @@ def extractCompositeClass(self, response):
)

def scrapyError(self, error):
    """Errback for the spider's requests: record the failure and carry on.

    The failure is sent to the spider's logger instead of being printed, so
    it ends up in the same place (and with the same formatting and level
    filtering) as the rest of the crawl output.

    Args:
        error: The failure object Scrapy passes to the errback.

    Returns:
        None. The failure is only reported, never re-raised, so the crawl
        continues with the remaining requests.
    """
    # The original note said "the scraper has no errors": failures are
    # deliberately not propagated, only reported.
    self.logger.error("Current error: %s", error)
98 changes: 0 additions & 98 deletions src/scrapper/spiders/professor_spider.py

This file was deleted.

102 changes: 0 additions & 102 deletions src/scrapper/spiders/slot_professor_spider.py

This file was deleted.

35 changes: 31 additions & 4 deletions src/scrapper/spiders/slot_spider.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import getpass
import re
import scrapy
from datetime import datetime
from scrapy.http import Request, FormRequest
Expand All @@ -10,7 +11,7 @@
from scrapper.settings import CONFIG, PASSWORD, USERNAME

from ..database.Database import Database
from ..items import Slot
from ..items import Slot, Class, SlotProfessor, Professor

def get_class_id(course_unit_id, class_name):
db = Database()
Expand Down Expand Up @@ -123,26 +124,52 @@ def func(self, error):
return

def makeRequestToSigarraScheduleAPI(self, response):
    """Request the schedule feed advertised by a Sigarra schedule page.

    The page exposes the feed URL in the ``data-evt-source-url`` attribute
    of the ``cal-shadow-container`` div. When the attribute is present, one
    request for that URL is yielded with ``extractSchedule`` as callback.

    Args:
        response: Response holding the HTML of the schedule page.

    Yields:
        At most one ``Request`` for the schedule feed. Nothing is yielded
        when the page does not advertise a feed URL.
    """
    source_url = response.xpath(
        '//div[@id="cal-shadow-container"]/@data-evt-source-url'
    ).extract_first()

    if not source_url:
        # Building a Request without a URL would raise and abort the
        # callback; report the page and move on instead.
        self.logger.warning("No schedule API URL found in %s", response.url)
        return

    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the page URL. The attribute is still set because other code
    # in the spider may read it.
    self.api_url = response.urljoin(source_url)

    yield Request(url=self.api_url, callback=self.extractSchedule)

def extractSchedule(self, response):
    """Convert a Sigarra schedule API response into scraped items.

    For every distinct entry of the response's ``"data"`` list this yields
    one ``Class``, one ``Slot`` and, for each person attached to the entry,
    one ``Professor`` followed by one ``SlotProfessor``. Entries whose
    ``"id"`` repeats an earlier one in the same response are skipped.

    Args:
        response: Response of the schedule API request. Its JSON body must
            hold the schedule entries under the ``"data"`` key, and its URL
            is expected to contain ``uc/<course unit id>/``.

    Yields:
        ``Class``, ``Slot``, ``Professor`` and ``SlotProfessor`` items.
    """
    # Take the course unit id from the URL of *this* response. The shared
    # ``self.api_url`` attribute is overwritten every time another schedule
    # page is processed, so by the time this callback runs it may already
    # belong to a different course unit. It is only used as a fallback.
    unit_match = re.search(r'uc/(\d+)/', response.url)
    if unit_match is None:
        unit_match = re.search(r'uc/(\d+)/', getattr(self, "api_url", None) or "")
    if unit_match is None:
        self.logger.error("Could not find a course unit id in %s", response.url)
        return
    course_unit_id = unit_match.group(1)

    date_format = "%Y-%m-%dT%H:%M:%S"
    seen_ids = set()

    for schedule in response.json()["data"]:
        # Drop repeated entries before doing any parsing work on them.
        schedule_id = int(schedule["id"])
        if schedule_id in seen_ids:
            continue
        seen_ids.add(schedule_id)

        start_time = datetime.strptime(schedule["start"], date_format)
        end_time = datetime.strptime(schedule["end"], date_format)
        teachers = schedule["persons"]
        rooms = schedule["rooms"]

        yield Class(
            id=schedule["id"],
            name=schedule["name"],
            course_unit_id=course_unit_id,
            last_updated=datetime.now()
        )

        yield Slot(
            id=schedule["id"],
            lesson_type=schedule["typology"]["acronym"],
            day=self.days[schedule["week_days"][0]],
            # Start time and duration are expressed in fractional hours.
            start_time=start_time.hour + (start_time.minute / 60),
            duration=(end_time - start_time).total_seconds() / 3600,
            # An entry may come without rooms or without persons; store
            # None instead of failing on the first-element lookup.
            location=rooms[0]["name"] if rooms else None,
            is_composed=len(teachers) > 0,
            professor_id=teachers[0]["sigarra_id"] if teachers else None,
            class_id=schedule["id"],
            last_updated=datetime.now(),
        )

        for teacher in teachers:
            # The name appears to be formatted as "<code> - <name>". Split
            # on the first hyphen only, so hyphenated names stay whole, and
            # keep the full text when there is no hyphen at all.
            prefix, separator, display_name = teacher["name"].partition("-")

            yield Professor(
                id=teacher["sigarra_id"],
                professor_acronym=teacher["acronym"],
                professor_name=(display_name if separator else prefix).strip()
            )

            yield SlotProfessor(
                slot_id=schedule["id"],
                professor_id=teacher["sigarra_id"]
            )

0 comments on commit 4103cb6

Please sign in to comment.