Skip to content

Commit

Permalink
fix: is_composed logic corrected and course unit id race condition
Browse files Browse the repository at this point in the history
  • Loading branch information
tomaspalma committed Jul 15, 2024
1 parent b343501 commit bde541a
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 31 deletions.
11 changes: 1 addition & 10 deletions src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,9 @@ course_units:
course_metadata:
scrapy crawl course_metadata

classes:
scrapy crawl classes

slots:
scrapy crawl slots

slot_professor:
scrapy crawl slot_professor

professors:
scrapy crawl professors

info:
python ./scrapper/info.py

Expand All @@ -44,7 +35,7 @@ upload:

convert_mysql:
@echo "Converting dump to mysql..."
@bash ./scripts/sqlite3-to-mysql.sh ./scripts/dump/data/dump_sqlite3.sql > ./scripts/dump/data/dump_mysql.sql
@bash ./scripts/sqlite3-to-mysql.sh ./scripts/dump/data/dump_sqlite3.sql > ./scripts/dump/data/01_data.sql
@echo "Convertion completed!"

clean:
Expand Down
61 changes: 40 additions & 21 deletions src/scrapper/spiders/slot_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,21 @@ def classUnitRequests(self):
"""
db.cursor.execute(sql)
self.course_units = db.cursor.fetchall()

print("current course units: ", self.course_units)

db.connection.close()

self.log("Crawling {} class units".format(len(self.course_units)))

for course_unit in self.course_units:
course_unit_id = course_unit[0]
link_id_fragment = course_unit[1] # e.g. hor_geral.ucurr_view?pv_ocorrencia_id=514985
faculty = course_unit[2]

yield Request(
url="https://sigarra.up.pt/{}/pt/{}".format(course_unit[2], course_unit[1]),
meta={'course_unit_id': course_unit[0]},
url="https://sigarra.up.pt/{}/pt/{}".format(faculty, link_id_fragment),
meta={'course_unit_id': course_unit_id},
callback=self.makeRequestToSigarraScheduleAPI,
errback=self.func
)
Expand All @@ -126,47 +134,58 @@ def func(self, error):
def makeRequestToSigarraScheduleAPI(self, response):
self.api_url = response.xpath('//div[@id="cal-shadow-container"]/@data-evt-source-url').extract_first()

yield Request(url=self.api_url, callback=self.extractSchedule)
yield Request(url=self.api_url, callback=self.extractSchedule, meta={'course_unit_id': re.search(r'uc/(\d+)/', self.api_url).group(1)})

def extractSchedule(self, response):
schedule_data = response.json()["data"]
slot_ids = set()

course_unit_id = response.meta.get('course_unit_id')

if len(schedule_data) < 1:
return

# There is only one class per slot, so we only need to create a class once
current_class_id = schedule_data[0]["id"]
current_class_name = schedule_data[0]["name"].split("_")[2]
yield Class(
id=current_class_id,
name=current_class_name,
course_unit_id=course_unit_id,
last_updated=datetime.now()
)


print("current api url: ", self.api_url)
print("SUPPOSED COURSE UNIT ID: ", course_unit_id)
print("Full schedule data: ", schedule_data[0])
print("CURRENT CLASS NAME FULL: ", schedule_data[0]["name"].split("_")[2])
print("CURRENT_CLASS_NAME: ", current_class_name)
print("CURRENT COURSE UNIT ID: ", re.search(r'uc/(\d+)/', self.api_url).group(1))
# if current_class_name == 'CPD':
# print("WHAT THE HELL COURSE UNIT ID IS: ", re.search(r'uc/(\d+)/', self.api_url).group(1))

date_format = "%Y-%m-%dT%H:%M:%S"
for schedule in schedule_data:
date_format = "%Y-%m-%dT%H:%M:%S"
start_time = datetime.strptime(schedule["start"], date_format)
end_time = datetime.strptime(schedule["end"], date_format)

if(int(schedule["id"]) in slot_ids):
continue

slot_ids.add(int(schedule["id"]))

yield Class(
id=schedule["id"],
name=schedule["name"],
course_unit_id=re.search(r'uc/(\d+)/', self.api_url).group(1),
last_updated=datetime.now()
)

yield Slot(
id=schedule["id"],
lesson_type=schedule["typology"]["acronym"],
day=self.days[schedule["week_days"][0]],
start_time=start_time.hour + (start_time.minute / 60),
duration=(end_time - start_time).total_seconds() / 3600,
location=schedule["rooms"][0]["name"],
is_composed=len(schedule["persons"]) > 0,
is_composed=len(schedule["classes"]) > 0,
professor_id=schedule["persons"][0]["sigarra_id"],
class_id=schedule["id"],
class_id=current_class_id,
last_updated=datetime.now(),
)

for teacher in schedule["persons"]:
yield Professor(
id = teacher["sigarra_id"],
professor_acronym = teacher["acronym"],
professor_name = teacher["name"].split("-")[1].strip()
professor_name = teacher["name"] #.split("-")[1].strip()
)

yield SlotProfessor(
Expand Down

0 comments on commit bde541a

Please sign in to comment.