diff --git a/src/Makefile b/src/Makefile index 55e4b25..4f141d0 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,7 +1,8 @@ .PHONY: all clean FILES = $(shell ls) -all: info faculties courses course_units course_metadata classes slots slot_professor professors +# all: info faculties courses course_units course_metadata classes slots slot_professor professors +all: info faculties courses course_units course_metadata slots faculties: @@ -16,18 +17,9 @@ course_units: course_metadata: scrapy crawl course_metadata -classes: - scrapy crawl classes - slots: scrapy crawl slots -slot_professor: - scrapy crawl slot_professor - -professors: - scrapy crawl professors - info: python ./scrapper/info.py @@ -43,9 +35,9 @@ upload: convert_mysql: @echo "Converting dump to mysql..." - @bash ./scripts/sqlite3-to-mysql.sh ./scripts/dump/data/dump_sqlite3.sql > ./scripts/dump/data/dump_mysql.sql + @bash ./scripts/sqlite3-to-mysql.sh ./scripts/dump/data/dump_sqlite3.sql > ./scripts/dump/data/01_data.sql @echo "Convertion completed!" clean: @echo "Removing database.db" - @rm ./scrapper/database/dbs/database.db \ No newline at end of file + @rm ./scrapper/database/dbs/database.db diff --git a/src/scrapper/items.py b/src/scrapper/items.py index 7f4526a..1aa6f13 100644 --- a/src/scrapper/items.py +++ b/src/scrapper/items.py @@ -7,6 +7,7 @@ import scrapy + class Faculty(scrapy.Item): acronym = scrapy.Field() name = scrapy.Field() @@ -37,18 +38,22 @@ class CourseUnit(scrapy.Item): classes_url = scrapy.Field() last_updated = scrapy.Field() + class CourseMetadata(scrapy.Item): course_id = scrapy.Field() course_unit_id = scrapy.Field() course_unit_year = scrapy.Field() ects = scrapy.Field() + class Class(scrapy.Item): course_unit_id = scrapy.Field() name = scrapy.Field() # 1MIEIC01 last_updated = scrapy.Field() - + + class Slot(scrapy.Item): + id = scrapy.Field() lesson_type = scrapy.Field() # T, TP, PL, etc. day = scrapy.Field() # 0 = monday, 1 = tuesday, .., 5 = saturday (no sunday) start_time = scrapy.Field() # At what time the lesson starts @@ -64,8 +69,8 @@ class SlotProfessor(scrapy.Item): slot_id = scrapy.Field() professor_id = scrapy.Field() + class Professor(scrapy.Item): id = scrapy.Field() professor_acronym = scrapy.Field() professor_name = scrapy.Field() - \ No newline at end of file diff --git a/src/scrapper/spiders/class_spider.py b/src/scrapper/spiders/class_spider.py deleted file mode 100644 index bd19d7f..0000000 --- a/src/scrapper/spiders/class_spider.py +++ /dev/null @@ -1,133 +0,0 @@ -import getpass -import scrapy -from datetime import datetime -import urllib.parse -from configparser import ConfigParser, ExtendedInterpolation -import json - -from scrapper.settings import CONFIG, PASSWORD, USERNAME - -from ..database.Database import Database -from ..items import Class - -class ClassSpider(scrapy.Spider): - name = "classes" - allowed_domains = ['sigarra.up.pt'] - login_page_base = 'https://sigarra.up.pt/feup/pt/mob_val_geral.autentica' - - def __init__(self, *args, **kwargs): - super(ClassSpider, self).__init__(*args, **kwargs) - self.open_config() - self.user = CONFIG[USERNAME] - self.password = CONFIG[PASSWORD] - - def open_config(self): - """ - Reads and saves the configuration file. 
- """ - config_file = "./config.ini" - self.config = ConfigParser(interpolation=ExtendedInterpolation()) - self.config.read(config_file) - - - def format_login_url(self): - return '{}?{}'.format(self.login_page_base, urllib.parse.urlencode({ - 'pv_login': self.user, - 'pv_password': self.password - })) - - def start_requests(self): - """This function is called before crawling starts.""" - - if self.password is None: - self.password = getpass.getpass(prompt='Password: ', stream=None) - - yield scrapy.http.Request(url=self.format_login_url(), callback=self.check_login_response) - - def check_login_response(self, response): - """Check the response returned by a login request to see if we are - successfully logged in. Since we used the mobile login API endpoint, - we can just check the status code. - """ - - if response.status == 200: - response_body = json.loads(response.body) - if response_body['authenticated']: - self.log("Successfully logged in. Let's start crawling!") - return self.courseUnitRequests() - else: - message = 'Login failed. SIGARRA\'s response: error type "{}";\nerror message "{}"'.format( - response_body.erro, response_body.erro_msg) - print(message, flush=True) - self.log(message) - else: - print('Login Failed. HTTP Error {}'.format(response.status), flush=True) - self.log('Login Failed. HTTP Error {}'.format(response.status)) - - def courseUnitRequests(self): - db = Database() - sql = """ - SELECT course_unit.id, course_unit.schedule_url, course.faculty_id - FROM course_unit JOIN course - ON course_unit.course_id = course.id - WHERE schedule_url IS NOT NULL - """ - db.cursor.execute(sql) - self.course_units = db.cursor.fetchall() - db.connection.close() - - self.log("Crawling {} course units to fetch classes".format(len(self.course_units))) - for course_unit in self.course_units: - yield scrapy.http.Request( - url="https://sigarra.up.pt/{}/pt/{}".format(course_unit[2], course_unit[1]), - meta={'id': course_unit[0], 'faculty': course_unit[2]}, - callback=self.getClassesUrl, - errback=self.scrapyError - ) - - def getClassesUrl(self, response): - if response.xpath('//div[@id="erro"]/h2/text()').extract_first() == "Sem Resultados": - yield None - - classesUrl = list(set( - response.xpath('//span[@class="textopequenoc"]/a/@href').getall() - + - response.xpath('//td[@headers="t6"]/a/@href').getall() - )) - - - - for url in classesUrl: - if "turmas_view" in url: - className = ( - response.xpath('//span[@class="textopequenoc"]/a[@href="'+url+'"]/text()').extract_first() - or response.xpath('//td[@headers="t6"]/a[@href="'+url+'"]/text()').extract_first() - ) - yield Class( - name=className.strip(), - course_unit_id=response.meta['id'], - last_updated=datetime.now() - ) - elif "composto_desc" in url: - yield scrapy.http.Request( - url="https://sigarra.up.pt/{}/pt/{}".format(response.meta['faculty'], url), - meta={'id': response.meta['id']}, - callback=self.extractCompositeClass, - errback=self.scrapyError - ) - else: - yield None - - def extractCompositeClass(self, response): - classesNames = response.xpath('//div[@id="conteudoinner"]/li/a/text()').getall() - for className in classesNames: - yield Class( - name=className.strip(), - course_unit_id=response.meta['id'], - last_updated=datetime.now() - ) - - def scrapyError(self, error): - # print(error) - # O Scrapper não tem erros - return diff --git a/src/scrapper/spiders/professor_spider.py b/src/scrapper/spiders/professor_spider.py deleted file mode 100644 index 33dec40..0000000 --- a/src/scrapper/spiders/professor_spider.py +++ 
/dev/null @@ -1,98 +0,0 @@ -import getpass -import scrapy -from scrapy.http import Request, FormRequest -from urllib.parse import urlencode -from configparser import ConfigParser, ExtendedInterpolation -import json - -from scrapper.settings import CONFIG, PASSWORD, USERNAME -from ..database.Database import Database -from dotenv import dotenv_values -from ..items import Professor -import pandas as pd - - -class ProfessorSpider(scrapy.Spider): - name = "professors" - allowed_domains = ['sigarra.up.pt'] - login_page_base = 'https://sigarra.up.pt/feup/pt/mob_val_geral.autentica' - password = None - - - def open_config(self): - """ - Reads and saves the configuration file. - """ - config_file = "./config.ini" - self.config = ConfigParser(interpolation=ExtendedInterpolation()) - self.config.read(config_file) - - def __init__(self, password=None, category=None, *args, **kwargs): - super(ProfessorSpider, self).__init__(*args, **kwargs) - self.open_config() - self.user = CONFIG[USERNAME] - self.password = CONFIG[PASSWORD] - - def format_login_url(self): - return '{}?{}'.format(self.login_page_base, urlencode({ - 'pv_login': self.user, - 'pv_password': self.password - })) - - def start_requests(self): - "This function is called before crawling starts." - if self.password is None: - self.password = getpass.getpass(prompt='Password: ', stream=None) - - yield Request(url=self.format_login_url(), callback=self.check_login_response, errback=self.login_response_err) - - def login_response_err(self, failure): - print('Login failed. SIGARRA\'s response: error type 404;\nerror message "{}"'.format(failure)) - print("Check your password") - - def check_login_response(self, response): - """Check the response returned by a login request to see if we are - successfully logged in. Since we used the mobile login API endpoint, - we can just check the status code. - """ - - if response.status == 200: - response_body = json.loads(response.body) - if response_body['authenticated']: - self.log("Successfully logged in. 
Let's start crawling!") - return self.scheduleRequests() - - - def scheduleRequests(self): - print("Gathering professors") - db = Database() - - sql = """ - SELECT slot_professor.professor_id, url - FROM course_unit JOIN class JOIN slot JOIN slot_professor - ON course_unit.id = class.course_unit_id AND class.id = slot.class_id AND slot.id = slot_professor.slot_id - GROUP BY slot_professor.professor_id - """ - db.cursor.execute(sql) - self.prof_info = db.cursor.fetchall() - db.connection.close() - - self.log("Crawling {} schedules".format(len(self.prof_info))) - - - for (id, url) in self.prof_info: - faculty = url.split('/')[3] - yield scrapy.http.Request( - url="https://sigarra.up.pt/{}/pt/func_geral.FormView?p_codigo={}".format(faculty, id), - meta={'professor_id': id}, - callback=self.extractProfessors) - - def extractProfessors(self, response): - name = response.xpath('//table[@class="tabelasz"]/tr[1]/td[2]/b/text()').extract_first() - acronym = response.xpath('//table[@class="tabelasz"]/tr[2]/td[2]/b/text()').extract_first() - return Professor( - id = response.meta['professor_id'], - professor_acronym = acronym, - professor_name = name - ) - diff --git a/src/scrapper/spiders/slot_professor_spider.py b/src/scrapper/spiders/slot_professor_spider.py deleted file mode 100644 index 8a5c38f..0000000 --- a/src/scrapper/spiders/slot_professor_spider.py +++ /dev/null @@ -1,102 +0,0 @@ -import getpass -import scrapy -from scrapy.http import Request, FormRequest -from urllib.parse import urlencode -from configparser import ConfigParser, ExtendedInterpolation -import json - -from scrapper.settings import CONFIG, PASSWORD, USERNAME -from ..database.Database import Database -from ..items import SlotProfessor - - -class SlotProfessorSpider(scrapy.Spider): - name = 'slot_professor' - allowed_domains = ['sigarra.up.pt'] - login_page_base = 'https://sigarra.up.pt/feup/pt/mob_val_geral.autentica' - password = None - - def open_config(self): - """ - Reads and saves the configuration file. - """ - config_file = "./config.ini" - self.config = ConfigParser(interpolation=ExtendedInterpolation()) - self.config.read(config_file) - - def __init__(self, password=None, category=None, *args, **kwargs): - super(SlotProfessorSpider, self).__init__(*args, **kwargs) - self.open_config() - self.user = CONFIG[USERNAME] - self.password = CONFIG[PASSWORD] - - def format_login_url(self): - return '{}?{}'.format(self.login_page_base, urlencode({ - 'pv_login': self.user, - 'pv_password': self.password - })) - - def start_requests(self): - "This function is called before crawling starts." - if self.password is None: - self.password = getpass.getpass(prompt='Password: ', stream=None) - - yield Request(url=self.format_login_url(), callback=self.check_login_response, errback=self.login_response_err) - - def login_response_err(self, failure): - print('Login failed. SIGARRA\'s response: error type 404;\nerror message "{}"'.format(failure)) - print("Check your password") - - def check_login_response(self, response): - """Check the response returned by a login request to see if we are - successfully logged in. Since we used the mobile login API endpoint, - we can just check the status code. - """ - - if response.status == 200: - response_body = json.loads(response.body) - if response_body['authenticated']: - self.log("Successfully logged in. 
Let's start crawling!") - return self.slotRequests() - - - def slotRequests(self): - print("Gathering professors' metadata") - db = Database() - - sql = """ - SELECT url, is_composed, slot.professor_id, slot.id - FROM course_unit JOIN class JOIN slot - ON course_unit.id = class.course_unit_id AND class.id = slot.class_id - """ - db.cursor.execute(sql) - self.prof_info = db.cursor.fetchall() - db.connection.close() - - self.log("Crawling {} slots".format(len(self.prof_info))) - - for (url, is_composed, professor_id, slot_id) in self.prof_info: - faculty = url.split('/')[3] - - # It is not the sigarra's professor id, but the link to the list of professors. - if is_composed: - yield scrapy.http.Request( - url="https://sigarra.up.pt/{}/pt/hor_geral.composto_doc?p_c_doc={}".format(faculty, professor_id), - meta={'slot_id': slot_id}, - dont_filter=True, - callback=self.extractCompoundProfessors) - else: - # It is the sigarra's professor id. - yield SlotProfessor( - slot_id=slot_id, - professor_id=professor_id, - ) - - def extractCompoundProfessors(self, response): - professors = response.xpath('//*[@id="conteudoinner"]/li/a/@href').extract() - - for professor_link in professors: - yield SlotProfessor( - slot_id=response.meta['slot_id'], - professor_id=professor_link.split('=')[1], - ) diff --git a/src/scrapper/spiders/slot_spider.py b/src/scrapper/spiders/slot_spider.py index 23a5cc1..a345f7a 100644 --- a/src/scrapper/spiders/slot_spider.py +++ b/src/scrapper/spiders/slot_spider.py @@ -1,15 +1,18 @@ import getpass +import re import scrapy from datetime import datetime from scrapy.http import Request, FormRequest import urllib.parse from configparser import ConfigParser, ExtendedInterpolation import json +from datetime import time from scrapper.settings import CONFIG, PASSWORD, USERNAME -from ..database.Database import Database -from ..items import Slot +from ..database.Database import Database +from ..items import Slot, Class, SlotProfessor, Professor + def get_class_id(course_unit_id, class_name): db = Database() @@ -19,16 +22,16 @@ def get_class_id(course_unit_id, class_name): ON course_unit.id = class.course_unit_id WHERE course_unit.id = {} AND class.name = '{}' """.format(course_unit_id, class_name) - + db.cursor.execute(sql) class_id = db.cursor.fetchone() db.connection.close() - if (class_id == None): # TODO: verificar casos em que a aula já esta na db mas for some reason não foi encontrada + if (class_id == None): # TODO: verificar casos em que a aula já esta na db mas for some reason não foi encontrada # db2 = Database() # sql = """ # SELECT course_unit.url - # FROM course_unit + # FROM course_unit # WHERE course_unit.id = {} # """.format(course_unit_id) @@ -36,22 +39,24 @@ def get_class_id(course_unit_id, class_name): # class_url = db2.cursor.fetchone() # db2.connection.close() # print("Class not found: ", class_url[0]) - return None + return None return class_id[0] + class SlotSpider(scrapy.Spider): name = "slots" allowed_domains = ['sigarra.up.pt'] login_page_base = 'https://sigarra.up.pt/feup/pt/mob_val_geral.autentica' - days = {'Segunda': 0, 'Terça': 1, 'Quarta': 2, - 'Quinta': 3, 'Sexta': 4, 'Sábado': 5} - # password = None + days = {'Segunda-feira': 0, 'Terça-feira': 1, 'Quarta-feira': 2, + 'Quinta-feira': 3, 'Sexta-feira': 4, 'Sábado': 5} def __init__(self, password=None, category=None, *args, **kwargs): super(SlotSpider, self).__init__(*args, **kwargs) self.open_config() self.user = CONFIG[USERNAME] self.password = CONFIG[PASSWORD] + self.professor_name_pattern = 
"\d+\s-\s[A-zÀ-ú](\s[A-zÀ-ú])*" + self.inserted_teacher_ids = set() def open_config(self): """ @@ -59,8 +64,7 @@ def open_config(self): """ config_file = "./config.ini" self.config = ConfigParser(interpolation=ExtendedInterpolation()) - self.config.read(config_file) - + self.config.read(config_file) def format_login_url(self): return '{}?{}'.format(self.login_page_base, urllib.parse.urlencode({ @@ -93,11 +97,12 @@ def check_login_response(self, response): print(message, flush=True) self.log(message) else: - print('Login Failed. HTTP Error {}'.format(response.status), flush=True) + print('Login Failed. HTTP Error {}'.format( + response.status), flush=True) self.log('Login Failed. HTTP Error {}'.format(response.status)) def classUnitRequests(self): - db = Database() + db = Database() sql = """ SELECT course_unit.id, course_unit.schedule_url, course.faculty_id FROM course JOIN course_metadata JOIN course_unit @@ -106,268 +111,119 @@ def classUnitRequests(self): """ db.cursor.execute(sql) self.course_units = db.cursor.fetchall() + db.connection.close() self.log("Crawling {} class units".format(len(self.course_units))) + for course_unit in self.course_units: - yield Request( - url="https://sigarra.up.pt/{}/pt/{}".format(course_unit[2], course_unit[1]), - meta={'course_unit_id': course_unit[0]}, - callback=self.extractSchedule, + course_unit_id = course_unit[0] + # e.g. hor_geral.ucurr_view?pv_ocorrencia_id=514985 + link_id_fragment = course_unit[1] + faculty = course_unit[2] + + yield Request( + url="https://sigarra.up.pt/{}/pt/{}".format( + faculty, link_id_fragment), + meta={'course_unit_id': course_unit_id}, + callback=self.makeRequestToSigarraScheduleAPI, errback=self.func ) - + def func(self, error): # # O scrapper não tem erros - # print(error) + print("An error has occured: ", error) return + def makeRequestToSigarraScheduleAPI(self, response): + self.api_url = response.xpath( + '//div[@id="cal-shadow-container"]/@data-evt-source-url').extract_first() + + yield Request(url=self.api_url, callback=self.extractSchedule, meta={'course_unit_id': re.search(r'uc/(\d+)/', self.api_url).group(1)}) + def extractSchedule(self, response): - # Check if there is no schedule available - if response.xpath('//div[@id="erro"]/h2/text()').extract_first() == "Sem Resultados": - yield None - - # Classes in timetable - for schedule in response.xpath('//table[@class="horario"]'): - # This array represents the rowspans left in the current row - # It is used because when a row has rowspan > 1, the table - # will seem to be missing a column and can cause out of sync errors - # between the HTML table and its memory representation - rowspans = [0, 0, 0, 0, 0, 0] - hour = 8 - for row in schedule.xpath('./tr[not(th)]'): - cols = row.xpath('./td[not(contains(@class, "k"))]') - cols_iter = iter(cols) - - # 0 -> Monday, 1 -> Tuesday, ..., 5 -> Saturday (No sunday) - for cur_day in range(0, 6): - if rowspans[cur_day] > 0: - rowspans[cur_day] -= 1 - - # If there is a class in the current column, then just - # skip it - if rowspans[cur_day] > 0: - continue - - cur_col = next(cols_iter) - class_duration = cur_col.xpath('@rowspan').extract_first() - if class_duration is not None: - rowspans[cur_day] = int(class_duration) - yield self.extractClassSchedule( - response, - cur_col, - cur_day, - hour, - int(class_duration) / 2, - response.meta['course_unit_id'] - ) - - hour += 0.5 - - # Overlapping classes - for row in response.xpath('//table[@class="dados"]/tr[not(th)]'): - yield self.extractOverlappingClassSchedule(response, 
row, response.meta['course_unit_id']) - - def extractClassSchedule(self, response, cell, day, start_time, duration, course_unit_id): - lesson_type = cell.xpath( - 'b/text()').extract_first().strip().replace('(', '', 1).replace(')', '', 1) - table = cell.xpath('table/tr') - location = table.xpath('td/a/text()').extract_first() - professor_link = table.xpath('td[@class="textod"]//a/@href').extract_first() - is_composed = 'composto_doc' in professor_link - professor_id = professor_link.split('=')[1] - - clazz = cell.xpath('span/a') - class_name = clazz.xpath('text()').extract_first() - class_url = clazz.xpath('@href').extract_first() - - # If true, this means the class is composed of more than one class - # And an additional request must be made to obtain all classes - if "hor_geral.composto_desc" in class_url: - return response.follow( - class_url, - dont_filter=True, - meta={ - 'course_unit_id': course_unit_id, - 'lesson_type': lesson_type, - 'start_time': start_time, - 'is_composed': is_composed, - 'professor_id': professor_id, - 'location': location, - 'day': day, - 'duration': duration - }, - callback=self.extractComposedClasses - ) + schedule_data = response.json()["data"] + course_unit_id = response.meta.get('course_unit_id') - class_id = get_class_id(course_unit_id, class_name) - if (class_id != None): - return Slot( - lesson_type=lesson_type, - day=day, - start_time=start_time, - duration=duration, - location=location, - is_composed=is_composed, - professor_id=professor_id, - class_id=class_id, - last_updated=datetime.now(), - ) - else: - return None + if len(schedule_data) < 1: + return - def extractComposedClasses(self, response): - class_names = response.xpath( - '//div[@id="conteudoinner"]/li/a/text()').extract() + date_format = "%Y-%m-%dT%H:%M:%S" - for class_name in class_names: - class_id = get_class_id(response.meta['course_unit_id'], class_name) - if (class_id != None): - yield Slot( - lesson_type=response.meta['lesson_type'], - day=response.meta['day'], - start_time=response.meta['start_time'], - duration=response.meta['duration'], - location=response.meta['location'], - is_composed=response.meta['is_composed'], - professor_id=response.meta['professor_id'], - class_id=class_id, + inserted_slots_ids = [] + for schedule in schedule_data: + if(schedule['id'] in inserted_slots_ids): continue + inserted_slots_ids.append(schedule['id']) + + start_time = datetime.strptime(schedule["start"], date_format) + end_time = datetime.strptime(schedule["end"], date_format) + + for teacher in schedule["persons"]: + (sigarra_id, name) = self.get_professor_info(teacher) + + if sigarra_id in self.inserted_teacher_ids: + continue + + self.inserted_teacher_ids.add(sigarra_id) + + yield Professor( + id=sigarra_id, + professor_acronym=teacher["acronym"], + professor_name=name + ) + + for current_class in schedule["classes"]: + yield Class( + name=current_class["name"], + course_unit_id=course_unit_id, last_updated=datetime.now() ) - else: - yield None - - def extractOverlappingClassSchedule(self, response, row, course_unit_id): - day_str = row.xpath('td[2]/text()').extract_first() - time_str = row.xpath('td[3]/text()').extract_first() - - day = self.days[day_str] - hours, minutes = time_str.split(':') - start_time = int(hours) - - if int(minutes) > 0: - start_time += int(minutes) / 60 - - lesson_type = row.xpath( - 'td[1]/text()').extract_first().strip().replace('(', '', 1).replace(')', '', 1) - location = row.xpath('td[4]/a/text()').extract_first() - professor_link = 
row.xpath('td[@headers="t5"]/a/@href').extract_first() - is_composed = 'composto_doc' in professor_link - professor_id = professor_link.split('=')[1] - - clazz = row.xpath('td[6]/a') - class_name = clazz.xpath('text()').extract_first() - class_url = clazz.xpath('@href').extract_first() - - # If true, this means the class is composed of more than one class - # And an additional request must be made to obtain all classes - if "hor_geral.composto_desc" in class_url: - return response.follow( - class_url, - dont_filter=True, - meta={ - 'course_unit_id': course_unit_id, - 'lesson_type': lesson_type, - 'start_time': start_time, - 'is_composed': is_composed, - 'professor_id': professor_id, - 'location': location, - 'day': day - }, - callback=self.extractDurationFromComposedOverlappingClasses - ) - return response.follow( - class_url, - dont_filter=True, - meta={ - 'course_unit_id': course_unit_id, - 'lesson_type': lesson_type, - 'start_time': start_time, - 'is_composed': is_composed, - 'professor_id': professor_id, - 'location': location, - 'day': day, - 'class_name': class_name - }, - callback=self.extractDurationFromOverlappingClass - ) - - def extractDurationFromComposedOverlappingClasses(self, response): - classes = response.xpath('//div[@id="conteudoinner"]/li/a') - - for clazz in classes: - class_name = clazz.xpath('./text()').extract_first() - class_url = clazz.xpath('@href').extract_first() - - yield response.follow( - class_url, - dont_filter=True, - meta={ - 'course_unit_id': response.meta['course_unit_id'], - 'lesson_type': response.meta['lesson_type'], - 'start_time': response.meta['start_time'], - 'is_composed': response.meta['is_composed'], - 'professor_id': response.meta['professor_id'], - 'location': response.meta['location'], - 'day': response.meta['day'], - 'class_name': class_name - }, - callback=self.extractDurationFromOverlappingClass - ) + # INFO Since we need to know at runtime the id of the slot so that we can then create SlotProfessor + # instances, we are going to be using the same id as the class in order to minimize database lookups + # during the runtime of the scrapper + current_class_id = get_class_id( + course_unit_id, current_class["name"]) - def extractDurationFromOverlappingClass(self, response): - day = response.meta['day'] - start_time = response.meta['start_time'] - duration = None - - # Classes in timetable - for schedule in response.xpath('//table[@class="horario"]'): - # This array represents the rowspans left in the current row - # It is used because when a row has rowspan > 1, the table - # will seem to be missing a column and can cause out of sync errors - # between the HTML table and its memory representation - rowspans = [0, 0, 0, 0, 0, 0] - hour = 8 - for row in schedule.xpath('./tr[not(th)]'): - cols = row.xpath('./td[not(contains(@class, "k"))]') - cols_iter = iter(cols) - - # 0 -> Monday, 1 -> Tuesday, ..., 5 -> Saturday (No sunday) - for cur_day in range(0, 6): - if rowspans[cur_day] > 0: - rowspans[cur_day] -= 1 - - # If there is a class in the current column, then just - # skip it - if rowspans[cur_day] > 0: - continue - - cur_col = next(cols_iter) - class_duration = cur_col.xpath('@rowspan').extract_first() - if class_duration is not None: - rowspans[cur_day] = int(class_duration) - if cur_day == day and start_time == hour: - duration = int(class_duration) / 2 - - hour += 0.5 - - if duration is None: - return None - - class_id = get_class_id(response.meta['course_unit_id'], response.meta['class_name']) - if (class_id != None): - yield Slot( - 
lesson_type=response.meta['lesson_type'], - day=day, - start_time=start_time, - duration=duration, - location=response.meta['location'], - is_composed=response.meta['is_composed'], - professor_id=response.meta['professor_id'], - class_id=get_class_id(response.meta['course_unit_id'], response.meta['class_name']), - last_updated=datetime.now(), - ) - else: - yield None - \ No newline at end of file + print(f"(id: {current_class_id}, name: {current_class['name']})") + + yield Slot( + id=schedule["id"], + lesson_type=schedule["typology"]["acronym"], + day=self.days[schedule["week_days"][0]], + start_time=start_time.hour + (start_time.minute / 60), + duration=(end_time - start_time).total_seconds() / 3600, + location=schedule["rooms"][0]["name"], + is_composed=len(schedule["classes"]) > 0, + professor_id=schedule["persons"][0]["sigarra_id"], + class_id=current_class_id, + last_updated=datetime.now(), + ) + + for teacher in schedule["persons"]: + (sigarra_id, name) = self.get_professor_info( + teacher) + + yield SlotProfessor( + slot_id=schedule["id"], + professor_id=sigarra_id + ) + + def get_professor_info(self, teacher): + """ + The SIGARRA API that we are using gives the name of the professors in two ways: + 1. <sigarra_id> - <name> + 2. <name> + + Option 2 generally occurs when no teacher is assigned. So, in order to retrieve the <name> + in the cases where there is a '-' in the middle, we have to check which of the two formats + the API returned for that specific class. + """ + + if re.search(self.professor_name_pattern, teacher["name"]): + [professor_sigarra_id, professor_name, + *_] = teacher["name"].split("-", 1) + + return (professor_sigarra_id.strip(), professor_name.strip()) + + return (teacher["sigarra_id"], teacher["name"])
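For readers unfamiliar with the new calendar-API payload, here is a minimal, self-contained sketch of the transformation that `extractSchedule` and `get_professor_info` perform above: parsing the event timestamps into a fractional start hour and a duration in hours, and splitting `<sigarra_id> - <name>` professor entries. The `sample_event` values, the `professor_info`/`slot_fields` helper names, and the slightly tightened regex are illustrative assumptions, not part of the spider.

```python
import re
from datetime import datetime

# Tightened variant of self.professor_name_pattern from the spider (illustrative only).
PROFESSOR_NAME_PATTERN = r"\d+\s-\s[A-Za-zÀ-ú]+(\s[A-Za-zÀ-ú]+)*"


def professor_info(person):
    """Split "<sigarra_id> - <name>" entries; fall back to the sigarra_id field otherwise."""
    if re.search(PROFESSOR_NAME_PATTERN, person["name"]):
        sigarra_id, name = person["name"].split("-", 1)
        return sigarra_id.strip(), name.strip()
    return person["sigarra_id"], person["name"]


def slot_fields(event):
    """Map one calendar-API event to the Slot fields the spider fills in."""
    date_format = "%Y-%m-%dT%H:%M:%S"
    start = datetime.strptime(event["start"], date_format)
    end = datetime.strptime(event["end"], date_format)
    return {
        "lesson_type": event["typology"]["acronym"],
        "day": event["week_days"][0],                      # mapped via self.days in the spider
        "start_time": start.hour + start.minute / 60,      # 10:30 -> 10.5
        "duration": (end - start).total_seconds() / 3600,  # in hours
        "location": event["rooms"][0]["name"],
        "professor_id": professor_info(event["persons"][0])[0],
    }


if __name__ == "__main__":
    sample_event = {  # invented values, shaped like the fields used above
        "start": "2023-02-06T10:30:00",
        "end": "2023-02-06T12:30:00",
        "typology": {"acronym": "TP"},
        "week_days": ["Segunda-feira"],
        "rooms": [{"name": "B232"}],
        "persons": [{"name": "123456 - Jane Doe", "sigarra_id": "123456", "acronym": "JD"}],
    }
    print(slot_fields(sample_event))
    # {'lesson_type': 'TP', 'day': 'Segunda-feira', 'start_time': 10.5, 'duration': 2.0,
    #  'location': 'B232', 'professor_id': '123456'}
```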
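Separately, `get_class_id` (kept in this diff) interpolates `course_unit_id` and `class_name` straight into the SQL string with `str.format`. A hedged alternative, assuming the backing store is the sqlite3 database that the Makefile dumps, would pass them as query parameters; this is a sketch against a plain `sqlite3.Connection`, not the project's `Database` wrapper.

```python
import sqlite3


def get_class_id(connection: sqlite3.Connection, course_unit_id: int, class_name: str):
    """Same lookup as the spider's helper, but with placeholders instead of str.format."""
    sql = """
        SELECT class.id
        FROM course_unit JOIN class
        ON course_unit.id = class.course_unit_id
        WHERE course_unit.id = ? AND class.name = ?
    """
    row = connection.execute(sql, (course_unit_id, class_name)).fetchone()
    return row[0] if row else None
```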