Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor slot and professors spiders in light of sigarra schedule html page changes #112

Merged
merged 6 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 6 additions & 3 deletions src/scrapper/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import scrapy


class Faculty(scrapy.Item):
acronym = scrapy.Field()
name = scrapy.Field()
Expand Down Expand Up @@ -37,18 +38,20 @@ class CourseUnit(scrapy.Item):
classes_url = scrapy.Field()
last_updated = scrapy.Field()


# Scrapy item declaring four fields: the ids of a course and of a course
# unit, plus the course unit's year and its ECTS value.
# NOTE(review): the meaning of each field is inferred from its name only;
# confirm against the database schema before relying on it.
class CourseMetadata(scrapy.Item):
course_id = scrapy.Field()
course_unit_id = scrapy.Field()
course_unit_year = scrapy.Field()
ects = scrapy.Field()


# Scrapy item for one class of a course unit.
# The slot spider yields it once per entry of a schedule's "classes" list,
# filling id from the entry's "sigarra_id", name from its "name", and
# last_updated with the time of the crawl.
class Class(scrapy.Item):
id = scrapy.Field()
course_unit_id = scrapy.Field()
name = scrapy.Field() # class name, e.g. 1MIEIC01
last_updated = scrapy.Field()



class Slot(scrapy.Item):
id = scrapy.Field()
lesson_type = scrapy.Field() # T, TP, PL, etc.
Expand All @@ -66,8 +69,8 @@ class SlotProfessor(scrapy.Item):
slot_id = scrapy.Field()
professor_id = scrapy.Field()


# Scrapy item for one professor.
# The slot spider yields it for each entry of a schedule's "persons" list:
# id and professor_name are the pair returned by get_professor_info, and
# professor_acronym is the entry's "acronym" value.
class Professor(scrapy.Item):
id = scrapy.Field()
professor_acronym = scrapy.Field()
professor_name = scrapy.Field()

91 changes: 65 additions & 26 deletions src/scrapper/spiders/slot_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@

from scrapper.settings import CONFIG, PASSWORD, USERNAME

from ..database.Database import Database
from ..database.Database import Database
from ..items import Slot, Class, SlotProfessor, Professor


def get_class_id(course_unit_id, class_name):
db = Database()
sql = """
Expand All @@ -21,26 +22,27 @@ def get_class_id(course_unit_id, class_name):
ON course_unit.id = class.course_unit_id
WHERE course_unit.id = {} AND class.name = '{}'
""".format(course_unit_id, class_name)

db.cursor.execute(sql)
class_id = db.cursor.fetchone()
db.connection.close()

if (class_id == None): # TODO: verificar casos em que a aula já esta na db mas for some reason não foi encontrada
if (class_id == None): # TODO: verificar casos em que a aula já esta na db mas for some reason não foi encontrada
# db2 = Database()
# sql = """
# SELECT course_unit.url
# FROM course_unit
# FROM course_unit
# WHERE course_unit.id = {}
# """.format(course_unit_id)

# db2.cursor.execute(sql)
# class_url = db2.cursor.fetchone()
# db2.connection.close()
# print("Class not found: ", class_url[0])
return None
return None
return class_id[0]


class SlotSpider(scrapy.Spider):
name = "slots"
allowed_domains = ['sigarra.up.pt']
Expand All @@ -53,15 +55,16 @@ def __init__(self, password=None, category=None, *args, **kwargs):
self.open_config()
self.user = CONFIG[USERNAME]
self.password = CONFIG[PASSWORD]
self.professor_name_pattern = "\d+\s-\s[A-zÀ-ú](\s[A-zÀ-ú])*"
self.inserted_teacher_ids = set()

def open_config(self):
"""
Reads and saves the configuration file.
"""
config_file = "./config.ini"
self.config = ConfigParser(interpolation=ExtendedInterpolation())
self.config.read(config_file)

self.config.read(config_file)

def format_login_url(self):
return '{}?{}'.format(self.login_page_base, urllib.parse.urlencode({
Expand Down Expand Up @@ -94,11 +97,12 @@ def check_login_response(self, response):
print(message, flush=True)
self.log(message)
else:
print('Login Failed. HTTP Error {}'.format(response.status), flush=True)
print('Login Failed. HTTP Error {}'.format(
response.status), flush=True)
self.log('Login Failed. HTTP Error {}'.format(response.status))

def classUnitRequests(self):
db = Database()
db = Database()
sql = """
SELECT course_unit.id, course_unit.schedule_url, course.faculty_id
FROM course JOIN course_metadata JOIN course_unit
Expand All @@ -108,31 +112,32 @@ def classUnitRequests(self):
db.cursor.execute(sql)
self.course_units = db.cursor.fetchall()

print("current course units: ", self.course_units)

db.connection.close()

self.log("Crawling {} class units".format(len(self.course_units)))

for course_unit in self.course_units:
course_unit_id = course_unit[0]
link_id_fragment = course_unit[1] # e.g. hor_geral.ucurr_view?pv_ocorrencia_id=514985
# e.g. hor_geral.ucurr_view?pv_ocorrencia_id=514985
link_id_fragment = course_unit[1]
faculty = course_unit[2]

yield Request(
url="https://sigarra.up.pt/{}/pt/{}".format(faculty, link_id_fragment),
yield Request(
url="https://sigarra.up.pt/{}/pt/{}".format(
faculty, link_id_fragment),
meta={'course_unit_id': course_unit_id},
callback=self.makeRequestToSigarraScheduleAPI,
errback=self.func
)

def func(self, error):
# Errback attached to the schedule-page Requests built in classUnitRequests.
# It only reports the error on stdout and returns None.
# # The scraper has no errors
# print(error)
print("An error has occured: ", error)
return

def makeRequestToSigarraScheduleAPI(self, response):
self.api_url = response.xpath('//div[@id="cal-shadow-container"]/@data-evt-source-url').extract_first()
self.api_url = response.xpath(
'//div[@id="cal-shadow-container"]/@data-evt-source-url').extract_first()

yield Request(url=self.api_url, callback=self.extractSchedule, meta={'course_unit_id': re.search(r'uc/(\d+)/', self.api_url).group(1)})

Expand All @@ -149,35 +154,69 @@ def extractSchedule(self, response):
end_time = datetime.strptime(schedule["end"], date_format)

for teacher in schedule["persons"]:
(sigarra_id, name) = self.get_professor_info(teacher)

if sigarra_id in self.inserted_teacher_ids:
continue

self.inserted_teacher_ids.add(sigarra_id)

yield Professor(
id = teacher["sigarra_id"],
professor_acronym = teacher["acronym"],
professor_name = teacher["name"] #.split("-")[1].strip()
id=sigarra_id,
professor_acronym=teacher["acronym"],
professor_name=name
)

for current_class in schedule["classes"]:
yield Class(
id=current_class["sigarra_id"],
name=current_class["name"],
course_unit_id=course_unit_id,
last_updated=datetime.now()
)

# INFO Since we need to know at runtime the id of the slot so that we can then create SlotProfessor
# instances, we are going to be using the same id as the class in order to minimize database lookups
# during the runtime of the scrapper
current_class_id = get_class_id(
course_unit_id, current_class["name"])

yield Slot(
id=current_class["sigarra_id"],
id=current_class_id,
lesson_type=schedule["typology"]["acronym"],
day=self.days[schedule["week_days"][0]],
start_time=start_time.hour + (start_time.minute / 60),
duration=(end_time - start_time).total_seconds() / 3600,
location=schedule["rooms"][0]["name"],
is_composed=len(schedule["classes"]) > 0,
professor_id=schedule["persons"][0]["sigarra_id"],
class_id=current_class["sigarra_id"],
class_id=current_class_id,
last_updated=datetime.now(),
thePeras marked this conversation as resolved.
Show resolved Hide resolved
)

for teacher in schedule["persons"]:
(sigarra_id, name) = self.get_professor_info(
teacher) # (sigarra_id | None, teacher_name)

yield SlotProfessor(
slot_id=current_class["sigarra_id"]
professor_id=teacher["sigarra_id"]
slot_id=current_class_id,
professor_id=sigarra_id
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is None a possible value? What happens in these cases?

Copy link
Member Author

@tomaspalma tomaspalma Jul 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

None was a possible value in the first version of the get_professor_info method at the time that comment was written. Now that you pointed it out, I removed that outdated comment (# (sigarra_id | None, teacher_name))

In the current version, None is not a possible value. When teacher["name"] does not follow the format <professor_id> - <professor_name>, instead of taking the professor_id from the left side of the -, it just uses the teacher["sigarra_id"] number returned by the sigarra API.

To clarify, the teacher["sigarra_id"] field is not actually the id that appears in the link to the teacher's page, which is extremely weird but not up to us, since we didn't develop that API.

Copy link
Member

@thePeras thePeras Jul 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But the professor_id is used in the link, right?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I just saw that there is also a sigarra_url, which redirects to the teacher's profile. Can we save that? I think it could be useful.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the professor_id is used in the link.

We can save that, good point!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe the URL is safest

Copy link
Member Author

@tomaspalma tomaspalma Jul 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not entirely sure I understood what you suggested

Do you mean retrieving the professor_id from the sigarra_url?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, save the sigarra_url in the database as, for example, teacher_link

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is probably best to do that in a separate issue from this PR, as it is lower priority and this PR should be merged as soon as possible.

)

def get_professor_info(self, teacher):
    """
    Return the ``(professor_id, professor_name)`` pair for a teacher entry.

    The sigarra API that we are using gives the name of a professor in one
    of two ways:
        1. <sigarra_code> - <professor_name>
        2. <name>

    Option 2 generally occurs when there is not any teacher assigned. So,
    in order to retrieve the <professor_name> when there is a '-' in the
    middle, we have to check which of the two options the API returned for
    that specific entry.

    Args:
        teacher: Mapping taken from the schedule's "persons" list. Its
            "name" and "sigarra_id" keys are read.

    Returns:
        For option 1, ``(sigarra_code, professor_name)`` as stripped
        strings. Otherwise ``(teacher["sigarra_id"], teacher["name"])``
        exactly as received.
    """
    full_name = teacher["name"]

    # The check is anchored at the start of the name on purpose: the split
    # below cuts at the FIRST hyphen, so the code must be the very first
    # thing in the string. An unanchored search would accept a name such as
    # "Ana-Maria 12 - X" and then split it at the wrong hyphen.
    if re.match(self.professor_name_pattern, full_name.strip()):
        sigarra_code, _, professor_name = full_name.partition("-")
        return (sigarra_code.strip(), professor_name.strip())

    return (teacher["sigarra_id"], full_name)