Skip to content

Commit b0021dc

Browse files
committed Apr 10, 2023
fix: don't get multiple tables
1 parent 3defdd9 commit b0021dc

File tree

1 file changed

+9
-4
lines changed

1 file changed

+9
-4
lines changed
 

‎rufsc/webscraping.py

+9 -4
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import datetime as dt
2+
import urllib.parse
23
from typing import Optional
34

45
import pandas as pd
@@ -49,7 +50,9 @@ def get_menu() -> Optional[dict]:
4950
.replace("\r", " ")
5051
.capitalize()
5152
)
52-
menu[get_menu_header(i)].append(food)
53+
menu_list = menu.get(get_menu_header(i))
54+
if menu_list is not None:
55+
menu_list.append(food)
5356
return menu
5457
else:
5558
# No menu for today. Maybe they didn't add it yet.
@@ -66,9 +69,9 @@ def get_pdf_link(url: str, today: dt.date) -> Optional[str]:
6669
pdfs = [
6770
link.get("href")
6871
for link in p
69-
if _number2month[today.month] in link.get("href").lower()
72+
if _number2month[today.month] in urllib.parse.unquote(link.get("href")).lower()
7073
]
71-
pdf_link = pdfs[0] if pdfs else None
74+
pdf_link = pdfs[-1] if pdfs else None
7275

7376
return pdf_link
7477

@@ -79,7 +82,9 @@ def get_pdf_table(pdf_link: str) -> pd.DataFrame:
7982
"names": list(range(1, 8)), # ISO weekday numbers (1=Monday, 2=Tuesday, etc.)
8083
}
8184
pdf_table = (
82-
tb.read_pdf(pdf_link, pandas_options=pandas_options, lattice=False)[0]
85+
tb.read_pdf(
86+
pdf_link, pandas_options=pandas_options, lattice=True, multiple_tables=False
87+
)[0]
8388
.drop([0, 1])[dt.date.today().isoweekday()] # Filter by the day of the week
8489
.fillna("Não foi possível obter informações")
8590
.reset_index(drop=True)

0 commit comments

Comments
 (0)