Commit e2fb73e

master: Code cleanup

raymelon committed Feb 19, 2023
1 parent aff0194

Showing 2 changed files with 100 additions and 113 deletions.
collect_tagalog.py (203 changes: 95 additions & 108 deletions)

@@ -63,131 +63,118 @@
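The hunk below starts at line 63 of collect_tagalog.py, so the names it relies on (session, letters, letter_index, page_index, worker_pool, all_words, MoverException) are defined earlier in the file and not shown in this diff. A minimal sketch of that setup, assuming requests-futures for the async session; the names and values here are inferred from usage, not taken from the actual file:

import re
from string import ascii_lowercase

from bs4 import BeautifulSoup
from requests_futures.sessions import FuturesSession

# custom exception used below to skip to the next letter/page (assumed definition)
class MoverException(Exception):
    pass

session = FuturesSession()       # session.get() returns a future; .result() blocks for the response
letters = list(ascii_lowercase)  # indices 0-25; the site's Ñ and NG lists are skipped
letter_index = 0
page_index = 1
worker_pool = []                 # (url, future) pairs, checked after the fetch loop
all_words = []                   # accumulated words, written out at the end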
# page fetching
while True:

    try:

        # once the letter_index exceeds 25 (Ñ and NG excluded), the alphabet traversal is done
        if letter_index > 25:
            raise Exception('All valid URLs traversed.')

        url = 'http://tagalog.pinoydictionary.com/list/' + letters[letter_index] + '/' + str(page_index) + '/'

        print('Fetching from', url)  # prints url for output display

        # tries opening the page
        req = session.get(url)

        if page_index == 1:
            res = req.result()

            if res.status_code == 200:
                print('Success. Response is', res.status_code)
                html = res.content

                # find the last page for the current letter
                raw = BeautifulSoup(html, 'html.parser')

                # examples:
                # expected raw.find()['href'] return: http://tagalog.pinoydictionary.com/list/a/88/
                # expected last_page_number: 88
                last_page_element = raw.find('a', title='Last Page')

                if last_page_element:
                    # split the href on '/', drop the empty segments, and keep the trailing page number
                    last_page_number = int(list(filter(str.strip, last_page_element['href'].split('/')))[-1])
                else:
                    last_page_number = 1

            else:
                raise MoverException('Failed. Response is', res.status_code)

        elif page_index < 1:
            raise MoverException('Failed. Invalid page_index value:', page_index)

        else:
            if page_index > last_page_number:
                raise MoverException('Last index reached for letter_index:', letter_index)

        # check response later
        worker_pool.append((url, req))

    except MoverException as me:

        print(me)

        # when the page doesn't exist, move on to the next letter
        letter_index += 1

        # a new letter starts over at page 1
        page_index = 1
        continue

    except Exception as e:

        print(e)
        break

    # go to next page index
    page_index += 1

print('Checking', len(worker_pool), 'page workers...')

for worker in worker_pool:

    try:

        (url, req) = worker

        res = req.result()

        if res.status_code == 200:
            print('Check completion of page worker for request at', url, '- Success. Response is', res.status_code, '- Extracting words...')
        else:
            raise MoverException('Check completion of page worker for request at', url, ': Failed. Response is', res.status_code)

        html = res.content

        # parses the page opened
        raw = BeautifulSoup(html, 'html.parser')

        # each word is enclosed in <h2> tags,
        # therefore it is the only tag we need
        words = raw.findAll('h2', class_='word-entry')

        for word in words:
            # only gets words shorter than 15 characters and...
            if len(word.next.next) < 15:
                # ...containing alphabet characters only
                all_words.append(re.compile('[^a-zA-Z]').sub('', word.next.next))

    except MoverException as me:

        print(me)
        continue

    except Exception as e:

        print(e)
        break


with open('tagalog_dict.txt', 'w') as f:
    # writing to file
    for word in all_words:
        print(word, file=f)


print('Writing finished. See the extracted words at "tagalog_dict.txt"')
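For reference, the word.next.next navigation above assumes each entry nests an anchor inside the h2, with the word as the anchor's text. A self-contained sketch with hypothetical markup (the live site's structure may differ):

from bs4 import BeautifulSoup

# hypothetical entry markup, inferred from the .next.next navigation
sample = '<h2 class="word-entry"><a href="/word/abaka">abaka</a></h2>'
word = BeautifulSoup(sample, 'html.parser').find('h2', class_='word-entry')

print(word.next)       # the <a> tag, the h2's next element in parse order
print(word.next.next)  # the text node 'abaka' that gets length-checked and cleaned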
filter_dict.py (10 changes: 5 additions & 5 deletions)

@@ -1,11 +1,11 @@
with open('tagalog_dict.txt', 'r') as dic:
    words = dic.read().split('\n')

nodup = set()
nodup_add = nodup.add
nodup = [w.lower() for w in words if not (w.lower() in nodup or nodup_add(w.lower()))]

with open('tagalog_dict.txt', 'w') as dic:
    for n in nodup:
        print(n, file=dic)
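The comprehension above is the classic order-preserving dedup idiom: set.add returns None, so nodup_add(w.lower()) both records the word and evaluates falsy, letting the not (... or ...) test pass exactly once per word. An equivalent, more explicit sketch of the same logic:

seen = set()
deduped = []
for w in words:
    lw = w.lower()
    if lw not in seen:  # first occurrence wins, so original order is kept
        seen.add(lw)
        deduped.append(lw)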
