diff --git a/readme_modifications.md b/readme_modifications.md
new file mode 100644
index 0000000..e4b79f3
--- /dev/null
+++ b/readme_modifications.md
@@ -0,0 +1,108 @@
+### Some notes on the modifications made by bjkeefe to `core.py`
+
+#### Motivation
+
+I had been happily using WiktionaryParser for several months. One day, I was developing
+an application where I wanted to be able to distinguish between two cases: (1) where
+Wiktionary does not have any English definitions for a given word, and (2) where
+Wiktionary does not have any entry at all.
+
+I made a few modifications to `core.py` to support this. The returned value remains a
+`list` containing a `dict` in all cases. If the `word` and `language` passed to
+`.fetch()` yield a Wiktionary entry, the results are the same as before.
+
+#### So, what's new?
+
+If there is a Wiktionary page for a given `word` but no entry for the given `language`,
+the returned value is now no longer an empty `list`, but a `list` containing a `dict`
+whose only key is `"additional_info"`, whose value is a `str` of the form
+`"no <language> entry for <word>"`.
+
+If there is no Wiktionary page at all, the shape is the same as above, except the value
+in the `dict` becomes `"Wiktionary does not yet have an entry for <word>"`.
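+
+For illustration, calling code could distinguish the two cases like this (a sketch, not
+part of the library):
+
+```
+parser = WiktionaryParser()
+result = parser.fetch("aimiable")
+info = result[0].get("additional_info")
+if info is None:
+    pass  # normal result: "definitions", "etymology", etc. are present
+elif info.startswith("Wiktionary does not yet have an entry"):
+    pass  # case 2: no Wiktionary page at all
+else:
+    pass  # case 1: a page exists, but has no entry for the requested language
+```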
+
+#### Source code differences
+
+Here is the diff output (ignoring whitespace) between the new version and the original:
+
+```
+$ diff -w core.py core.py.abo
+119c119
+<                 return [{"additional_info": f"no {language} entry for {self.current_word}"}]
+---
+>                 return []
+126c126
+<                 return [{"additional_info": f"no {language} entry for {self.current_word}"}]
+---
+>                 return []
+285,288d284
+<         search_string = "Wiktionary does not yet have an entry for " + word
+<         result = self.soup.find_all(string=re.compile(search_string))
+<         if result:
+<             return [{"additional_info": search_string}]
+```
+
+#### Testing
+
+The new version of `core.py` passes all tests in `tests/test_core.py`.
+
+Because I didn't have time to modify the existing tests, I wrote some quick tests that
+explicitly exercise the modifications I made. These are in `tests/test_core_new.py`. This
+file expects to be run with `pytest`, because I am less familiar with `unittest`. All of
+the tests pass when run against the new version of `core.py`.
+
+NB: the new tests will NOT all pass if run against the old version of `core.py`.
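+
+If these quick tests ever get folded into the main suite, a parametrized version would be
+more compact. A sketch of the idea (English lookups only; same expectations as the tests
+below, just reorganized):
+
+```
+import pytest
+
+from wiktionaryparser import WiktionaryParser
+
+@pytest.mark.parametrize("word, has_definitions", [
+    ("receive", True),      # ordinary entry
+    ("recieve", True),      # entry exists for a common misspelling
+    ("abilitanti", False),  # page exists, but no English entry
+    ("aimiable", False),    # no page at all
+])
+def test_english_lookup(word, has_definitions):
+    result = WiktionaryParser().fetch(word)
+    assert ("definitions" in result[0]) == has_definitions
+```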
+
+Also, I wrote a little script called `driver.py`. This is intended for interactive
+testing.
+
+```
+$ py driver.py -h
+usage: driver.py [-h] [-m] word
+
+Check against Wiktionary using WiktionaryParser
+
+positional arguments:
+  word                  the word to look up
+
+options:
+  -h, --help            show this help message and exit
+  -m, --multiple-languages
+                        if present, look up for several languages; otherwise, just English
+```
+
+#### Organization
+
+All of the above -- the modifications and new files -- are in a new `git` branch named
+`additional_info`.
+
+#### Minor problem with backwards compatibility
+
+If someone has written some code that checks the result returned by `.fetch()` like this ...
+
+```
+result = parser.fetch(word)
+if not result:  # --or-- if len(result) == 0:
+    do_something()
+```
+
+... this will no longer work, because the returned `list` is never empty now. It could be
+changed to, for example:
+
+```
+result = parser.fetch(word)
+if "definitions" not in result[0]:
+    do_something()
+```
+
+[added 2023-06-22 08:53] It occurs to me that there might be a way around this problem:
+change the call signature of `.fetch()` by adding the keyword arg `allow_messages=False`.
+Calls to `.fetch()` in existing code would, of course, not have this arg, and since the
+default would be not to allow the return of "messages", a not-found condition would
+return an empty list, as before. However, if a call in new code were `.fetch("word",
+allow_messages=True)`, then a not-found condition would result in what I was after:
+additional info about the not-found result.
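+
+A rough sketch of what that could look like (not implemented; `not_found` and `message`
+are placeholders for the real checks in `fetch()`):
+
+```
+def fetch(self, word, language=None, old_id=None, allow_messages=False):
+    ...  # existing lookup logic
+    if not_found:  # placeholder for the two not-found checks described above
+        if allow_messages:
+            return [{"additional_info": message}]  # the new behavior, opt-in
+        return []  # old behavior: existing callers still see an empty list
+```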
+
+Let me know if you want me to implement that.
+
+#### Questions, comments, criticisms
+
+Please feel free to email me: bjkeefe@gmail.com. Thanks for reading!
diff --git a/tests/test_core_new.py b/tests/test_core_new.py
new file mode 100644
index 0000000..045a4ae
--- /dev/null
+++ b/tests/test_core_new.py
@@ -0,0 +1,103 @@
+"""A few quick tests of the modifications made by bjkeefe to core.py.
+
+These tests will NOT all succeed if run against the master branch of WiktionaryParser,
+at least as of 2023-06-17.
+"""
+try:
+    import pytest
+except ModuleNotFoundError:
+    print("test_core_new.py: these tests require pytest to be importable, so this won't work:")
+    print("    $ py test_core_new.py")
+    print()
+    print("However, pytest usually comes along for the ride when installing Python from")
+    print("python.org, so this should work:")
+    print("    $ pytest test_core_new.py")
+    raise SystemExit()
+
+from wiktionaryparser import WiktionaryParser
+
+
+def test_core_new_default_language():
+    parser = WiktionaryParser()
+
+    # A word that has several English definitions
+    result = parser.fetch("receive")
+    assert type(result) == list
+    assert len(result) == 1
+    assert type(result[0]) == dict
+    assert "etymology" in result[0]
+    assert "pronunciations" in result[0]
+    assert "definitions" in result[0]
+    assert len(result[0]["definitions"]) > 0
+    assert "additional_info" not in result[0]
+
+    # A word that has a Wiktionary entry, because it is a common misspelling
+    result = parser.fetch("recieve")
+    assert type(result) == list
+    assert len(result) == 1
+    assert type(result[0]) == dict
+    assert "etymology" in result[0]
+    assert "pronunciations" in result[0]
+    assert "definitions" in result[0]
+    assert len(result[0]["definitions"]) > 0
+    assert "additional_info" not in result[0]
+
+    # Two words that have a Wiktionary entry, but no English definitions
+    for word in ["abilitanti", "aimai"]:
+        result = parser.fetch(word)
+        assert type(result) == list
+        assert len(result) == 1
+        assert type(result[0]) == dict
+        assert "etymology" not in result[0]
+        assert "pronunciations" not in result[0]
+        assert "definitions" not in result[0]
+        assert "additional_info" in result[0]
+        assert result[0]["additional_info"] == f"no english entry for {word}"
+
+    # A "word" that has no Wiktionary entry
+    result = parser.fetch("aimiable")
+    assert type(result) == list
+    assert len(result) == 1
+    assert type(result[0]) == dict
+    assert "etymology" not in result[0]
+    assert "pronunciations" not in result[0]
+    assert "definitions" not in result[0]
+    assert "additional_info" in result[0]
+    assert result[0]["additional_info"] == "Wiktionary does not yet have an entry for aimiable"
+
+
+def test_core_new_non_english_languages():
+    words = ["receive", "recieve", "abilitanti", "aimai", "aimiable"]
+    languages = ["italian", "french", "japanese"]
+
+    parser = WiktionaryParser()
+    for word in words:
+        for language in languages:
+            parser.set_default_language(language)
+            result = parser.fetch(word)
+            if language == "italian":
+                if word == "abilitanti":
+                    assert "definitions" in result[0]
+                    assert "additional_info" not in result[0]
+                else:
+                    assert "definitions" not in result[0]
+                    assert "additional_info" in result[0]
+                    if word != "aimiable":
+                        assert result[0]["additional_info"] == f"no {language} entry for {word}"
+                    else:
+                        expected = f"Wiktionary does not yet have an entry for {word}"
+                        assert result[0]["additional_info"] == expected
+
+            elif language == "french" or language == "japanese":
+                if word == "aimai":
+                    assert "definitions" in result[0]
+                    assert "additional_info" not in result[0]
+                else:
+                    assert "definitions" not in result[0]
+                    assert "additional_info" in result[0]
+                    if word != "aimiable":
+                        assert result[0]["additional_info"] == f"no {language} entry for {word}"
+                    else:
+                        expected = f"Wiktionary does not yet have an entry for {word}"
+                        assert result[0]["additional_info"] == expected
diff --git a/wiktionaryparser/core.py b/wiktionaryparser/core.py
index 49f6617..65c3f48 100644
--- a/wiktionaryparser/core.py
+++ b/wiktionaryparser/core.py
@@ -116,14 +116,14 @@ def get_word_data(self, language):
                 start_index = content.find_previous().text + '.'
         if not start_index:
             if contents:
-                return []
+                return [{"additional_info": f"no {language} entry for {self.current_word}"}]
             language_heading = self.soup.find_all(
                 "span",
                 {"class": "mw-headline"},
                 string=lambda s: s.lower() == language
             )
             if not language_heading:
-                return []
+                return [{"additional_info": f"no {language} entry for {self.current_word}"}]
         for content in contents:
             index = content.find_previous().text
             content_text = self.remove_digits(content.text.lower())
@@ -282,4 +282,8 @@ def fetch(self, word, language=None, old_id=None):
         self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
         self.current_word = word
         self.clean_html()
+        search_string = "Wiktionary does not yet have an entry for " + word
+        result = self.soup.find_all(string=re.compile(search_string))
+        if result:
+            return [{"additional_info": search_string}]
         return self.get_word_data(language.lower())
diff --git a/wiktionaryparser/core.py.abo b/wiktionaryparser/core.py.abo
new file mode 100644
index 0000000..49f6617
--- /dev/null
+++ b/wiktionaryparser/core.py.abo
@@ -0,0 +1,285 @@
+import re, requests
+from wiktionaryparser.utils import WordData, Definition, RelatedWord
+from bs4 import BeautifulSoup
+from itertools import zip_longest
+from copy import copy
+from string import digits
+
+PARTS_OF_SPEECH = [
+    "noun", "verb", "adjective", "adverb", "determiner",
+    "article", "preposition", "conjunction", "proper noun",
+    "letter", "character", "phrase", "proverb", "idiom",
+    "symbol", "syllable", "numeral", "initialism", "interjection",
+    "definitions", "pronoun", "particle", "predicative", "participle",
+    "suffix",
+]
+
+RELATIONS = [
+    "synonyms", "antonyms", "hypernyms", "hyponyms",
+    "meronyms", "holonyms", "troponyms", "related terms",
+    "coordinate terms",
+]
+
+def is_subheading(child, parent):
+    child_headings = child.split(".")
+    parent_headings = parent.split(".")
+    if len(child_headings) <= len(parent_headings):
+        return False
+    for child_heading, parent_heading in zip(child_headings, parent_headings):
+        if child_heading != parent_heading:
+            return False
+    return True
+
+class WiktionaryParser(object):
+    def __init__(self):
+        self.url = "https://en.wiktionary.org/wiki/{}?printable=yes"
+        self.soup = None
+        self.session = requests.Session()
+        self.session.mount("http://", requests.adapters.HTTPAdapter(max_retries = 2))
+        self.session.mount("https://", requests.adapters.HTTPAdapter(max_retries = 2))
+        self.language = 'english'
+        self.current_word = None
+        self.PARTS_OF_SPEECH = copy(PARTS_OF_SPEECH)
+        self.RELATIONS = copy(RELATIONS)
+        self.INCLUDED_ITEMS = self.RELATIONS + self.PARTS_OF_SPEECH + ['etymology', 'pronunciation']
+
+    def include_part_of_speech(self, part_of_speech):
+        part_of_speech = part_of_speech.lower()
+        if part_of_speech not in self.PARTS_OF_SPEECH:
+            self.PARTS_OF_SPEECH.append(part_of_speech)
+            self.INCLUDED_ITEMS.append(part_of_speech)
+
+    def exclude_part_of_speech(self, part_of_speech):
+        part_of_speech = part_of_speech.lower()
+        self.PARTS_OF_SPEECH.remove(part_of_speech)
+        self.INCLUDED_ITEMS.remove(part_of_speech)
+
+    def include_relation(self, relation):
+        relation = relation.lower()
+        if relation not in self.RELATIONS:
+            self.RELATIONS.append(relation)
+            self.INCLUDED_ITEMS.append(relation)
+
+    def exclude_relation(self, relation):
+        relation = relation.lower()
+        self.RELATIONS.remove(relation)
+        self.INCLUDED_ITEMS.remove(relation)
+
+    def set_default_language(self, language=None):
+        if language is not None:
+            self.language = language.lower()
+
+    def get_default_language(self):
+        return self.language
+
+    def clean_html(self):
+        unwanted_classes = ['sister-wikipedia', 'thumb', 'reference', 'cited-source']
+        for tag in self.soup.find_all(True, {'class': unwanted_classes}):
+            tag.extract()
+
+    def remove_digits(self, string):
+        return string.translate(str.maketrans('', '', digits)).strip()
+
+    def count_digits(self, string):
+        return len(list(filter(str.isdigit, string)))
+
+    def get_id_list(self, contents, content_type):
+        if content_type == 'etymologies':
+            checklist = ['etymology']
+        elif content_type == 'pronunciation':
+            checklist = ['pronunciation']
+        elif content_type == 'definitions':
+            checklist = self.PARTS_OF_SPEECH
+            if self.language == 'chinese':
+                checklist += self.current_word
+        elif content_type == 'related':
+            checklist = self.RELATIONS
+        else:
+            return None
+        id_list = []
+        if len(contents) == 0:
+            return [('1', x.title(), x) for x in checklist if self.soup.find('span', {'id': x.title()})]
+        for content_tag in contents:
+            content_index = content_tag.find_previous().text
+            text_to_check = self.remove_digits(content_tag.text).strip().lower()
+            if text_to_check in checklist:
+                content_id = content_tag.parent['href'].replace('#', '')
+                id_list.append((content_index, content_id, text_to_check))
+        return id_list
+
+    def get_word_data(self, language):
+        contents = self.soup.find_all('span', {'class': 'toctext'})
+        word_contents = []
+        start_index = None
+        for content in contents:
+            if content.text.lower() == language:
+                start_index = content.find_previous().text + '.'
+        if not start_index:
+            if contents:
+                return []
+            language_heading = self.soup.find_all(
+                "span",
+                {"class": "mw-headline"},
+                string=lambda s: s.lower() == language
+            )
+            if not language_heading:
+                return []
+        for content in contents:
+            index = content.find_previous().text
+            content_text = self.remove_digits(content.text.lower())
+            if index.startswith(start_index) and content_text in self.INCLUDED_ITEMS:
+                word_contents.append(content)
+        word_data = {
+            'examples': self.parse_examples(word_contents),
+            'definitions': self.parse_definitions(word_contents),
+            'etymologies': self.parse_etymologies(word_contents),
+            'related': self.parse_related_words(word_contents),
+            'pronunciations': self.parse_pronunciations(word_contents),
+        }
+        json_obj_list = self.map_to_object(word_data)
+        return json_obj_list
+
+    def parse_pronunciations(self, word_contents):
+        pronunciation_id_list = self.get_id_list(word_contents, 'pronunciation')
+        pronunciation_list = []
+        audio_links = []
+        pronunciation_div_classes = ['mw-collapsible', 'vsSwitcher']
+        for pronunciation_index, pronunciation_id, _ in pronunciation_id_list:
+            pronunciation_text = []
+            span_tag = self.soup.find_all('span', {'id': pronunciation_id})[0]
+            list_tag = span_tag.parent
+            while list_tag.name != 'ul':
+                list_tag = list_tag.find_next_sibling()
+                if list_tag.name == 'p':
+                    pronunciation_text.append(list_tag.text)
+                    break
+                if list_tag.name == 'div' and any(_ in pronunciation_div_classes for _ in list_tag['class']):
+                    break
+            for super_tag in list_tag.find_all('sup'):
+                super_tag.clear()
+            for list_element in list_tag.find_all('li'):
+                for audio_tag in list_element.find_all('div', {'class': 'mediaContainer'}):
+                    audio_links.append(audio_tag.find('source')['src'])
+                    audio_tag.extract()
+                for nested_list_element in list_element.find_all('ul'):
+                    nested_list_element.extract()
+                if list_element.text and not list_element.find('table', {'class': 'audiotable'}):
+                    pronunciation_text.append(list_element.text.strip())
+            pronunciation_list.append((pronunciation_index, pronunciation_text, audio_links))
+        return pronunciation_list
+
+    def parse_definitions(self, word_contents):
+        definition_id_list = self.get_id_list(word_contents, 'definitions')
+        definition_list = []
+        definition_tag = None
+        for def_index, def_id, def_type in definition_id_list:
+            definition_text = []
+            span_tag = self.soup.find_all('span', {'id': def_id})[0]
+            table = span_tag.parent.find_next_sibling()
+            while table and table.name not in ['h3', 'h4', 'h5']:
+                definition_tag = table
+                table = table.find_next_sibling()
+                if definition_tag.name == 'p':
+                    if definition_tag.text.strip():
+                        definition_text.append(definition_tag.text.strip())
+                if definition_tag.name in ['ol', 'ul']:
+                    for element in definition_tag.find_all('li', recursive=False):
+                        if element.text:
+                            definition_text.append(element.text.strip())
+            if def_type == 'definitions':
+                def_type = ''
+            definition_list.append((def_index, definition_text, def_type))
+        return definition_list
+
+    def parse_examples(self, word_contents):
+        definition_id_list = self.get_id_list(word_contents, 'definitions')
+        example_list = []
+        for def_index, def_id, def_type in definition_id_list:
+            span_tag = self.soup.find_all('span', {'id': def_id})[0]
+            table = span_tag.parent
+            while table.name != 'ol':
+                table = table.find_next_sibling()
+            examples = []
+            while table and table.name == 'ol':
+                for element in table.find_all('dd'):
+                    example_text = re.sub(r'\([^)]*\)', '', element.text.strip())
+                    if example_text:
+                        examples.append(example_text)
+                    element.clear()
+                example_list.append((def_index, examples, def_type))
+                for quot_list in table.find_all(['ul', 'ol']):
+                    quot_list.clear()
+                table = table.find_next_sibling()
+        return example_list
+
+    def parse_etymologies(self, word_contents):
+        etymology_id_list = self.get_id_list(word_contents, 'etymologies')
+        etymology_list = []
+        etymology_tag = None
+        for etymology_index, etymology_id, _ in etymology_id_list:
+            etymology_text = ''
+            span_tag = self.soup.find_all('span', {'id': etymology_id})[0]
+            next_tag = span_tag.parent.find_next_sibling()
+            while next_tag and next_tag.name not in ['h3', 'h4', 'div', 'h5']:
+                etymology_tag = next_tag
+                next_tag = next_tag.find_next_sibling()
+                if etymology_tag.name == 'p':
+                    etymology_text += etymology_tag.text
+                else:
+                    for list_tag in etymology_tag.find_all('li'):
+                        etymology_text += list_tag.text + '\n'
+            etymology_list.append((etymology_index, etymology_text))
+        return etymology_list
+
+    def parse_related_words(self, word_contents):
+        relation_id_list = self.get_id_list(word_contents, 'related')
+        related_words_list = []
+        for related_index, related_id, relation_type in relation_id_list:
+            words = []
+            span_tag = self.soup.find_all('span', {'id': related_id})[0]
+            parent_tag = span_tag.parent
+            while parent_tag and not parent_tag.find_all('li'):
+                parent_tag = parent_tag.find_next_sibling()
+            if parent_tag:
+                for list_tag in parent_tag.find_all('li'):
+                    words.append(list_tag.text)
+            related_words_list.append((related_index, words, relation_type))
+        return related_words_list
+
+    def map_to_object(self, word_data):
+        json_obj_list = []
+        if not word_data['etymologies']:
+            word_data['etymologies'] = [('', '')]
+        for (current_etymology, next_etymology) in zip_longest(word_data['etymologies'], word_data['etymologies'][1:], fillvalue=('999', '')):
+            data_obj = WordData()
+            data_obj.etymology = current_etymology[1]
+            for pronunciation_index, text, audio_links in word_data['pronunciations']:
+                if (self.count_digits(current_etymology[0]) == self.count_digits(pronunciation_index)) or (current_etymology[0] <= pronunciation_index < next_etymology[0]):
+                    data_obj.pronunciations = text
+                    data_obj.audio_links = audio_links
+            for definition_index, definition_text, definition_type in word_data['definitions']:
+                current_etymology_str = ".".join(f"{int(num):02d}" for num in current_etymology[0].split(".") if num)
+                definition_index_str = ".".join(f"{int(num):02d}" for num in definition_index.split(".") if num)
+                next_etymology_str = ".".join(f"{int(num):02d}" for num in next_etymology[0].split(".") if num)
+                if current_etymology_str <= definition_index_str < next_etymology_str \
+                        or is_subheading(current_etymology[0], definition_index):
+                    def_obj = Definition()
+                    def_obj.text = definition_text
+                    def_obj.part_of_speech = definition_type
+                    for example_index, examples, _ in word_data['examples']:
+                        if example_index.startswith(definition_index):
+                            def_obj.example_uses = examples
+                    for related_word_index, related_words, relation_type in word_data['related']:
+                        if related_word_index.startswith(definition_index):
+                            def_obj.related_words.append(RelatedWord(relation_type, related_words))
+                    data_obj.definition_list.append(def_obj)
+            json_obj_list.append(data_obj.to_json())
+        return json_obj_list
+
+    def fetch(self, word, language=None, old_id=None):
+        language = self.language if not language else language
+        response = self.session.get(self.url.format(word), params={'oldid': old_id})
+        self.soup = BeautifulSoup(response.text.replace('>\n<', '><'), 'html.parser')
+        self.current_word = word
+        self.clean_html()
+        return self.get_word_data(language.lower())
diff --git a/wiktionaryparser/driver.py b/wiktionaryparser/driver.py
new file mode 100644
index 0000000..a3adc8f
--- /dev/null
+++ b/wiktionaryparser/driver.py
@@ -0,0 +1,47 @@
+import argparse
+
+from wiktionaryparser import WiktionaryParser
+
+
+def wiktionary_lookup(word, language="english"):
+    parser = WiktionaryParser()
+    parser.set_default_language(language)
+    result = parser.fetch(word)
+    if len(result) == 0:
+        return ["*** WiktionaryParser didn't find anything"]
+    if "additional_info" in result[0]:
+        return [result[0]["additional_info"]]
+    if "definitions" in result[0] and len(result[0]["definitions"]) > 0:
+        return [definition for definition in result[0]["definitions"][0]["text"]]
+    return ["** WiktionaryParser didn't find any definitions"]
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Check against Wiktionary using WiktionaryParser")
+    parser.add_argument("-m", "--multiple-languages", action="store_true",
+                        help="if present, look up for several languages; otherwise, just English")
+    parser.add_argument("word", help="the word to look up")
+    args = parser.parse_args()
+
+    if args.multiple_languages:
+        languages = ["english", "french", "italian", "japanese"]
+    else:
+        languages = ["english"]
+
+    for language in languages:
+        if args.multiple_languages:
+            print("\n----------------------------------------")
+            print(f"Trying {args.word} for {language = }")
+        definitions = wiktionary_lookup(args.word, language)
+
+        if len(definitions) > 0:
+            for elem in definitions:
+                print("-- ", elem[:80])
+        if args.multiple_languages:
+            print("----------------------------------------")