From 457f08f49ac59dd69c7552369cbed451865a2c1d Mon Sep 17 00:00:00 2001
From: Mohammad Nejati
Date: Sat, 29 Jun 2024 13:49:18 +0000
Subject: [PATCH] Extract search records for learn sections

---
 .github/workflows/index_on_algolia.yml        | 27 ++++--
 algolia_records/{ => learn}/.gitignore        |  0
 algolia_records/libraries/.gitignore          |  1 +
 config/config.yaml                            | 14 ++-
 gecko/config.py                               | 11 ++-
 gecko/extract_learn_records.py                | 96 +++++++++++++++++++
 ...ecords.py => extract_libraries_records.py} |  2 +-
 gecko/index_on_algolia.py                     | 24 +++--
 8 files changed, 157 insertions(+), 18 deletions(-)
 rename algolia_records/{ => learn}/.gitignore (100%)
 create mode 100644 algolia_records/libraries/.gitignore
 create mode 100644 gecko/extract_learn_records.py
 rename gecko/{extract_records.py => extract_libraries_records.py} (96%)

diff --git a/.github/workflows/index_on_algolia.yml b/.github/workflows/index_on_algolia.yml
index bfa6688..e048b31 100644
--- a/.github/workflows/index_on_algolia.yml
+++ b/.github/workflows/index_on_algolia.yml
@@ -1,6 +1,7 @@
 name: Index on Algolia
 
 on:
+  pull_request:
   push:
     branches: [develop, ci-*]
     paths:
@@ -26,18 +27,28 @@ jobs:
       - name: Install dependencies
         run: pip install -r requirements.txt
 
+      - name: Clone and build website-v2-docs
+        run: |
+          git clone --depth=1 --branch=master https://github.com/boostorg/website-v2-docs.git ../website-v2-docs
+          cd ../website-v2-docs
+          ./build.sh
+
       - name: Download and extract boost release archive
         run: |
-          BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
-          BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
-          wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
-          tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
+          mkdir ../boost_1_85_0
+          # BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
+          # BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
+          # wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
+          # tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
+
+      - name: Extract learn records
+        run: python -m gecko.extract_learn_records
 
-      - name: Extract records
-        run: python -m gecko.extract_records
+      # - name: Extract libraries records
+      #   run: python -m gecko.extract_libraries_records
 
-      - name: Check validity of records
-        run: python -m gecko.sanitizer check
+      # - name: Check validity of records
+      #   run: python -m gecko.sanitizer check
 
       - name: Index on Algolia
         env:
diff --git a/algolia_records/.gitignore b/algolia_records/learn/.gitignore
similarity index 100%
rename from algolia_records/.gitignore
rename to algolia_records/learn/.gitignore
diff --git a/algolia_records/libraries/.gitignore b/algolia_records/libraries/.gitignore
new file mode 100644
index 0000000..a6c57f5
--- /dev/null
+++ b/algolia_records/libraries/.gitignore
@@ -0,0 +1 @@
+*.json
diff --git a/config/config.yaml b/config/config.yaml
index a1c74d0..766e625 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,6 +1,16 @@
 boost:
-  version: "1_85_0"
-  root: "../boost_1_85_0"
+  version: '1_85_0'
+  root: '../boost_1_85_0'
+
+website-v2-docs:
+  root: '../website-v2-docs/build'
+  sections:
+    - key: 'contributor-guide'
+      name: 'Contributor Guide'
+    - key: 'formal-reviews'
+      name: 'Formal Reviews'
+    - key: 'user-guide'
+      name: 'User Guide'
 
 algolia:
   app-id: D7O1MLLTAF
diff --git a/gecko/config.py b/gecko/config.py
index 7f45dc5..d2b55f6 100644
--- a/gecko/config.py
+++ b/gecko/config.py
@@ -7,10 +7,19 @@
         'version': str,
         'root': os.path.exists
     },
+    'website-v2-docs': {
+        'root': os.path.exists,
+        'sections': [
+            {
+                'key': str,
+                'name': str
+            }
+        ]
+    },
     'algolia': {
         'app-id': str,
         'api-key': str,
-        'settings':dict
+        'settings': dict
     },
     'crawlers': [
         {
diff --git a/gecko/extract_learn_records.py b/gecko/extract_learn_records.py
new file mode 100644
index 0000000..1d1affd
--- /dev/null
+++ b/gecko/extract_learn_records.py
@@ -0,0 +1,96 @@
+import re
+import json
+from pathlib import Path
+from urllib.parse import urljoin
+from bs4 import BeautifulSoup, Tag
+
+from .crawlers.helpers import has_class
+from .config import config
+
+
+class AntoraDoc():
+    def crawl(self, doc_root: Path) -> dict:
+        sections = {}
+        doc_root = doc_root.resolve()
+
+        for file_path in doc_root.rglob('*.html'):
+            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
+                soup = BeautifulSoup(file.read(), 'html.parser')
+
+            lvls = []
+            for link in soup.select('body nav.breadcrumbs ul li a'):
+                lvls = lvls + [{'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}]
+
+            sect1 = soup.select_one('body article.doc')
+            if sect1:
+                self._extract_section_n(str(file_path), sections, sect1, lvls)
+
+        return sections
+
+    def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = []):
+        header = sect.select_one('h1, h2, h3, h4, h5, h6')
+
+        if header.name == 'h1':
+            path = file_path
+        else:
+            title = header.text
+            path = file_path + '#' + header.get('id')
+            lvls = lvls + [{'title': title, 'path': path}]
+
+        if header.find_next_sibling() and has_class(header.find_next_sibling(), 'sectionbody'):
+            siblings = header.find_next_sibling().find_all(recursive=False)
+        else:
+            siblings = header.next_siblings
+
+        content = ''
+        for sibling in siblings:
+            if isinstance(sibling, Tag) and sibling.has_attr('class') and len([i for i in sibling.get('class') if i.startswith('sect')]) > 0:
+                self._extract_section_n(file_path, sections, sibling, lvls)
+                continue
+            content += sibling.get_text() + ' '
+
+        sections[path] = {'content': content, 'lvls': lvls}
+
+
+def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
+    doc_root = doc_root.resolve()
+    records = []
+
+    for _, section in sections.items():
+        for lvl in section['lvls']:
+            lvl['path'] = lvl['path'].replace(str(doc_root) + '/', '')
+
+        records.append({
+            'type': 'content',
+            'section_key': section_key,
+            'section_name': section_name,
+            'content': re.sub(r'\s+', ' ', section['content']).strip(),
+            'weight': {
+                'pageRank': 0,
+                'level': 100 - len(section['lvls']) * 10,
+                'position': 0
+            },
+            'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
+            'hierarchy': {
+                'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
+                'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
+                'lvl2': section['lvls'][2] if len(section['lvls']) > 2 else None,
+                'lvl3': section['lvls'][3] if len(section['lvls']) > 3 else None,
+                'lvl4': section['lvls'][4] if len(section['lvls']) > 4 else None,
+                'lvl5': section['lvls'][5] if len(section['lvls']) > 5 else None,
+                'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
+            }})
+
+    with open('./algolia_records/learn/' + section_key + '.json', 'w', encoding='utf-8') as outfile:
+        json.dump(records, outfile, indent=4)
+
+
+if __name__ == "__main__":
+    crawler = AntoraDoc()
+
+    for section in config['website-v2-docs']['sections']:
+        sections = crawler.crawl(Path(config['website-v2-docs']['root']) / section['key'])
+        create_algolia_records(section['key'],
+                               section['name'],
+                               Path(config['website-v2-docs']['root']),
+                               sections)
diff --git a/gecko/extract_records.py b/gecko/extract_libraries_records.py
similarity index 96%
rename from gecko/extract_records.py
rename to gecko/extract_libraries_records.py
index 6aafb20..51a5a57 100644
--- a/gecko/extract_records.py
+++ b/gecko/extract_libraries_records.py
@@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
             'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
         }})
 
-    with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
+    with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
         json.dump(records, outfile, indent=4)
 
 
diff --git a/gecko/index_on_algolia.py b/gecko/index_on_algolia.py
index 6696bb6..7438ebe 100644
--- a/gecko/index_on_algolia.py
+++ b/gecko/index_on_algolia.py
@@ -9,19 +9,19 @@
 client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])
 
 print('Initializing {} index ...'.format(config['boost']['version']))
-index = client.init_index(config['boost']['version'])
+libraries_index = client.init_index(config['boost']['version'])
 
 print('Setting settings for {} index ...'.format(config['boost']['version']))
-index.set_settings(config['algolia']['settings'])
+libraries_index.set_settings(config['algolia']['settings'])
 
-for path in Path('./algolia_records').glob('*.json'):
+for path in Path('./algolia_records/libraries').glob('*.json'):
     print('uploading records for {}...'.format(path.stem))
 
     with open(path, 'r', encoding='utf-8') as f:
         records = json.load(f)
 
     # Delete the existing records for this library.
-    index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
+    libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
 
     # Split long documents into smaller parts.
     for record in records:
@@ -34,5 +34,17 @@
     records = [record for record in records if not (
         record['content'] == '' and not record['hierarchy']['lvl0'])]
 
-    # TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
-    index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+    libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+
+learn_index = client.init_index('learn')
+
+for path in Path('./algolia_records/learn').glob('*.json'):
+    print('uploading records for {}...'.format(path.stem))
+
+    with open(path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+
+    # Delete the existing records for this section.
+    learn_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})
+
+    learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})