Commit 457f08f

Extract search records for learn sections

ashtum committed Jun 29, 2024
1 parent 3c1ee1a commit 457f08f

Showing 8 changed files with 157 additions and 18 deletions.
27 changes: 19 additions & 8 deletions .github/workflows/index_on_algolia.yml

@@ -1,6 +1,7 @@
 name: Index on Algolia
 
 on:
+  pull_request:
   push:
     branches: [develop, ci-*]
     paths:
@@ -26,18 +27,28 @@ jobs:
       - name: Install dependencies
         run: pip install -r requirements.txt
 
+      - name: Clone and build website-v2-docs
+        run: |
+          git clone --depth=1 --branch=master https://github.com/boostorg/website-v2-docs.git ../website-v2-docs
+          cd ../website-v2-docs
+          ./build.sh
+
       - name: Download and extract boost release archive
         run: |
-          BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
-          BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
-          wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
-          tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
+          mkdir ../boost_1_85_0
+          # BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
+          # BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
+          # wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
+          # tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
 
-      - name: Extract records
-        run: python -m gecko.extract_records
+      - name: Extract learn records
+        run: python -m gecko.extract_learn_records
+
+      # - name: Extract libraries records
+      #   run: python -m gecko.extract_libraries_records
 
-      - name: Check validity of records
-        run: python -m gecko.sanitizer check
+      # - name: Check validity of records
+      #   run: python -m gecko.sanitizer check
 
       - name: Index on Algolia
         env:
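For context on the step disabled above: the commented-out shell lines read boost.version from config/config.yaml and rewrite the underscore form into the dotted form used in the release URL. A minimal Python equivalent of that sed pipeline, for illustration only:

import re

version = '1_85_0'  # as read from config/config.yaml
version_main = re.sub(r'([0-9]+)_([0-9]+)_([0-9]+)(.*)', r'\1.\2.\3', version)
print(version_main)  # 1.85.0
print('https://boostorg.jfrog.io/artifactory/main/release/{}/source/boost_{}.tar.gz'
      .format(version_main, version))

With the archive download disabled, the new mkdir ../boost_1_85_0 line presumably exists only so the boost.root path check in gecko/config.py (see below) still passes while the libraries crawl is switched off.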
File renamed without changes.
1 change: 1 addition & 0 deletions algolia_records/libraries/.gitignore
@@ -0,0 +1 @@
+*.json
14 changes: 12 additions & 2 deletions config/config.yaml

@@ -1,6 +1,16 @@
 boost:
-  version: "1_85_0"
-  root: "../boost_1_85_0"
+  version: '1_85_0'
+  root: '../boost_1_85_0'
+
+website-v2-docs:
+  root: '../website-v2-docs/build'
+  sections:
+    - key: 'contributor-guide'
+      name: 'Contributor Guide'
+    - key: 'formal-reviews'
+      name: 'Formal Reviews'
+    - key: 'user-guide'
+      name: 'User Guide'
 
 algolia:
   app-id: D7O1MLLTAF
11 changes: 10 additions & 1 deletion gecko/config.py

@@ -7,10 +7,19 @@
         'version': str,
         'root': os.path.exists
     },
+    'website-v2-docs': {
+        'root': os.path.exists,
+        'sections': [
+            {
+                'key': str,
+                'name': str
+            }
+        ]
+    },
     'algolia': {
         'app-id': str,
         'api-key': str,
-        'settings':dict
+        'settings': dict
     },
     'crawlers': [
         {
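The checker that consumes this schema is not shown in the diff, but the shape of the schema (type leaves like str, callable leaves like os.path.exists, single-item lists meaning "every element") suggests a small recursive walk. An illustrative sketch only, not the repository's actual implementation:

import os

def validate(value, schema, path='config'):
    # Dict schema: every key must be present and valid.
    if isinstance(schema, dict):
        for key, sub in schema.items():
            if key not in value:
                raise KeyError('{} is missing {!r}'.format(path, key))
            validate(value[key], sub, path + '.' + key)
    # Single-item list schema: validate each element against it.
    elif isinstance(schema, list):
        for i, item in enumerate(value):
            validate(item, schema[0], '{}[{}]'.format(path, i))
    # Type leaf, e.g. str or dict.
    elif isinstance(schema, type):
        if not isinstance(value, schema):
            raise TypeError('{} should be {}'.format(path, schema.__name__))
    # Callable leaf, e.g. os.path.exists.
    elif callable(schema):
        if not schema(value):
            raise ValueError('{} failed {}'.format(path, schema.__name__))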
96 changes: 96 additions & 0 deletions gecko/extract_learn_records.py

@@ -0,0 +1,96 @@
import re
import json
from pathlib import Path
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag

from .crawlers.helpers import has_class
from .config import config


class AntoraDoc():
    def crawl(self, doc_root: Path) -> dict:
        sections = {}
        doc_root = doc_root.resolve()

        for file_path in doc_root.rglob('*.html'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')

            lvls = []
            for link in soup.select('body nav.breadcrumbs ul li a'):
                lvls = lvls + [{'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}]

            sect1 = soup.select_one('body article.doc')
            if sect1:
                self._extract_section_n(str(file_path), sections, sect1, lvls)

        return sections

    def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = []):
        header = sect.select_one('h1, h2, h3, h4, h5, h6')

        if header.name == 'h1':
            path = file_path
        else:
            title = header.text
            path = file_path + '#' + header.get('id')
            lvls = lvls + [{'title': title, 'path': path}]

        if header.find_next_sibling() and has_class(header.find_next_sibling(), 'sectionbody'):
            siblings = header.find_next_sibling().find_all(recursive=False)
        else:
            siblings = header.next_siblings

        content = ''
        for sibling in siblings:
            if isinstance(sibling, Tag) and sibling.has_attr('class') and len([i for i in sibling.get('class') if i.startswith('sect')]) > 0:
                self._extract_section_n(file_path, sections, sibling, lvls)
                continue
            content += sibling.get_text() + ' '

        sections[path] = {'content': content, 'lvls': lvls}


def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
    doc_root = doc_root.resolve()
    records = []

    for _, section in sections.items():
        for lvl in section['lvls']:
            lvl['path'] = lvl['path'].replace(str(doc_root) + '/', '')

        records.append({
            'type': 'content',
            'section_key': section_key,
            'section_name': section_name,
            'content': re.sub(r'\s+', ' ', section['content']).strip(),
            'weight': {
                'pageRank': 0,
                'level': 100 - len(section['lvls']) * 10,
                'position': 0
            },
            'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
            'hierarchy': {
                'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
                'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
                'lvl2': section['lvls'][2] if len(section['lvls']) > 2 else None,
                'lvl3': section['lvls'][3] if len(section['lvls']) > 3 else None,
                'lvl4': section['lvls'][4] if len(section['lvls']) > 4 else None,
                'lvl5': section['lvls'][5] if len(section['lvls']) > 5 else None,
                'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
            }})

    with open('./algolia_records/learn/' + section_key + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile, indent=4)


if __name__ == "__main__":
    crawler = AntoraDoc()

    for section in config['website-v2-docs']['sections']:
        sections = crawler.crawl(Path(config['website-v2-docs']['root']) / section['key'])
        create_algolia_records(section['key'],
                               section['name'],
                               Path(config['website-v2-docs']['root']),
                               sections)
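For reference, each record appended above is a plain dict. A section sitting two breadcrumb levels deep in the user guide would serialize roughly like this (field layout taken from create_algolia_records above; the actual values here are invented for illustration):

{
    'type': 'content',
    'section_key': 'user-guide',
    'section_name': 'User Guide',
    'content': 'Boost libraries are intended to be widely useful ...',
    'weight': {'pageRank': 0, 'level': 80, 'position': 0},  # 100 - 2 levels * 10
    'path': 'user-guide/intro.html#introduction',
    'hierarchy': {
        'lvl0': {'title': 'User Guide', 'path': 'user-guide/index.html'},
        'lvl1': {'title': 'Introduction', 'path': 'user-guide/intro.html#introduction'},
        'lvl2': None, 'lvl3': None, 'lvl4': None,
        'lvl5': None, 'lvl6': None
    }
}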
@@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
                 'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
             }})
 
-    with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
+    with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
         json.dump(records, outfile, indent=4)
24 changes: 18 additions & 6 deletions gecko/index_on_algolia.py

@@ -9,19 +9,19 @@
 client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])
 
 print('Initializing {} index ...'.format(config['boost']['version']))
-index = client.init_index(config['boost']['version'])
+libraries_index = client.init_index(config['boost']['version'])
 
 print('Setting settings for {} index ...'.format(config['boost']['version']))
-index.set_settings(config['algolia']['settings'])
+libraries_index.set_settings(config['algolia']['settings'])
 
-for path in Path('./algolia_records').glob('*.json'):
+for path in Path('./algolia_records/libraries').glob('*.json'):
     print('uploading records for {}...'.format(path.stem))
 
     with open(path, 'r', encoding='utf-8') as f:
         records = json.load(f)
 
     # Delete the existing records for this library.
-    index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
+    libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
 
     # Split long documents into smaller parts.
     for record in records:
@@ -34,5 +34,17 @@
     records = [record for record in records if not (
         record['content'] == '' and not record['hierarchy']['lvl0'])]
 
     # TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
-    index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+    libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+
+learn_index = client.init_index('learn')
+
+for path in Path('./algolia_records/learn').glob('*.json'):
+    print('uploading records for {}...'.format(path.stem))
+
+    with open(path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+
+    # Delete the existing records for this library.
+    learn_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})
+
+    learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
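The learn records thus land in a separate index named 'learn', decoupled from the per-release libraries index. A hedged sketch of querying it with the same algoliasearch client used above (the search-only API key and the query are placeholders, and filtering on section_key assumes it is configured as a filterable attribute, as the delete_by call above already relies on):

from algoliasearch.search_client import SearchClient

client = SearchClient.create('D7O1MLLTAF', '<search-only-api-key>')
learn_index = client.init_index('learn')

# Restrict the search to one section via the same attribute the
# delete_by call filters on.
results = learn_index.search('exception safety', {'filters': 'section_key:user-guide'})
for hit in results['hits']:
    print(hit['section_name'], hit['path'])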
