Extract search records for learn sections #148

Merged · 5 commits · Jun 29, 2024
16 changes: 13 additions & 3 deletions .github/workflows/index_on_algolia.yml
@@ -26,15 +26,25 @@ jobs:
       - name: Install dependencies
         run: pip install -r requirements.txt
 
+      - name: Clone and build website-v2-docs
+        run: |
+          git clone --depth=1 --branch=master https://github.com/boostorg/website-v2-docs.git ../website-v2-docs
+          cd ../website-v2-docs
+          ./build.sh
+
       - name: Download and extract boost release archive
         run: |
           mkdir ../boost_1_85_0
           BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
           BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
-          wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
+          wget --no-verbose https://archives.boost.io/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
           tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
 
-      - name: Extract records
-        run: python -m gecko.extract_records
+      - name: Extract learn records
+        run: python -m gecko.extract_learn_records
+
+      - name: Extract libraries records
+        run: python -m gecko.extract_libraries_records
 
       - name: Check validity of records
         run: python -m gecko.sanitizer check
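The release URL is assembled from config.yaml: the first sed pulls out the underscore-separated version, the second rewrites it in dotted form for the directory segment of the URL. A Python mirror of the transformation, for illustration only (note the sed pattern matches a double-quoted version string, while the config change below switches to single quotes, so the pattern is worth double-checking):

```python
import re

line = 'version: "1_85_0"'  # the shape the workflow's sed expression expects
boost_version = re.search(r'version: "(.*)"', line).group(1)                # '1_85_0'
main = re.sub(r'([0-9]+)_([0-9]+)_([0-9]+).*', r'\1.\2.\3', boost_version)  # '1.85.0'
print(f'https://archives.boost.io/release/{main}/source/boost_{boost_version}.tar.gz')
```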
File renamed without changes.
1 change: 1 addition & 0 deletions algolia_records/libraries/.gitignore
@@ -0,0 +1 @@
+*.json
41 changes: 31 additions & 10 deletions config/config.yaml
@@ -1,6 +1,27 @@
boost:
version: "1_85_0"
root: "../boost_1_85_0"
version: '1_85_0'
root: '../boost_1_85_0'

website-v2-docs:
root: '../website-v2-docs/build'
sections:
- key: 'contributor-guide'
name: 'Contributor Guide'
last-records: 388
last-words: 40989
last-lvls: 1325

- key: 'formal-reviews'
name: 'Formal Reviews'
last-records: 45
last-words: 7433
last-lvls: 154

- key: 'user-guide'
name: 'User Guide'
last-records: 189
last-words: 36070
last-lvls: 572

algolia:
app-id: D7O1MLLTAF
@@ -20,7 +41,7 @@ algolia:
       - unordered(hierarchy.lvl5)
       - unordered(hierarchy.lvl6)
       - content
-    numericAttributesToIndex: null
+    numericAttributesToIndex:
     attributesToRetrieve:
       - hierarchy.lvl0.path
       - hierarchy.lvl1.path
@@ -39,8 +60,8 @@ algolia:
     advancedSyntax: true
     attributeCriteriaComputedByMinProximity: true
     distinct: true
-    unretrievableAttributes: null
-    optionalWords: null
+    unretrievableAttributes:
+    optionalWords:
     attributesForFaceting:
       - library_key
     attributesToSnippet:
@@ -307,6 +328,11 @@ crawlers:
         last-words: 4059
         last-lvls: 204
 
+      - key: cobalt
+        last-records: 131
+        last-words: 14546
+        last-lvls: 319
+
       - key: compat
         last-records: 46
         last-words: 1079
@@ -382,11 +408,6 @@ crawlers:
         last-words: 6161
         last-lvls: 127
 
-      - key: cobalt
-        last-records: 131
-        last-words: 14546
-        last-lvls: 319
-
   - name: QuickBook
     libraries:
       - key: accumulators
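The `last-records` / `last-words` / `last-lvls` values are baselines for the sanitizer's drift check: a fresh crawl fails if any metric moves more than 20% from the recorded value (see `gecko/sanitizer.py` below). For example, with `last-records: 388` for the contributor guide, roughly 311 to 465 records pass and anything outside that band is flagged. A minimal sketch of the rule:

```python
# Minimal sketch of the 20% tolerance the baselines feed into.
def within_tolerance(prev: int, curr: int, tol: float = 0.2) -> bool:
    return abs(curr - prev) / prev <= tol

assert within_tolerance(388, 400)       # +3.1% -> passes
assert not within_tolerance(388, 300)   # -22.7% -> flagged as abnormal
```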
14 changes: 13 additions & 1 deletion gecko/config.py
@@ -7,10 +7,22 @@
         'version': str,
         'root': os.path.exists
     },
+    'website-v2-docs': {
+        'root': os.path.exists,
+        'sections': [
+            {
+                'key': str,
+                'name': str,
+                'last-records': int,
+                'last-words': int,
+                'last-lvls': int
+            }
+        ]
+    },
     'algolia': {
         'app-id': str,
         'api-key': str,
-        'settings':dict
+        'settings': dict
     },
     'crawlers': [
         {
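The schema pairs each config key with a type or predicate (`os.path.exists` for paths), and a single-element list means "every element matches the nested schema". The project's actual validator isn't part of this diff; a hypothetical recursive checker compatible with this shape could look like:

```python
import os

def validate(schema, value) -> bool:
    # Hypothetical checker, not the project's real one: dicts recurse per key,
    # a one-element list validates every item, types use isinstance, and
    # anything else (e.g. os.path.exists) is called as a predicate.
    if isinstance(schema, dict):
        return isinstance(value, dict) and all(
            key in value and validate(sub, value[key]) for key, sub in schema.items())
    if isinstance(schema, list):
        return isinstance(value, list) and all(validate(schema[0], item) for item in value)
    if isinstance(schema, type):
        return isinstance(value, schema)
    return callable(schema) and bool(schema(value))
```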
96 changes: 96 additions & 0 deletions gecko/extract_learn_records.py
@@ -0,0 +1,96 @@
import re
import json
from pathlib import Path
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag

from .crawlers.helpers import has_class
from .config import config


class AntoraDoc():
    def crawl(self, doc_root: Path) -> dict:
        sections = {}
        doc_root = doc_root.resolve()

        for file_path in doc_root.rglob('*.html'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')

            # Build the hierarchy prefix from the page's breadcrumb trail.
            lvls = []
            for link in soup.select('body nav.breadcrumbs ul li a'):
                lvls = lvls + [{'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}]

            sect1 = soup.select_one('body article.doc')
            if sect1:
                self._extract_section_n(str(file_path), sections, sect1, lvls)

        return sections

    def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = []):
        header = sect.select_one('h1, h2, h3, h4, h5, h6')

        if header.name == 'h1':
            # The page title is already the last breadcrumb; anchor to the file itself.
            path = file_path
        else:
            title = header.text
            path = file_path + '#' + header.get('id')
            lvls = lvls + [{'title': title, 'path': path}]

        if header.find_next_sibling() and has_class(header.find_next_sibling(), 'sectionbody'):
            siblings = header.find_next_sibling().find_all(recursive=False)
        else:
            siblings = header.next_siblings

        content = ''
        for sibling in siblings:
            # Nested sect* blocks become their own records; recurse instead of inlining.
            if isinstance(sibling, Tag) and sibling.has_attr('class') and len([i for i in sibling.get('class') if i.startswith('sect')]) > 0:
                self._extract_section_n(file_path, sections, sibling, lvls)
                continue
            content += sibling.get_text() + ' '

        sections[path] = {'content': content, 'lvls': lvls}


def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
    doc_root = doc_root.resolve()
    records = []

    for _, section in sections.items():
        # Make paths relative to the build root so they can serve as site URLs.
        for lvl in section['lvls']:
            lvl['path'] = lvl['path'].replace(str(doc_root) + '/', '')

        records.append({
            'type': 'content',
            'section_key': section_key,
            'section_name': section_name,
            'content': re.sub(r'\s+', ' ', section['content']).strip(),
            'weight': {
                'pageRank': 0,
                # Deeper sections rank lower: two lvls -> 80, three -> 70, ...
                'level': 100 - len(section['lvls']) * 10,
                'position': 0
            },
            'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
            'hierarchy': {
                'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
                'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
                'lvl2': section['lvls'][2] if len(section['lvls']) > 2 else None,
                'lvl3': section['lvls'][3] if len(section['lvls']) > 3 else None,
                'lvl4': section['lvls'][4] if len(section['lvls']) > 4 else None,
                'lvl5': section['lvls'][5] if len(section['lvls']) > 5 else None,
                'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
            }})

    with open('./algolia_records/learn/' + section_key + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile, indent=4)


if __name__ == "__main__":
    crawler = AntoraDoc()

    for section in config['website-v2-docs']['sections']:
        sections = crawler.crawl(Path(config['website-v2-docs']['root']) / section['key'])
        create_algolia_records(section['key'],
                               section['name'],
                               Path(config['website-v2-docs']['root']),
                               sections)
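For a hypothetical page `user-guide/intro.html` whose breadcrumbs read "User Guide > Introduction", the crawler would emit a record shaped roughly like this (titles, paths, and content are illustrative, not real output):

```python
# Illustrative record only; all values are made up.
record = {
    'type': 'content',
    'section_key': 'user-guide',
    'section_name': 'User Guide',
    'content': 'Boost provides free peer-reviewed portable C++ source libraries ...',
    'weight': {'pageRank': 0, 'level': 80, 'position': 0},  # 100 - 2 lvls * 10
    'path': 'user-guide/intro.html',
    'hierarchy': {
        'lvl0': {'title': 'User Guide', 'path': 'user-guide/index.html'},
        'lvl1': {'title': 'Introduction', 'path': 'user-guide/intro.html'},
        'lvl2': None, 'lvl3': None, 'lvl4': None,
        'lvl5': None, 'lvl6': None
    }
}
```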
@@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
                 'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
             }})
 
-    with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
+    with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
         json.dump(records, outfile, indent=4)


26 changes: 19 additions & 7 deletions gecko/index_on_algolia.py
@@ -9,19 +9,19 @@
 client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])
 
 print('Initializing {} index ...'.format(config['boost']['version']))
-index = client.init_index(config['boost']['version'])
+libraries_index = client.init_index(config['boost']['version'])
 
-print('Setting settings for {} index ...'.format(config['boost']['version']))
-index.set_settings(config['algolia']['settings'])
+# print('Setting settings for {} index ...'.format(config['boost']['version']))
+# libraries_index.set_settings(config['algolia']['settings'])
 
-for path in Path('./algolia_records').glob('*.json'):
+for path in Path('./algolia_records/libraries').glob('*.json'):
     print('uploading records for {}...'.format(path.stem))
 
     with open(path, 'r', encoding='utf-8') as f:
         records = json.load(f)
 
     # Delete the existing records for this library.
-    index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
+    libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
 
     # Split long documents into smaller parts.
     for record in records:
@@ -34,5 +34,17 @@
     records = [record for record in records if not (
         record['content'] == '' and not record['hierarchy']['lvl0'])]
 
     # TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
-    index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+    libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+
+learn_index = client.init_index('learn')
+
+for path in Path('./algolia_records/learn').glob('*.json'):
+    print('uploading records for {}...'.format(path.stem))
+
+    with open(path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+
+    # Delete the existing records for this section.
+    libraries_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})
+
+    #learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
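The TODO above hints at replacing `autoGenerateObjectIDIfNotExist` with deterministic IDs derived from the hierarchy. One possible sketch of that idea (an assumption, not what the code does today): hash the hierarchy items so re-uploads of the same section overwrite records instead of accumulating duplicates.

```python
import hashlib
import json

def object_id(record: dict) -> str:
    # Hypothetical: a stable Algolia objectID from the hierarchy items, as the TODO suggests.
    key = json.dumps(record['hierarchy'], sort_keys=True)
    return hashlib.sha1(key.encode('utf-8')).hexdigest()
```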
93 changes: 56 additions & 37 deletions gecko/sanitizer.py
@@ -13,62 +13,81 @@
 from .config import config, update_config_file
 
 
-def check_for_abnormality(nbof: str, library_key: str, prev: int, curr: int):
+def check_for_abnormality(nbof: str, name: str, prev: int, curr: int):
     if (abs(curr - prev) / prev) > 0.2:
-        print('Error: Abnormal change in number of {} in {} from:{} to:{}'.format(nbof, library_key, prev, curr))
+        print('Error: Abnormal change in number of {} in {} from:{} to:{}'.format(nbof, name, prev, curr))
         return True
     return False
 
 
+def check(cfg: dict, json_file_path: Path):
+    failed = False
+    with open(json_file_path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+    words = 0
+    lvls = 0
+    for record in records:
+        words += len(re.findall(r'\w+', record['content']))
+        lvls += len([l for l in record['hierarchy'].values() if l is not None])
+
+    failed |= check_for_abnormality(
+        nbof='records',
+        name=cfg['key'],
+        prev=cfg['last-records'],
+        curr=len(records))
+
+    failed |= check_for_abnormality(
+        nbof='words',
+        name=cfg['key'],
+        prev=cfg['last-words'],
+        curr=words)
+
+    failed |= check_for_abnormality(
+        nbof='lvls',
+        name=cfg['key'],
+        prev=cfg['last-lvls'],
+        curr=lvls)
+
+    return failed
+
+
+def update_config(cfg: dict, json_file_path: Path):
+    with open(json_file_path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+    cfg['last-records'] = len(records)
+    cfg['last-words'] = 0
+    cfg['last-lvls'] = 0
+    for record in records:
+        cfg['last-words'] += len(re.findall(r'\w+', record['content']))
+        cfg['last-lvls'] += len([l for l in record['hierarchy'].values() if l is not None])
+
+
 if __name__ == "__main__":
     args = docopt(__doc__)
 
     if args['check']:
         failed = False
         for crawler_cfg in config['crawlers']:
             for library_cfg in crawler_cfg['libraries']:
-                json_file_path = Path('./algolia_records') / f"{library_cfg['key'].replace('/','_')}.json"
-                with open(json_file_path, 'r', encoding='utf-8') as f:
-                    records = json.load(f)
-                words = 0
-                lvls = 0
-                for record in records:
-                    words += len(re.findall(r'\w+', record['content']))
-                    lvls += len([l for l in record['hierarchy'].values() if l is not None])
-
-                failed |= check_for_abnormality(
-                    nbof='records',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-records'],
-                    curr=len(records))
-
-                failed |= check_for_abnormality(
-                    nbof='words',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-words'],
-                    curr=words)
-
-                failed |= check_for_abnormality(
-                    nbof='lvls',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-lvls'],
-                    curr=lvls)
+                json_file_path = Path('./algolia_records/libraries') / f"{library_cfg['key'].replace('/','_')}.json"
+                failed |= check(library_cfg, json_file_path)
+
+        for section_cfg in config['website-v2-docs']['sections']:
+            json_file_path = Path('./algolia_records/learn') / f"{section_cfg['key']}.json"
+            failed |= check(section_cfg, json_file_path)
 
         if failed:
             sys.exit(1)
 
     if args['update-config']:
         for crawler_cfg in config['crawlers']:
             for library_cfg in crawler_cfg['libraries']:
-                json_file_path = Path('./algolia_records') / f"{library_cfg['key'].replace('/','_')}.json"
-                with open(json_file_path, 'r', encoding='utf-8') as f:
-                    records = json.load(f)
-                library_cfg['last-records'] = len(records)
-                library_cfg['last-words'] = 0
-                library_cfg['last-lvls'] = 0
-                for record in records:
-                    library_cfg['last-words'] += len(re.findall(r'\w+', record['content']))
-                    library_cfg['last-lvls'] += len([l for l in record['hierarchy'].values() if l is not None])
+                json_file_path = Path('./algolia_records/libraries') / f"{library_cfg['key'].replace('/','_')}.json"
+                update_config(library_cfg, json_file_path)
+
+        for section_cfg in config['website-v2-docs']['sections']:
+            json_file_path = Path('./algolia_records/learn') / f"{section_cfg['key']}.json"
+            update_config(section_cfg, json_file_path)
 
         update_config_file()
         print('Config has been updated.')
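To make the check's semantics concrete: `check_for_abnormality` returns True (and prints an error) once a metric drifts more than 20% from its baseline. A worked pair of cases, assuming the user-guide baseline of 189 records from config.yaml (note a zero baseline would raise ZeroDivisionError, but the recorded baselines are all non-zero):

```python
from gecko.sanitizer import check_for_abnormality

# -4.8% change: within tolerance, no error.
assert check_for_abnormality('records', 'user-guide', prev=189, curr=180) is False
# -36.5% change: outside tolerance, flagged.
assert check_for_abnormality('records', 'user-guide', prev=189, curr=120) is True
```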