Extract search records for learn sections #148

Merged · 5 commits · Jun 29, 2024
16 changes: 13 additions & 3 deletions .github/workflows/index_on_algolia.yml
@@ -26,15 +26,25 @@ jobs:
       - name: Install dependencies
         run: pip install -r requirements.txt
 
+      - name: Clone and build website-v2-docs
+        run: |
+          git clone --depth=1 --branch=master https://github.com/boostorg/website-v2-docs.git ../website-v2-docs
+          cd ../website-v2-docs
+          ./build.sh
+
       - name: Download and extract boost release archive
         run: |
           mkdir ../boost_1_85_0
           BOOST_VERSION=$(sed -n 's/.*version: "\(.*\)"/\1/p' config/config.yaml)
           BOOST_VERSION_MAIN=$(echo $BOOST_VERSION | sed -E 's/([0-9]+)_([0-9]+)_([0-9]+)(.*)/\1.\2.\3/g')
-          wget --no-verbose https://boostorg.jfrog.io/artifactory/main/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
+          wget --no-verbose https://archives.boost.io/release/$BOOST_VERSION_MAIN/source/boost_$BOOST_VERSION.tar.gz
           tar -xzf boost_$BOOST_VERSION.tar.gz -C ../
 
-      - name: Extract records
-        run: python -m gecko.extract_records
+      - name: Extract learn records
+        run: python -m gecko.extract_learn_records
+
+      - name: Extract libraries records
+        run: python -m gecko.extract_libraries_records
 
       - name: Check validity of records
         run: python -m gecko.sanitizer check
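The release URL is assembled from config.yaml: the first sed pulls out the underscore-separated version, the second rewrites it in dotted form for the directory segment of the URL. A Python mirror of the transformation, for illustration only (note the sed pattern matches a double-quoted version string, while the config change below switches to single quotes, so the pattern is worth double-checking):

```python
import re

line = 'version: "1_85_0"'  # the shape the workflow's sed expression expects
boost_version = re.search(r'version: "(.*)"', line).group(1)                # '1_85_0'
main = re.sub(r'([0-9]+)_([0-9]+)_([0-9]+).*', r'\1.\2.\3', boost_version)  # '1.85.0'
print(f'https://archives.boost.io/release/{main}/source/boost_{boost_version}.tar.gz')
```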
File renamed without changes.
1 change: 1 addition & 0 deletions algolia_records/libraries/.gitignore
@@ -0,0 +1 @@
+*.json
41 changes: 31 additions & 10 deletions config/config.yaml
@@ -1,6 +1,27 @@
boost:
version: "1_85_0"
root: "../boost_1_85_0"
version: '1_85_0'
root: '../boost_1_85_0'

website-v2-docs:
root: '../website-v2-docs/build'
sections:
- key: 'contributor-guide'
name: 'Contributor Guide'
last-records: 388
last-words: 40989
last-lvls: 1325

- key: 'formal-reviews'
name: 'Formal Reviews'
last-records: 45
last-words: 7433
last-lvls: 154

- key: 'user-guide'
name: 'User Guide'
last-records: 189
last-words: 36070
last-lvls: 572

algolia:
app-id: D7O1MLLTAF
@@ -20,7 +41,7 @@ algolia:
       - unordered(hierarchy.lvl5)
       - unordered(hierarchy.lvl6)
       - content
-    numericAttributesToIndex: null
+    numericAttributesToIndex:
     attributesToRetrieve:
       - hierarchy.lvl0.path
       - hierarchy.lvl1.path
@@ -39,8 +60,8 @@ algolia:
     advancedSyntax: true
     attributeCriteriaComputedByMinProximity: true
     distinct: true
-    unretrievableAttributes: null
-    optionalWords: null
+    unretrievableAttributes:
+    optionalWords:
     attributesForFaceting:
       - library_key
     attributesToSnippet:
@@ -307,6 +328,11 @@ crawlers:
         last-words: 4059
         last-lvls: 204
 
+      - key: cobalt
+        last-records: 131
+        last-words: 14546
+        last-lvls: 319
+
       - key: compat
         last-records: 46
         last-words: 1079
@@ -382,11 +408,6 @@ crawlers:
         last-words: 6161
         last-lvls: 127
 
-      - key: cobalt
-        last-records: 131
-        last-words: 14546
-        last-lvls: 319
-
   - name: QuickBook
     libraries:
       - key: accumulators
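The `last-records` / `last-words` / `last-lvls` values are baselines for the sanitizer's drift check: a fresh crawl fails if any metric moves more than 20% from the recorded value (see `gecko/sanitizer.py` below). For example, with `last-records: 388` for the contributor guide, roughly 311 to 465 records pass and anything outside that band is flagged. A minimal sketch of the rule:

```python
# Minimal sketch of the 20% tolerance the baselines feed into.
def within_tolerance(prev: int, curr: int, tol: float = 0.2) -> bool:
    return abs(curr - prev) / prev <= tol

assert within_tolerance(388, 400)       # +3.1% -> passes
assert not within_tolerance(388, 300)   # -22.7% -> flagged as abnormal
```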
14 changes: 13 additions & 1 deletion gecko/config.py
@@ -7,10 +7,22 @@
         'version': str,
         'root': os.path.exists
     },
+    'website-v2-docs': {
+        'root': os.path.exists,
+        'sections': [
+            {
+                'key': str,
+                'name': str,
+                'last-records': int,
+                'last-words': int,
+                'last-lvls': int
+            }
+        ]
+    },
     'algolia': {
         'app-id': str,
         'api-key': str,
-        'settings':dict
+        'settings': dict
     },
     'crawlers': [
         {
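The schema pairs each config key with a type or predicate (`os.path.exists` for paths), and a single-element list means "every element matches the nested schema". The project's actual validator isn't part of this diff; a hypothetical recursive checker compatible with this shape could look like:

```python
import os

def validate(schema, value) -> bool:
    # Hypothetical checker, not the project's real one: dicts recurse per key,
    # a one-element list validates every item, types use isinstance, and
    # anything else (e.g. os.path.exists) is called as a predicate.
    if isinstance(schema, dict):
        return isinstance(value, dict) and all(
            key in value and validate(sub, value[key]) for key, sub in schema.items())
    if isinstance(schema, list):
        return isinstance(value, list) and all(validate(schema[0], item) for item in value)
    if isinstance(schema, type):
        return isinstance(value, schema)
    return callable(schema) and bool(schema(value))
```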
96 changes: 96 additions & 0 deletions gecko/extract_learn_records.py
@@ -0,0 +1,96 @@
import re
import json
from pathlib import Path
from urllib.parse import urljoin
from bs4 import BeautifulSoup, Tag

from .crawlers.helpers import has_class
from .config import config


class AntoraDoc():
    def crawl(self, doc_root: Path) -> dict:
        sections = {}
        doc_root = doc_root.resolve()

        for file_path in doc_root.rglob('*.html'):
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
                soup = BeautifulSoup(file.read(), 'html.parser')

            # Build the hierarchy prefix from the page's breadcrumb trail.
            lvls = []
            for link in soup.select('body nav.breadcrumbs ul li a'):
                lvls = lvls + [{'title': link.text, 'path': urljoin(str(file_path), link.get('href'))}]

            sect1 = soup.select_one('body article.doc')
            if sect1:
                self._extract_section_n(str(file_path), sections, sect1, lvls)

        return sections

    def _extract_section_n(self, file_path: str, sections: dict, sect: Tag, lvls: list = []):
        header = sect.select_one('h1, h2, h3, h4, h5, h6')

        if header.name == 'h1':
            # The page title is already the last breadcrumb; anchor to the file itself.
            path = file_path
        else:
            title = header.text
            path = file_path + '#' + header.get('id')
            lvls = lvls + [{'title': title, 'path': path}]

        if header.find_next_sibling() and has_class(header.find_next_sibling(), 'sectionbody'):
            siblings = header.find_next_sibling().find_all(recursive=False)
        else:
            siblings = header.next_siblings

        content = ''
        for sibling in siblings:
            # Nested sect* blocks become their own records; recurse instead of inlining.
            if isinstance(sibling, Tag) and sibling.has_attr('class') and len([i for i in sibling.get('class') if i.startswith('sect')]) > 0:
                self._extract_section_n(file_path, sections, sibling, lvls)
                continue
            content += sibling.get_text() + ' '

        sections[path] = {'content': content, 'lvls': lvls}


def create_algolia_records(section_key: str, section_name: str, doc_root: Path, sections: dict):
    doc_root = doc_root.resolve()
    records = []

    for _, section in sections.items():
        # Make paths relative to the build root so they can serve as site URLs.
        for lvl in section['lvls']:
            lvl['path'] = lvl['path'].replace(str(doc_root) + '/', '')

        records.append({
            'type': 'content',
            'section_key': section_key,
            'section_name': section_name,
            'content': re.sub(r'\s+', ' ', section['content']).strip(),
            'weight': {
                'pageRank': 0,
                # Deeper sections rank lower: two lvls -> 80, three -> 70, ...
                'level': 100 - len(section['lvls']) * 10,
                'position': 0
            },
            'path': section['lvls'][-1]['path'] if len(section['lvls']) > 0 else None,
            'hierarchy': {
                'lvl0': section['lvls'][0] if len(section['lvls']) > 0 else None,
                'lvl1': section['lvls'][1] if len(section['lvls']) > 1 else None,
                'lvl2': section['lvls'][2] if len(section['lvls']) > 2 else None,
                'lvl3': section['lvls'][3] if len(section['lvls']) > 3 else None,
                'lvl4': section['lvls'][4] if len(section['lvls']) > 4 else None,
                'lvl5': section['lvls'][5] if len(section['lvls']) > 5 else None,
                'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
            }})

    with open('./algolia_records/learn/' + section_key + '.json', 'w', encoding='utf-8') as outfile:
        json.dump(records, outfile, indent=4)


if __name__ == "__main__":
    crawler = AntoraDoc()

    for section in config['website-v2-docs']['sections']:
        sections = crawler.crawl(Path(config['website-v2-docs']['root']) / section['key'])
        create_algolia_records(section['key'],
                               section['name'],
                               Path(config['website-v2-docs']['root']),
                               sections)
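For a hypothetical page `user-guide/intro.html` whose breadcrumbs read "User Guide > Introduction", the crawler would emit a record shaped roughly like this (titles, paths, and content are illustrative, not real output):

```python
# Illustrative record only; all values are made up.
record = {
    'type': 'content',
    'section_key': 'user-guide',
    'section_name': 'User Guide',
    'content': 'Boost provides free peer-reviewed portable C++ source libraries ...',
    'weight': {'pageRank': 0, 'level': 80, 'position': 0},  # 100 - 2 lvls * 10
    'path': 'user-guide/intro.html',
    'hierarchy': {
        'lvl0': {'title': 'User Guide', 'path': 'user-guide/index.html'},
        'lvl1': {'title': 'Introduction', 'path': 'user-guide/intro.html'},
        'lvl2': None, 'lvl3': None, 'lvl4': None,
        'lvl5': None, 'lvl6': None
    }
}
```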
@@ -75,7 +75,7 @@ def create_algolia_records(library_key: str, sections: dict, boost_root: str):
                 'lvl6': section['lvls'][6] if len(section['lvls']) > 6 else None
             }})
 
-    with open('./algolia_records/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
+    with open('./algolia_records/libraries/' + library_key.replace('/', '_') + '.json', 'w', encoding='utf-8') as outfile:
         json.dump(records, outfile, indent=4)


26 changes: 19 additions & 7 deletions gecko/index_on_algolia.py
@@ -9,19 +9,19 @@
 client = SearchClient.create(config['algolia']['app-id'], config['algolia']['api-key'])
 
 print('Initializing {} index ...'.format(config['boost']['version']))
-index = client.init_index(config['boost']['version'])
+libraries_index = client.init_index(config['boost']['version'])
 
-print('Setting settings for {} index ...'.format(config['boost']['version']))
-index.set_settings(config['algolia']['settings'])
+# print('Setting settings for {} index ...'.format(config['boost']['version']))
+# libraries_index.set_settings(config['algolia']['settings'])
 
-for path in Path('./algolia_records').glob('*.json'):
+for path in Path('./algolia_records/libraries').glob('*.json'):
     print('uploading records for {}...'.format(path.stem))
 
     with open(path, 'r', encoding='utf-8') as f:
         records = json.load(f)
 
     # Delete the existing records for this library.
-    index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
+    libraries_index.delete_by({'filters': 'library_key:{}'.format(records[0]['library_key'])})
 
     # Split long documents into smaller parts.
     for record in records:
@@ -34,5 +34,17 @@
     records = [record for record in records if not (
         record['content'] == '' and not record['hierarchy']['lvl0'])]
 
     # TODO instead of using autoGenerateObjectIDIfNotExist we might create a hash out of hierarchy items
-    index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+    libraries_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
+
+learn_index = client.init_index('learn')
+
+for path in Path('./algolia_records/learn').glob('*.json'):
+    print('uploading records for {}...'.format(path.stem))
+
+    with open(path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+
+    # Delete the existing records for this section.
+    libraries_index.delete_by({'filters': 'section_key:{}'.format(records[0]['section_key'])})
+
+    #learn_index.save_objects(records, {'autoGenerateObjectIDIfNotExist': True})
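The TODO above hints at replacing `autoGenerateObjectIDIfNotExist` with deterministic IDs derived from the hierarchy. One possible sketch of that idea (an assumption, not what the code does today): hash the hierarchy items so re-uploads of the same section overwrite records instead of accumulating duplicates.

```python
import hashlib
import json

def object_id(record: dict) -> str:
    # Hypothetical: a stable Algolia objectID from the hierarchy items, as the TODO suggests.
    key = json.dumps(record['hierarchy'], sort_keys=True)
    return hashlib.sha1(key.encode('utf-8')).hexdigest()
```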
93 changes: 56 additions & 37 deletions gecko/sanitizer.py
@@ -13,62 +13,81 @@
 from .config import config, update_config_file
 
 
-def check_for_abnormality(nbof: str, library_key: str, prev: int, curr: int):
+def check_for_abnormality(nbof: str, name: str, prev: int, curr: int):
     if (abs(curr - prev) / prev) > 0.2:
-        print('Error: Abnormal change in number of {} in {} from:{} to:{}'.format(nbof, library_key, prev, curr))
+        print('Error: Abnormal change in number of {} in {} from:{} to:{}'.format(nbof, name, prev, curr))
         return True
     return False
 
 
+def check(cfg: dict, json_file_path: Path):
+    failed = False
+    with open(json_file_path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+    words = 0
+    lvls = 0
+    for record in records:
+        words += len(re.findall(r'\w+', record['content']))
+        lvls += len([l for l in record['hierarchy'].values() if l is not None])
+
+    failed |= check_for_abnormality(
+        nbof='records',
+        name=cfg['key'],
+        prev=cfg['last-records'],
+        curr=len(records))
+
+    failed |= check_for_abnormality(
+        nbof='words',
+        name=cfg['key'],
+        prev=cfg['last-words'],
+        curr=words)
+
+    failed |= check_for_abnormality(
+        nbof='lvls',
+        name=cfg['key'],
+        prev=cfg['last-lvls'],
+        curr=lvls)
+
+    return failed
+
+
+def update_config(cfg: dict, json_file_path: Path):
+    with open(json_file_path, 'r', encoding='utf-8') as f:
+        records = json.load(f)
+    cfg['last-records'] = len(records)
+    cfg['last-words'] = 0
+    cfg['last-lvls'] = 0
+    for record in records:
+        cfg['last-words'] += len(re.findall(r'\w+', record['content']))
+        cfg['last-lvls'] += len([l for l in record['hierarchy'].values() if l is not None])
+
+
 if __name__ == "__main__":
     args = docopt(__doc__)
 
     if args['check']:
         failed = False
         for crawler_cfg in config['crawlers']:
             for library_cfg in crawler_cfg['libraries']:
-                json_file_path = Path('./algolia_records') / f"{library_cfg['key'].replace('/','_')}.json"
-                with open(json_file_path, 'r', encoding='utf-8') as f:
-                    records = json.load(f)
-                words = 0
-                lvls = 0
-                for record in records:
-                    words += len(re.findall(r'\w+', record['content']))
-                    lvls += len([l for l in record['hierarchy'].values() if l is not None])
-
-                failed |= check_for_abnormality(
-                    nbof='records',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-records'],
-                    curr=len(records))
-
-                failed |= check_for_abnormality(
-                    nbof='words',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-words'],
-                    curr=words)
-
-                failed |= check_for_abnormality(
-                    nbof='lvls',
-                    library_key=library_cfg['key'],
-                    prev=library_cfg['last-lvls'],
-                    curr=lvls)
+                json_file_path = Path('./algolia_records/libraries') / f"{library_cfg['key'].replace('/','_')}.json"
+                failed |= check(library_cfg, json_file_path)
+
+        for section_cfg in config['website-v2-docs']['sections']:
+            json_file_path = Path('./algolia_records/learn') / f"{section_cfg['key']}.json"
+            failed |= check(section_cfg, json_file_path)
 
         if failed:
             sys.exit(1)
 
     if args['update-config']:
         for crawler_cfg in config['crawlers']:
             for library_cfg in crawler_cfg['libraries']:
-                json_file_path = Path('./algolia_records') / f"{library_cfg['key'].replace('/','_')}.json"
-                with open(json_file_path, 'r', encoding='utf-8') as f:
-                    records = json.load(f)
-                library_cfg['last-records'] = len(records)
-                library_cfg['last-words'] = 0
-                library_cfg['last-lvls'] = 0
-                for record in records:
-                    library_cfg['last-words'] += len(re.findall(r'\w+', record['content']))
-                    library_cfg['last-lvls'] += len([l for l in record['hierarchy'].values() if l is not None])
+                json_file_path = Path('./algolia_records/libraries') / f"{library_cfg['key'].replace('/','_')}.json"
+                update_config(library_cfg, json_file_path)
+
+        for section_cfg in config['website-v2-docs']['sections']:
+            json_file_path = Path('./algolia_records/learn') / f"{section_cfg['key']}.json"
+            update_config(section_cfg, json_file_path)
 
         update_config_file()
         print('Config has been updated.')
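To make the check's semantics concrete: `check_for_abnormality` returns True (and prints an error) once a metric drifts more than 20% from its baseline. A worked pair of cases, assuming the user-guide baseline of 189 records from config.yaml (note a zero baseline would raise ZeroDivisionError, but the recorded baselines are all non-zero):

```python
from gecko.sanitizer import check_for_abnormality

# -4.8% change: within tolerance, no error.
assert check_for_abnormality('records', 'user-guide', prev=189, curr=180) is False
# -36.5% change: outside tolerance, flagged.
assert check_for_abnormality('records', 'user-guide', prev=189, curr=120) is True
```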