diff --git a/changelog.md b/changelog.md
index ad16bd1..d12aaf0 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,3 +1,7 @@
+# 1.1.21
+
+- Fix: creation of the includes map when using the `only_partial` argument.
+
 # 1.1.20
 
 - Fix: bug where the `_adjust_links` duplicated the anchor.
diff --git a/foliant/preprocessors/includes.py b/foliant/preprocessors/includes.py
index f536e9e..bb2620e 100644
--- a/foliant/preprocessors/includes.py
+++ b/foliant/preprocessors/includes.py
@@ -800,6 +800,130 @@ def _get_included_file_path(
 
         return included_file_path
 
+    def _read_source_file_content(
+        self,
+        file_path: Path
+    ) -> str:
+        '''Read content from a source file, handling both temporary and source directory paths.
+
+        :param file_path: Path to the file to read
+
+        :returns: File content as a string
+        '''
+
+        self.logger.debug(f'Reading source file: {file_path}')
+
+        # If the file is located in the temporary working directory, try to map it to the corresponding source file
+        if self.working_dir.resolve() in file_path.parents:
+            # This is a file in the temporary directory
+            try:
+                # Get the path to the source file
+                src_file_path = self._get_src_file_path(file_path)
+                self.logger.debug(f'Mapping temporary file to source file: {src_file_path}')
+
+                if src_file_path.exists():
+                    with open(src_file_path, encoding='utf8') as src_file:
+                        return src_file.read()
+                else:
+                    # If the source file is not found, fall back to the temporary file
+                    self.logger.debug('Source file not found, reading from temporary file')
+                    if file_path.exists():
+                        with open(file_path, encoding='utf8') as temp_file:
+                            return temp_file.read()
+                    else:
+                        self.logger.warning(f'File not found: {file_path}')
+                        return ''
+            except Exception as e:
+                self.logger.debug(f'Error mapping to source file: {e}, reading from temporary file')
+                if file_path.exists():
+                    with open(file_path, encoding='utf8') as temp_file:
+                        return temp_file.read()
+                else:
+                    self.logger.warning(f'File not found: {file_path}')
+                    return ''
+        else:
+            # The file is not in the temporary directory; read it directly
+            if file_path.exists():
+                with open(file_path, encoding='utf8') as src_file:
+                    return src_file.read()
+            else:
+                self.logger.warning(f'File not found: {file_path}')
+                return ''
+
+    def _has_not_build_meta(self, content: str) -> bool:
+        '''Check if the content has not_build: true in its front matter.
+
+        :param content: File content
+
+        :returns: True if the file has not_build: true in its metadata
+        '''
+        # Simple check for front matter with not_build: true
+        front_matter_pattern = re.compile(r'^---\s*\n(.*?)\n---\s*\n', re.DOTALL | re.MULTILINE)
+        match = front_matter_pattern.match(content)
+
+        if match:
+            front_matter = match.group(1)
+            # Check for not_build: true
+            not_build_pattern = re.compile(r'not_build\s*:\s*true', re.IGNORECASE)
+            return bool(not_build_pattern.search(front_matter))
+
+        return False
+
+    def _process_include_for_includes_map(
+        self,
+        included_file_path: Path,
+        from_heading: str or None = None,
+        to_heading: str or None = None,
+        from_id: str or None = None,
+        to_id: str or None = None,
+        to_end: bool = False
+    ) -> (str, list):
+        '''Process an include statement specifically for includes_map generation.
+        Reads content from source files directly, not from the temporary directory.
+
+        :param included_file_path: Path to the included file
+        :param from_heading: Include starting from this heading
+        :param to_heading: Include up to this heading
+        :param from_id: Include starting from the heading or the anchor that has this ID
+        :param to_id: Include up to the heading or the anchor that has this ID
+        :param to_end: Flag that tells to cut to the end of document
+
+        :returns: Tuple of (included file content, list of anchors)
+        '''
+
+        self.logger.debug(f'Processing include for includes_map: {included_file_path}')
+
+        anchors = []
+
+        # Read the file content from the source directory
+        content = self._read_source_file_content(included_file_path)
+
+        if not content:
+            return '', anchors
+
+        # Check if the file has not_build: true
+        if self._has_not_build_meta(content):
+            self.logger.debug(f'File {included_file_path} has not_build: true, using original content')
+
+        # Remove metadata from the content
+        content = remove_meta(content)
+
+        # Cut content based on parameters
+        content = self._cut_from_position_to_position(
+            content,
+            from_heading,
+            to_heading,
+            from_id,
+            to_id,
+            to_end
+        )
+
+        # Find anchors
+        if self.includes_map_anchors:
+            anchors = self._add_anchors(anchors, content)
+
+        return content, anchors
+
     def _process_include(
         self,
         included_file_path: Path,
@@ -829,8 +953,10 @@ def _process_include(
         :param to_end: Flag that tells to cut to the end of document
         :param sethead: Level of the topmost heading in the included content
         :param nohead: Flag that tells to strip the starting heading from the included content
+        :param include_link: Link to the included file for URL includes
+        :param origin_file_path: Path to the original file where the include tag is located
 
-        :returns: Included file content
+        :returns: Tuple of (included file content, list of anchors)
         '''
 
         self.logger.debug(
@@ -875,7 +1001,7 @@ def _process_include(
             relative_path = regexp_find_path.findall(line)
 
             for ex_line in relative_path:
-                exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|.svg', ex_line)
+                exceptions_characters = re.findall(r'https?://[^\s]+|@|:|\.png|\.jpeg|\.svg', ex_line)
                 if exceptions_characters:
                     continue
                 else:
@@ -924,7 +1050,8 @@ def _process_include(
             ).escape(included_content)
 
         included_content = self._adjust_image_paths(included_content, included_file_path)
-        included_content = self._adjust_links(included_content, included_file_path, origin_file_path)
+        if origin_file_path:
+            included_content = self._adjust_links(included_content, included_file_path, origin_file_path)
 
         if project_root_path:
             included_content = self._adjust_paths_in_tags_attributes(
@@ -957,30 +1084,31 @@ def _find_anchors(self, content: str) -> list:
         anchors_list = []
         anchors = re.findall(r'\<anchor\>([\-\_A-Za-z0-9]+)\<\/anchor\>', content)
-        for anchor in anchors:
-            anchors_list.append(anchor)
+        anchors_list.extend(anchors)
+
         custom_ids = re.findall(r'\{\#([\-\_A-Za-z0-9]+)\}', content)
-        for anchor in custom_ids:
-            anchors_list.append(anchor)
+        anchors_list.extend(custom_ids)
+
         elements_with_ids = re.findall(r'id\=[\"\']([\-\_A-Za-z0-9]+)[\"\']', content)
-        for anchor in elements_with_ids:
-            anchors_list.append(anchor)
+        anchors_list.extend(elements_with_ids)
+
         return anchors_list
 
-    def _add_anchors(self, l: list, content: str) -> list:
+    def _add_anchors(self, anchor_list: list, content: str) -> list:
         """Add an anchor link to the list of anchor links
 
-        :param l: The original list
+        :param anchor_list: The original list
         :param content: Markdown content
 
        :returns: A list with added anchors
        """
 
         anchors = self._find_anchors(content)
         if anchors:
-            l.extend(anchors)
-        return l
+            anchor_list.extend(anchors)
+        return anchor_list
 
     def clean_tokens(self, url: str) -> str:
+        """Remove tokens from URLs."""
         token_pattern = r"(https*://)(.*)@(.*)"
         s = url
         if self.enable_clean_tokens:
@@ -989,6 +1117,7 @@ def clean_tokens(self, url: str) -> str:
         return s
 
     def _prepare_path_for_includes_map(self, path: Path) -> str:
+        """Prepare path for includes map."""
         donor_path = None
         if path.as_posix().startswith(self.working_dir.as_posix()):
             _path = path.relative_to(self.working_dir)
@@ -1006,11 +1135,162 @@ def _prepare_path_for_includes_map(self, path: Path) -> str:
         return donor_path
 
     def _exist_in_includes_map(self, includes_map: list, path: str) -> bool:
+        """Check if path exists in includes map."""
         for obj in includes_map:
             if obj["file"] == path:
                 return True
         return False
 
+    def process_includes_for_map(
+        self,
+        markdown_file_path: Path,
+        content: str,
+        recipient_md_path: str
+    ) -> None:
+        '''Process includes specifically for includes_map generation.
+        This method only collects include information without modifying the content.
+
+        :param markdown_file_path: Path to the currently processed Markdown file
+        :param content: Markdown content
+        :param recipient_md_path: Path to the file in the source directory
+        '''
+
+        self.logger.debug(f'Processing includes for map: {markdown_file_path}')
+
+        include_statement_pattern = re.compile(
+            rf'((?<!\<)\<(?:{"|".join(self.tags)})(?:\s[^\<\>]*)?\>.*?\<\/(?:{"|".join(self.tags)})\>)',
+            flags=re.DOTALL
+        )
+
+        content_parts = include_statement_pattern.split(content)
+
+        for content_part in content_parts:
+            include_statement = self.pattern.fullmatch(content_part)
+
+            if include_statement:
+                donor_md_path = None
+                donor_anchors = []
+
+                body = self._tag_body_pattern.match(include_statement.group('body').strip())
+                options = self.get_options(include_statement.group('options'))
+
+                if body and body.group('path'):
+                    if body.group('repo'):
+                        # File in Git repository
+                        repo_from_alias = self.options['aliases'].get(body.group('repo'))
+
+                        revision = None
+
+                        if repo_from_alias:
+                            if '#' in repo_from_alias:
+                                repo_url, revision = repo_from_alias.split('#', maxsplit=1)
+                            else:
+                                repo_url = repo_from_alias
+                        else:
+                            repo_url = body.group('repo')
+
+                        if body.group('revision'):
+                            revision = body.group('revision')
+
+                        # Create link to repository file
+                        include_link = self.create_full_link(repo_url, revision, body.group('path'))
+                        donor_md_path = include_link + body.group('path')
+                        donor_md_path = self.clean_tokens(donor_md_path)
+
+                        # Process include for anchors
+                        _, anchors = self._process_include_for_includes_map(
+                            included_file_path=Path('/dummy/path'),  # dummy path for repo files
+                            from_heading=body.group('from_heading'),
+                            to_heading=body.group('to_heading')
+                        )
+
+                        if self.includes_map_anchors:
+                            donor_anchors = donor_anchors + anchors
+
+                    else:
+                        # Local file
+                        included_file_path = self._get_included_file_path(body.group('path'), markdown_file_path)
+                        donor_md_path = self._prepare_path_for_includes_map(included_file_path)
+                        donor_md_path = self.clean_tokens(donor_md_path)
+
+                        # Process include for anchors (reading from source file)
+                        _, anchors = self._process_include_for_includes_map(
+                            included_file_path=included_file_path,
+                            from_heading=body.group('from_heading'),
+                            to_heading=body.group('to_heading')
+                        )
+
+                        if self.includes_map_anchors:
+                            donor_anchors = donor_anchors + anchors
+
+                else:  # if body is missing or empty
+                    if options.get('repo_url') and options.get('path'):
+                        # File in Git repository
+                        include_link = self.create_full_link(
+                            options.get('repo_url'),
+                            options.get('revision'),
+                            options.get('path')
+                        )
+                        donor_md_path = include_link + options.get('path')
+                        donor_md_path = self.clean_tokens(donor_md_path)
+
+                        # Process include for anchors
+                        _, anchors = self._process_include_for_includes_map(
+                            included_file_path=Path('/dummy/path'),  # dummy path for repo files
+                            from_heading=options.get('from_heading'),
+                            to_heading=options.get('to_heading'),
+                            from_id=options.get('from_id'),
+                            to_id=options.get('to_id'),
+                            to_end=options.get('to_end')
+                        )
+
+                        if self.includes_map_anchors:
+                            donor_anchors = donor_anchors + anchors
+
+                    elif options.get('url'):
+                        # File from URL
+                        donor_md_path = options['url']
+                        donor_md_path = self.clean_tokens(donor_md_path)
+
+                    elif options.get('src'):
+                        # Local file
+                        included_file_path = self._get_included_file_path(options.get('src'), markdown_file_path)
+                        donor_md_path = self._prepare_path_for_includes_map(included_file_path)
+                        donor_md_path = self.clean_tokens(donor_md_path)
+
+                        # Process include for anchors (reading from source file)
+                        _, anchors = self._process_include_for_includes_map(
+                            included_file_path=included_file_path,
+                            from_heading=options.get('from_heading'),
+                            to_heading=options.get('to_heading'),
+                            from_id=options.get('from_id'),
+                            to_id=options.get('to_id'),
+                            to_end=options.get('to_end')
+                        )
+
+                        if self.includes_map_anchors:
+                            donor_anchors = donor_anchors + anchors
+
+                # Add to includes_map
+                if donor_md_path and (recipient_md_path in self.chapters or "index.md" in recipient_md_path):
+                    if not self._exist_in_includes_map(self.includes_map, recipient_md_path):
+                        if not self.includes_map_anchors or len(donor_anchors) == 0:
+                            self.includes_map.append({'file': recipient_md_path, "includes": []})
+                        else:
+                            self.includes_map.append({'file': recipient_md_path, "includes": [], 'anchors': []})
+
+                    for i, f in enumerate(self.includes_map):
+                        if f['file'] == recipient_md_path:
+                            if donor_md_path not in self.includes_map[i]['includes']:
+                                self.includes_map[i]['includes'].append(donor_md_path)
+
+                            if self.includes_map_anchors:
+                                if 'anchors' not in self.includes_map[i]:
+                                    self.includes_map[i]['anchors'] = []
+                                for anchor in donor_anchors:
+                                    if anchor not in self.includes_map[i]['anchors']:
+                                        self.includes_map[i]['anchors'].append(anchor)
+
     def process_includes(
         self,
         markdown_file_path: Path,
@@ -1448,6 +1728,33 @@ def apply(self):
         source_files_extensions = self._get_source_files_extensions()
 
+        # First pass: collect includes_map for all files from source directory
+        if self.includes_map_enable:
+            self.logger.debug('First pass: collecting includes_map from source files')
+
+            # Process source directory files for includes_map
+            src_dir_path = self.project_path / self.src_dir
+            for source_files_extension in source_files_extensions:
+                for source_file_path in src_dir_path.rglob(source_files_extension):
+                    # Get relative path from src_dir
+                    rel_path = source_file_path.relative_to(src_dir_path)
+
+                    # Read content from source file
+                    with open(source_file_path, encoding='utf8') as source_file:
+                        source_content = source_file.read()
+
+                    # Determine recipient path for includes_map
+                    recipient_md_path = f'{self.src_dir}/{rel_path.as_posix()}'
+
+                    # Process includes for map collection
+                    self.process_includes_for_map(
+                        source_file_path,
+                        source_content,
+                        recipient_md_path
+                    )
+
+        # Second pass: process files in working directory
+        self.logger.debug('Second pass: processing includes in working directory')
         for source_files_extension in source_files_extensions:
             for source_file_path in self.working_dir.rglob(source_files_extension):
                 with open(source_file_path, encoding='utf8') as source_file:
@@ -1463,12 +1770,27 @@ def apply(self):
                 with open(source_file_path, 'w', encoding='utf8') as processed_file:
                     processed_file.write(processed_content)
 
-        # Write includes map
+        # Write includes map (sort data for consistent output)
         if self.includes_map_enable:
             output = f'{self.working_dir}/static/includes_map.json'
             Path(f'{self.working_dir}/static/').mkdir(parents=True, exist_ok=True)
-            with open(f'{self.working_dir}/static/includes_map.json', 'w', encoding='utf8') as f:
-                dump(self.includes_map, f)
-            self.logger.debug(f'includes_map write to {output}')
+
+            # Sort includes_map for consistent output
+            def sort_includes_map(data):
+                if isinstance(data, list):
+                    for item in data:
+                        if isinstance(item, dict):
+                            if 'includes' in item and isinstance(item['includes'], list):
+                                item['includes'].sort()
+                            if 'anchors' in item and isinstance(item['anchors'], list):
+                                item['anchors'].sort()
+                    data.sort(key=lambda x: x.get('file', ''))
+                return data
+
+            sorted_includes_map = sort_includes_map(self.includes_map)
+
+            with open(output, 'w', encoding='utf8') as f:
+                dump(sorted_includes_map, f)
+            self.logger.debug(f'includes_map written to {output}')
 
         self.logger.info('Preprocessor applied')
diff --git a/setup.py b/setup.py
index 045dd20..c757cbe 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
     description=SHORT_DESCRIPTION,
     long_description=LONG_DESCRIPTION,
     long_description_content_type='text/markdown',
-    version='1.1.20',
+    version='1.1.21',
     author='Konstantin Molchanov',
     author_email='moigagoo@live.com',
     url='https://github.com/foliant-docs/foliantcontrib.includes',
diff --git a/test/test_includes.py b/test/test_includes.py
index c52d8c7..4740262 100644
--- a/test/test_includes.py
+++ b/test/test_includes.py
@@ -350,3 +350,170 @@ def test_adjust_links_three(self):
             input_mapping=input_map,
             expected_mapping=expected_map,
         )
+
+    def test_includes_map_with_not_build_file(self):
+        '''Test includes_map generation for files with the not_build: true parameter.'''
+        self.ptf.options = {'includes_map': True}
+        input_map = {
+            'index.md': '# My title\n\n<include src="sub/sub-1.md"></include>',
+            'not_build.md': """---
+not_build: true
+---
+
+# Not built file
+
+<include src="sub/sub-2.md"></include>""",
+            'sub/sub-1.md': 'Included content 1',
+            'sub/sub-2.md': 'Included content 2'
+        }
+        expected_map = {
+            'index.md': '# My title\n\nIncluded content 1',
+            'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/sub/sub-1.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"]}]",
+            'not_build.md': """---
+not_build: true
+---
+
+# Not built file
+
+Included content 2""",
+            'sub/sub-1.md': 'Included content 1',
+            'sub/sub-2.md': 'Included content 2'
+        }
+
+        self.ptf.test_preprocessor(
+            input_mapping=input_map,
+            expected_mapping=expected_map,
+        )
+
+    def test_includes_map_with_anchors_and_not_build(self):
+        '''Test includes_map generation with anchors for files with not_build: true.'''
+        self.ptf.options = {'includes_map': {'anchors': True}}
+        input_map = {
+            'index.md': '# My title\n\n<include src="sub/sub-1.md"></include>',
+            'not_build.md': """---
+not_build: true
+---
+
+# Not built file
+
+<include src="sub/sub-2.md"></include>""",
+            'sub/sub-1.md': '# Included 1 {#anchor1}\n\nContent 1\n\n<anchor>anchor2</anchor>',
+            'sub/sub-2.md': '# Included 2 {#anchor3}\n\nContent 2\n\n<anchor>anchor4</anchor>'
+        }
+        expected_map = {
+            'index.md': '# My title\n\n# Included 1 {#anchor1}\n\nContent 1\n\n<anchor>anchor2</anchor>',
\"anchors\": [\"anchor1\", \"anchor2\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/sub/sub-2.md\"], \"anchors\": [\"anchor3\", \"anchor4\"]}]", + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +# Included 2 {#anchor3}\n\nContent 2\n\nanchor4""", + 'sub/sub-1.md': '# Included 1 {#anchor1}\n\nContent 1\n\nanchor2', + 'sub/sub-2.md': '# Included 2 {#anchor3}\n\nContent 2\n\nanchor4' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_recursive_includes_in_not_build_file(self): + '''Test recursive includes in files with not_build: true.''' + self.ptf.options = {'includes_map': True, 'recursive': True } + input_map = { + 'index.md': '# Main file\n\n', + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'level1.md': '# Level 1\n\n', + 'level2.md': '# Level 2\n\nFinal content' + } + expected_map = { + 'index.md': '# Main file\n\n# Not built file\n\n# Level 1\n\n# Level 2\n\nFinal content', + 'static/includes_map.json': "[{\"file\": \"__src__/index.md\", \"includes\": [\"__src__/not_build.md\"]}, {\"file\": \"__src__/level1.md\", \"includes\": [\"__src__/level2.md\"]}, {\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/level1.md\"]}]", + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +# Level 1\n\n# Level 2\n\nFinal content""", + 'level1.md': '# Level 1\n\n# Level 2\n\nFinal content', + 'level2.md': '# Level 2\n\nFinal content' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_with_from_to_in_not_build(self): + '''Test includes_map with from/to parameters in not_build files.''' + self.ptf.options = {'includes_map': True } + input_map = { + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +""", + 'content.md': '# Section 1\n\nContent 1\n\n# Section 2\n\nContent 2\n\n# Section 3\n\nContent 3' + } + + expected_map = { + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/content.md\"]}]\n", + 'not_build.md': """--- +not_build: true +--- + +# Not built file + +# Section 1\n\nContent 1\n""", + 'content.md': '# Section 1\n\nContent 1\n\n# Section 2\n\nContent 2\n\n# Section 3\n\nContent 3' + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + ) + + def test_includes_map_empty_file_with_not_build(self): + '''Test includes_map with empty file that has not_build: true.''' + self.ptf.options = {'includes_map': True } + working_dir = self.ptf.context["project_path"].absolute() + tmp_dir= self.ptf.context["config"]["tmp_dir"] + + input_map = { + 'not_build.md': """--- +not_build: true +--- + +# Empty not built file + +""", + } + + self.ptf.options['allow_failure'] = True + + expected_map = { + 'static/includes_map.json': "[{\"file\": \"__src__/not_build.md\", \"includes\": [\"__src__/non_existent.md\"]}]", + 'not_build.md': f"""--- +not_build: true +--- + +# Empty not built file + +The url or repo_url link is not correct, file not found: {working_dir}/{tmp_dir}/non_existent.md""", + } + + self.ptf.test_preprocessor( + input_mapping=input_map, + expected_mapping=expected_map, + )