Skip to content

Commit 08a1df8

Browse files
committed
fixing markitdown integration
1 parent 2bfe13e commit 08a1df8

File tree

6 files changed

+135
-106
lines changed

6 files changed

+135
-106
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "readium"
3-
version = "0.1.0"
3+
version = "0.1.1"
44
description = "A tool to extract and analyze documentation from repositories and directories"
55
authors = [
66
{name = "Pablo Toledo", email = "[email protected]"}

src/readium/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
from .core import ReadConfig, Readium
22

3-
__all__ = ["ReadConfig", "Readium"]
3+
__all__ = ["ReadConfig", "Readium"]

src/readium/cli.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,11 @@
44
from rich.console import Console
55
from rich.table import Table
66

7-
from .config import DEFAULT_EXCLUDE_DIRS, DEFAULT_INCLUDE_EXTENSIONS
7+
from .config import (
8+
DEFAULT_EXCLUDE_DIRS,
9+
DEFAULT_INCLUDE_EXTENSIONS,
10+
MARKITDOWN_EXTENSIONS,
11+
)
812
from .core import ReadConfig, Readium
913

1014
console = Console()
@@ -64,7 +68,9 @@ def main(
6468
include_extensions=DEFAULT_INCLUDE_EXTENSIONS | set(include_ext),
6569
target_dir=target_dir,
6670
use_markitdown=use_markitdown,
67-
markitdown_extensions=set(markitdown_ext) if markitdown_ext else None,
71+
markitdown_extensions=(
72+
set(markitdown_ext) if markitdown_ext else MARKITDOWN_EXTENSIONS
73+
),
6874
debug=debug,
6975
)
7076

src/readium/config.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -142,12 +142,14 @@
142142
".xml",
143143
}
144144

145-
# Define extensiones que requieren MarkItDown
146145
MARKITDOWN_EXTENSIONS = {
147146
".pdf",
148147
".docx",
149148
".xlsx",
149+
".xls",
150150
".pptx",
151+
".html",
152+
".htm",
151153
".msg",
152154
}
153155

src/readium/core.py

Lines changed: 23 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -5,11 +5,15 @@
55
from pathlib import Path
66
from typing import Dict, List, Optional, Set, Tuple, Union
77

8-
from markitdown import (FileConversionException, MarkItDown,
9-
UnsupportedFormatException)
8+
from markitdown import FileConversionException, MarkItDown, UnsupportedFormatException
109

11-
from .config import (DEFAULT_EXCLUDE_DIRS, DEFAULT_EXCLUDE_FILES,
12-
DEFAULT_INCLUDE_EXTENSIONS, MARKITDOWN_EXTENSIONS, ReadConfig)
10+
from .config import (
11+
DEFAULT_EXCLUDE_DIRS,
12+
DEFAULT_EXCLUDE_FILES,
13+
DEFAULT_INCLUDE_EXTENSIONS,
14+
MARKITDOWN_EXTENSIONS,
15+
ReadConfig,
16+
)
1317

1418

1519
def is_git_url(url: str) -> bool:
@@ -99,7 +103,9 @@ def should_process_file(self, file_path: Union) -> bool:
99103
parts = file_path.parts
100104
for excluded_dir in self.config.exclude_dirs:
101105
if excluded_dir in parts:
102-
self.log_debug(f"Excluding {file_path} due to being in excluded directory {excluded_dir}")
106+
self.log_debug(
107+
f"Excluding {file_path} due to being in excluded directory {excluded_dir}"
108+
)
103109
return False
104110

105111
# Check exclude patterns - handle macOS @ suffix
@@ -120,24 +126,24 @@ def should_process_file(self, file_path: Union) -> bool:
120126
except FileNotFoundError:
121127
return False
122128

123-
if self.config.use_markitdown:
124-
# If markitdown is active and extensions were specified, use only those
125-
if self.config.markitdown_extensions:
126-
if file_ext in self.config.markitdown_extensions:
127-
self.log_debug(f"Including {file_path} for markitdown processing")
128-
return True
129-
self.log_debug(
130-
f"Extension {file_ext} not in markitdown extensions: {self.config.markitdown_extensions}"
131-
)
129+
should_use_markitdown = (
130+
self.config.use_markitdown
131+
and self.config.markitdown_extensions
132+
and file_ext in self.config.markitdown_extensions
133+
)
134+
135+
if should_use_markitdown:
136+
self.log_debug(f"Including {file_path} for markitdown processing")
137+
return True
132138

133-
# If markitdown is not used or the file is not compatible with markitdown,
134-
# check if it is in the included extensions
139+
# If not using markitdown or file isn't compatible with markitdown,
140+
# check if it's in the included extensions
135141
if file_ext not in self.config.include_extensions:
136142
self.log_debug(f"Extension {file_ext} not in supported extensions")
137143
return False
138144

139145
# Check if binary only for non-markitdown files
140-
if not (self.config.use_markitdown and self.config.markitdown_extensions and file_ext in self.config.markitdown_extensions):
146+
if not should_use_markitdown:
141147
is_bin = self.is_binary(file_path)
142148
if is_bin:
143149
self.log_debug(f"Excluding {file_path} because it's binary")

0 commit comments

Comments
 (0)