@@ -51,6 +51,46 @@ def _build_parser() -> argparse.ArgumentParser:
5151 action = "store_true" ,
5252 help = "package.zip 생성을 비활성화합니다." ,
5353 )
54+
55+ preprocess_parser = subparsers .add_parser (
56+ "preprocess" ,
57+ help = "PDF를 LLM 전달용으로 전처리합니다 (텍스트 추출 또는 이미지 압축)." ,
58+ )
59+ preprocess_parser .add_argument (
60+ "--input" ,
61+ required = True ,
62+ type = Path ,
63+ help = "입력 PDF 파일 경로" ,
64+ )
65+ preprocess_parser .add_argument (
66+ "--out" ,
67+ type = Path ,
68+ default = Path ("out/prep" ),
69+ help = "출력 디렉터리 (기본: out/prep)" ,
70+ )
71+ preprocess_parser .add_argument (
72+ "--method" ,
73+ choices = ["auto" , "mixed" , "text" , "images" , "markdown" ],
74+ default = "auto" ,
75+ help = (
76+ "처리 방식: auto=자동감지(기본), mixed=페이지별 수식 감지,"
77+ " text=텍스트 강제, images=이미지 강제,"
78+ " markdown=AI 수식 인식 후 LaTeX Markdown 출력 (marker-pdf 필요)"
79+ ),
80+ )
81+ preprocess_parser .add_argument (
82+ "--dpi" ,
83+ type = int ,
84+ default = 150 ,
85+ help = "이미지 모드 해상도 (기본: 150)" ,
86+ )
87+ preprocess_parser .add_argument (
88+ "--quality" ,
89+ type = int ,
90+ default = 72 ,
91+ help = "JPEG 압축 품질 0-100 (기본: 72)" ,
92+ )
93+
5494 return parser
5595
5696
@@ -126,6 +166,96 @@ def _run_build(input_path: Path, out_dir: Path, title: str | None, no_zip: bool)
126166 return 0
127167
128168
# Display labels for the ``reason`` code attached to each page decision.
# ``_run_preprocess`` looks a reason up here when printing the per-page table
# and falls back to the raw code when it has no entry.
_REASON_LABEL: dict[str, str] = {
    "math_font": "수식 폰트",
    "low_quality": "텍스트 품질 낮음",
    "no_text": "텍스트 없음",
    "text_ok": "텍스트",
    "forced_text": "텍스트(강제)",
    "forced_images": "이미지(강제)",
}
177+
178+
def _run_preprocess(
    pdf_path: Path,
    out_dir: Path,
    method: str,
    dpi: int,
    quality: int,
) -> int:
    """Run the ``preprocess`` sub-command and print a summary of its output.

    Calls ``testmagick.preprocess.preprocess_pdf`` and then reports, on
    stdout: the method that was used, the per-page text/image decisions
    (mixed mode only), the size of every produced file, a token estimate,
    and - when image pages were produced - an approximate token saving
    relative to a 300 dpi rendering.

    Args:
        pdf_path: Input PDF file.
        out_dir: Directory the preprocessing output is written to.
        method: Processing method name as accepted by ``--method``.
        dpi: Rendering resolution passed through to ``preprocess_pdf``.
        quality: JPEG quality passed through to ``preprocess_pdf``.

    Returns:
        ``0`` on success, ``1`` when importing or running the preprocessor
        raised an exception (the error is printed, not re-raised).
    """
    try:
        # The import is lazy and lives inside the ``try`` on purpose: an
        # ImportError raised while loading the module must be reported the
        # same way as one raised by the call itself, not as a traceback.
        from testmagick.preprocess import preprocess_pdf

        result = preprocess_pdf(
            pdf_path,
            out_dir,
            method=method,  # type: ignore[arg-type]
            dpi=dpi,
            quality=quality,
        )
    except ImportError as exc:
        # The exception text is printed as-is, without an extra prefix.
        print(f"{_err_tag()} {exc} ")
        return 1
    except Exception as exc:
        # Top-level CLI boundary: turn any failure into exit code 1.
        print(f"{_err_tag()} PDF 전처리 실패: {exc} ")
        return 1

    method_labels = {
        "text": "텍스트 추출",
        "images": f"이미지 압축 ({dpi} dpi · JPEG {quality} %)",
        "mixed": f"mixed (페이지별 수식 감지 · 이미지 {dpi} dpi)",
        "markdown": "AI 수식 인식 → LaTeX Markdown (marker-pdf)",
    }
    # Unknown method names are shown verbatim.
    method_label = method_labels.get(result.method, result.method)

    img_pages = sum(1 for d in result.page_decisions if d.use_image)
    txt_pages = result.page_count - img_pages

    print(f"{_ok_tag()} 전처리 완료 [{method_label} ] — {result.page_count} 페이지")
    print()

    # Per-page decision table (mixed mode only).
    if result.method == "mixed":
        for d in result.page_decisions:
            bar = "▓" if not d.use_image else "░"
            kind = _REASON_LABEL.get(d.reason, d.reason)
            # The score is omitted for pages reported as having no text.
            score_str = f" score={d.quality_score:.2f} " if d.reason != "no_text" else ""
            print(f" p{d.page_num:>2} {bar} {kind} {score_str} ")
        print()
        print(f" 텍스트 페이지: {txt_pages} / 이미지 페이지: {img_pages} ")
        print()

    # Output file list.
    for out_file in result.all_files:
        size_kb = out_file.stat().st_size / 1024
        print(f" {_path_label(out_file.name):<26} {size_kb:>7.1f} KB")
    schema_kb = result.schema_ref.stat().st_size / 1024
    print(f" {_path_label('schema_ref.md'):<26} {schema_kb:>7.1f} KB")
    print(f" {_path_label('prompt.md')} ")
    print()

    # Token estimate: the schema file is costed at 3.5 bytes per token.
    schema_tokens = int(schema_kb * 1024 / 3.5)
    total_tokens = result.est_tokens + schema_tokens
    print(f" {'추정 토큰 (본문)':<20} ~{result.est_tokens:>6,} ")
    print(f" {'스키마 참조':<20} ~{schema_tokens:>6,} ")
    print(f" {'합계':<20} ~{total_tokens:>6,} ")

    if result.method in ("images", "mixed") and img_pages > 0:
        import math

        # Baseline: a 1240x1754 px page at 150 dpi (presumably A4 - confirm)
        # scaled to 300 dpi, costed at 170 tokens per 512 px tile.
        orig_w = int((1240 / 150) * 300)
        orig_h = int((1754 / 150) * 300)
        orig_tiles = math.ceil(orig_w / 512) * math.ceil(orig_h / 512)
        # Always positive here because img_pages > 0, so no zero-guard is
        # needed before dividing.
        orig_img_tokens = orig_tiles * 170 * img_pages
        cur_img_tokens = result.est_tokens  # approximation
        saved_pct = max(0.0, (1 - cur_img_tokens / orig_img_tokens) * 100)
        print(f"\n 이미지 {img_pages} 페이지 기준, 300dpi 대비 ~{saved_pct:.0f} % 토큰 절감")

    print(f"\n 출력 경로: {_path_label(str(out_dir.resolve()))} ")
    return 0
257+
258+
129259def main (argv : list [str ] | None = None ) -> int :
130260 parser = _build_parser ()
131261 args = parser .parse_args (argv )
@@ -139,6 +269,14 @@ def main(argv: list[str] | None = None) -> int:
139269 title = args .title ,
140270 no_zip = args .no_zip ,
141271 )
272+ if args .command == "preprocess" :
273+ return _run_preprocess (
274+ pdf_path = args .input ,
275+ out_dir = args .out ,
276+ method = args .method ,
277+ dpi = args .dpi ,
278+ quality = args .quality ,
279+ )
142280
143281 parser .print_help ()
144282 return 1
0 commit comments