Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

这个有点overkill了,我简化成一个脚本 #36

Open
vincentaxhe opened this issue Jun 17, 2024 · 1 comment
Open

这个有点overkill了,我简化成一个脚本 #36

vincentaxhe opened this issue Jun 17, 2024 · 1 comment

Comments

@vincentaxhe
Copy link

我用的是之前的版本:当尝试建立下级目录时,需要想出精巧的正则表达式,这就有点 overkill 了。我想用 tab 来指示层级,不知道现在的版本是不是还会忽略行首的 tab。

#!/bin/python
import os
import re
import sys
from collections import defaultdict
from pypdf import PdfWriter, PdfReader


class Pdf(object):
    """Wrap a PDF so that a new outline (bookmark tree) can be written to a copy.

    The source document is appended to a ``PdfWriter``, the ``/Outlines``
    entry is removed from the writer's root object, and the result is saved
    next to the original as ``<name>_new<ext>``.
    """

    def __init__(self, path):
        """Load the PDF at ``path`` into a writer and drop its ``/Outlines`` entry.

        Args:
            path: Filesystem path of the source PDF.
        """
        self.path = path
        # Pass the path itself instead of an ``open()`` handle that nobody
        # closes; the reader is then responsible for the file it opens.
        reader = PdfReader(path, strict=False)
        self.writer = PdfWriter()
        self.writer.append(reader)
        # Remove the existing outline so the new bookmarks replace it.
        self.writer._root_object.pop("/Outlines", None)

    @property
    def _new_path(self):
        """Return the output path: the source path with ``_new`` before the extension."""
        name, ext = os.path.splitext(self.path)
        return name + '_new' + ext

    def add_bookmark(self, title, pagenum, parent=None):
        """Add one outline item and return whatever the writer returns for it.

        Args:
            title: Text of the outline item.
            pagenum: Page index handed straight to ``add_outline_item``.
            parent: Return value of an earlier call to nest under, or ``None``
                for a top-level item.
        """
        return self.writer.add_outline_item(title, pagenum, parent=parent)

    def save_pdf(self):
        """Write the document to ``_new_path``, replacing any existing file.

        Returns:
            The path that was written.
        """
        if os.path.exists(self._new_path):
            os.remove(self._new_path)
        with open(self._new_path, 'wb') as out:
            self.writer.write(out)
        return self._new_path

def _add_bookmark(pdf, index_dict):
    if not index_dict:
        return None
    m = max(index_dict.keys())
    parent_dict = {}  # {parent index:IndirectObject}
    for i in range(m+1):
        value = index_dict[i]
        inobject = pdf.add_bookmark(value['title'], 
                                    value['pagenum'] - 1, 
                                    parent_dict.get(value.get('parent')))
        parent_dict[i] = inobject

def add_bookmark(path, index_dict):
    """Write a copy of the PDF at ``path`` carrying the bookmarks in ``index_dict``.

    Args:
        path: Path of the source PDF.
        index_dict: Entry mapping in the form ``_add_bookmark`` consumes.

    Returns:
        The value returned by ``Pdf.save_pdf()`` (the path of the saved copy).
    """
    document = Pdf(path)
    _add_bookmark(document, index_dict)
    output_path = document.save_pdf()
    return output_path

def toc_reader(path, gap):
    """Parse a tab-indented table-of-contents file into an entry dict.

    Every non-blank line must have the form ``<tabs><title><TAB><page>``:
    zero or more leading tabs give the nesting depth, the title contains no
    tab, and the page is a run of digits. Blank lines are skipped.

    Args:
        path: Path of the TOC text file.
        gap: Offset added to every page number; anything ``int()`` accepts.

    Returns:
        A dict mapping the entry index (0-based, in file order) to
        ``{'title': str, 'pagenum': int}``, plus a ``'parent'`` key holding
        the index of the most recent entry one level up for indented entries.

    Raises:
        AssertionError: If a line does not match the expected form, a page
            number is lower than the previous one, or an indented line has
            no entry one level above it.
    """
    pattern = re.compile(r'^(\t*)([^\t]+)\t(\d+)$')
    offset = int(gap)  # loop-invariant, convert once
    tocdict = {}
    levels = defaultdict(list)  # {depth: indices of the entries at that depth}
    lastpagenum = 0
    with open(path, 'r') as toc:
        for line, item in enumerate(toc):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not item.strip():
                continue
            content = pattern.search(item)
            # Raise explicitly: a bare ``assert`` is stripped under ``-O``.
            if content is None:
                raise AssertionError(f"line {line}:{item} line ill-formatted")
            indent, title, pagenum = content.group(1, 2, 3)
            depth = len(indent)
            pagenum = int(pagenum) + offset
            if pagenum < lastpagenum:
                raise AssertionError(f"line {line}:{item} pagenum wrong")
            entry = {'title': title, 'pagenum': pagenum}
            if depth > 0:
                candidates = levels.get(depth - 1)
                if not candidates:
                    raise AssertionError(
                        f"line {line}:{item} has no parent entry one level up")
                entry['parent'] = candidates[-1]
            # Number entries by position among parsed entries so the keys stay
            # contiguous even when blank lines were skipped.
            index = len(tocdict)
            tocdict[index] = entry
            levels[depth].append(index)
            lastpagenum = pagenum
    return tocdict
if __name__ == '__main__':
    # Expect exactly three arguments: the PDF, the TOC file and the page
    # offset. Exit with a usage line instead of an unpacking ValueError.
    if len(sys.argv) != 4:
        sys.exit(f"usage: {sys.argv[0]} <file.pdf> <toc file> <page offset>")
    file, toc, gap = sys.argv[1:]
    index_dict = toc_reader(toc, gap)
    add_bookmark(file, index_dict)

运行方式:`pdfbookmark.py xxx.pdf toc 10`。其中 toc 是目录文件,每行的格式为“标题<TAB>页码”,用行首的 tab 缩进来分级;10 是加到每个页码上的偏移量。

@vincentaxhe vincentaxhe changed the title 这个有点overkill了,我简化了脚本 这个有点overkill了,我简化成一个脚本 Jun 17, 2024
@chroming
Copy link
Owner

新版本支持空格分层了,不过脚本写的挺好的

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants