Skip to content

Commit

Permalink
Add pdf2text example docos, install examples to doc directory.
Browse files Browse the repository at this point in the history
  • Loading branch information
michaelrsweet committed Dec 23, 2024
1 parent ed14212 commit fd8427d
Show file tree
Hide file tree
Showing 5 changed files with 156 additions and 11 deletions.
29 changes: 27 additions & 2 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,26 @@ TARGETS = \
$(LIBPDFIO_STATIC) \
testpdfio \
testttf
DOCFILES = \
doc/pdfio.html \
doc/pdfio-512.png \
LICENSE \
NOTICE
EXAMPLES = \
examples/Makefile \
examples/Roboto-Bold.ttf \
examples/Roboto-Italic.ttf \
examples/Roboto-Regular.ttf \
examples/RobotoMono-Regular.ttf \
examples/code128.c \
examples/code128.ttf \
examples/image2pdf.c \
examples/md2pdf.c \
examples/md2pdf.md \
examples/mmd.c \
examples/mmd.h \
examples/pdf2text.c \
examples/pdfioinfo.c


# Make everything
Expand Down Expand Up @@ -150,8 +170,13 @@ install: $(TARGETS)
$(INSTALL) -c -m 644 pdfio.pc $(BUILDROOT)$(libdir)/pkgconfig
echo Installing documentation to $(BUILDROOT)$(datadir)/doc/pdfio...
$(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio
for file in doc/pdfio.html doc/pdfio-512.png LICENSE NOTICE; do \
$(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio; \
for file in $(DOCFILES); do \
$(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio; \
done
echo Installing examples to $(BUILDROOT)$(datadir)/doc/pdfio/examples...
$(INSTALL) -d -m 755 $(BUILDROOT)$(datadir)/doc/pdfio/examples
for file in $(EXAMPLES); do \
$(INSTALL) -c -m 644 $$file $(BUILDROOT)$(datadir)/doc/pdfio/examples; \
done
echo Installing man page to $(BUILDROOT)$(mandir)/man3...
$(INSTALL) -d -m 755 $(BUILDROOT)$(mandir)/man3
Expand Down
42 changes: 39 additions & 3 deletions doc/pdfio.3
Original file line number Diff line number Diff line change
Expand Up @@ -1081,7 +1081,43 @@ The pdfioinfo.c example program opens a PDF file and prints the title, author, c
return (0);
}
.fi
.SS Create PDF File With Text and Image
.SS Extract Text from PDF File
.PP
The pdf2text.c example code extracts non\-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as it appears on the page.
.PP
The pdfioStreamGetToken function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as\-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:
.nf

pdfio_stream_t *st; // Page stream
bool first = true; // First string on line?
char buffer[1024]; // Token buffer

// Read PDF tokens from the page stream...
while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
{
if (buffer[0] == '(')
{
// Text string using an 8\-bit encoding
if (first)
first = false;
else if (buffer[1] != ' ')
putchar(' ');

fputs(buffer + 1, stdout);
}
else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
!strcmp(buffer, "\\'") || !strcmp(buffer, "\\""))
{
// Text operators that advance to the next line in the block
putchar('\\n');
first = true;
}
}

if (!first)
putchar('\\n');
.fi
.SS Create a PDF File With Text and an Image
.PP
The image2pdf.c example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The create_pdf_image_file function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:
.nf
Expand Down Expand Up @@ -2038,7 +2074,7 @@ We then loop through the fragments for the current line, drawing checkboxes, im
char targetlink[129]; // Targeted link

targetlink[0] = '#';
make_target_name(targetlink + 1, frag\->text, sizeof(targetlink) \- 1);
make_target_name(targetlink + 1, frag\->text, sizeof(targetlink) \- 1);

l\->url = pdfioStringCreate(dd\->pdf, targetlink);
}
Expand Down Expand Up @@ -2099,7 +2135,7 @@ Then it formats each cell using the format_block function described previously.

for (col = 0; col < num_cols; col ++)
{
dd|>y = row_y;
dd\->y = row_y;

format_block(dd, row\->cells[col], deffont, SIZE_TABLE, cols[col].left,
cols[col].right, /*leader*/NULL);
Expand Down
41 changes: 37 additions & 4 deletions doc/pdfio.html
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,8 @@ <h2 class="title">Contents</h2>
</ul></li>
<li><a href="#examples">Examples</a><ul class="subcontents">
<li><a href="#read-pdf-metadata">Read PDF Metadata</a></li>
<li><a href="#create-pdf-file-with-text-and-image">Create PDF File With Text and Image</a></li>
<li><a href="#extract-text-from-pdf-file">Extract Text from PDF File</a></li>
<li><a href="#create-a-pdf-file-with-text-and-an-image">Create a PDF File With Text and an Image</a></li>
<li><a href="#generate-a-code-128-barcode">Generate a Code 128 Barcode</a></li>
<li><a href="#convert-markdown-to-pdf">Convert Markdown to PDF</a></li>
</ul></li>
Expand Down Expand Up @@ -1197,7 +1198,39 @@ <h3 class="title" id="read-pdf-metadata">Read PDF Metadata</h3>
<span class="reserved">return</span> (<span class="number">0</span>);
}
</code></pre>
<h3 class="title" id="create-pdf-file-with-text-and-image">Create PDF File With Text and Image</h3>
<h3 class="title" id="extract-text-from-pdf-file">Extract Text from PDF File</h3>
<p>The <code>pdf2text.c</code> example code extracts non-Unicode text from a PDF file by scanning each page for strings and text drawing commands. Since it doesn't look at the font encoding or support Unicode text, it is really only useful to extract plain ASCII text from a PDF file. And since it writes text in the order it appears in the page stream, it may not come out in the same order as it appears on the page.</p>
<p>The <a href="#pdfioStreamGetToken"><code>pdfioStreamGetToken</code></a> function is used to read individual tokens from the page streams. Tokens starting with the open parenthesis are text strings, while PDF operators are left as-is. We use some simple logic to make sure that we include spaces between text strings and add newlines for the text operators that start a new line in a text block:</p>
<pre><code class="language-c">pdfio_stream_t *st; <span class="comment">// Page stream</span>
<span class="reserved">bool</span> first = <span class="reserved">true</span>; <span class="comment">// First string on line?</span>
<span class="reserved">char</span> buffer[<span class="number">1024</span>]; <span class="comment">// Token buffer</span>

<span class="comment">// Read PDF tokens from the page stream...</span>
<span class="reserved">while</span> (pdfioStreamGetToken(st, buffer, <span class="reserved">sizeof</span>(buffer)))
{
<span class="reserved">if</span> (buffer[<span class="number">0</span>] == <span class="string">'('</span>)
{
<span class="comment">// Text string using an 8-bit encoding</span>
<span class="reserved">if</span> (first)
first = <span class="reserved">false</span>;
<span class="reserved">else</span> <span class="reserved">if</span> (buffer[<span class="number">1</span>] != <span class="string">' '</span>)
putchar(<span class="string">' '</span>);

fputs(buffer + <span class="number">1</span>, stdout);
}
<span class="reserved">else</span> <span class="reserved">if</span> (!strcmp(buffer, <span class="string">&quot;Td&quot;</span>) || !strcmp(buffer, <span class="string">&quot;TD&quot;</span>) || !strcmp(buffer, <span class="string">&quot;T*&quot;</span>) ||
!strcmp(buffer, <span class="string">&quot;\'&quot;</span>) || !strcmp(buffer, <span class="string">&quot;\&quot;&quot;</span>))
{
<span class="comment">// Text operators that advance to the next line in the block</span>
putchar(<span class="string">'\n'</span>);
first = <span class="reserved">true</span>;
}
}

<span class="reserved">if</span> (!first)
putchar(<span class="string">'\n'</span>);
</code></pre>
<h3 class="title" id="create-a-pdf-file-with-text-and-an-image">Create a PDF File With Text and an Image</h3>
<p>The <code>image2pdf.c</code> example code creates a PDF file containing a JPEG or PNG image file and optional caption on a single page. The <code>create_pdf_image_file</code> function creates the PDF file, embeds a base font and the named JPEG or PNG image file, and then creates a page with the image centered on the page with any text centered below:</p>
<pre><code class="language-c"><span class="directive">#include &lt;pdfio.h&gt;</span>
<span class="directive">#include &lt;pdfio-content.h&gt;</span>
Expand Down Expand Up @@ -2000,7 +2033,7 @@ <h5 id="rendering-a-line-in-a-paragraph-heading-or-table-cell">Rendering a Line
<span class="reserved">char</span> targetlink[<span class="number">129</span>]; <span class="comment">// Targeted link</span>

targetlink[<span class="number">0</span>] = <span class="string">'#'</span>;
make_target_name(targetlink + <span class="number">1</span>, frag-&gt;text, <span class="reserved">sizeof</span>(targetlink) - <span class="number">1</span>);
make_target_name(targetlink + <span class="number">1</span>, frag-&gt;text, <span class="reserved">sizeof</span>(targetlink) - <span class="number">1</span>);

l-&gt;url = pdfioStringCreate(dd-&gt;pdf, targetlink);
}
Expand Down Expand Up @@ -2053,7 +2086,7 @@ <h5 id="rendering-a-table-row">Rendering a Table Row</h5>

<span class="reserved">for</span> (col = <span class="number">0</span>; col &lt; num_cols; col ++)
{
ddì&gt;y = row_y;
dd-&gt;y = row_y;

format_block(dd, row-&gt;cells[col], deffont, SIZE_TABLE, cols[col].left,
cols[col].right, <span class="comment">/*leader*/</span>NULL);
Expand Down
52 changes: 50 additions & 2 deletions doc/pdfio.md
Original file line number Diff line number Diff line change
Expand Up @@ -922,8 +922,56 @@ main(int argc, // I - Number of command-line arguments
```
Create PDF File With Text and Image
-----------------------------------
Extract Text from PDF File
--------------------------
The `pdf2text.c` example code extracts non-Unicode text from a PDF file by
scanning each page for strings and text drawing commands. Since it doesn't
look at the font encoding or support Unicode text, it is really only useful to
extract plain ASCII text from a PDF file. And since it writes text in the order
it appears in the page stream, it may not come out in the same order as it appears
on the page.
The [`pdfioStreamGetToken`](@@) function is used to read individual tokens from
the page streams. Tokens starting with the open parenthesis are text strings,
while PDF operators are left as-is. We use some simple logic to make sure that
we include spaces between text strings and add newlines for the text operators
that start a new line in a text block:
```c
pdfio_stream_t *st; // Page stream
bool first = true; // First string on line?
char buffer[1024]; // Token buffer
// Read PDF tokens from the page stream...
while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
{
if (buffer[0] == '(')
{
// Text string using an 8-bit encoding
if (first)
first = false;
else if (buffer[1] != ' ')
putchar(' ');
fputs(buffer + 1, stdout);
}
else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") ||
!strcmp(buffer, "\'") || !strcmp(buffer, "\""))
{
// Text operators that advance to the next line in the block
putchar('\n');
first = true;
}
}
if (!first)
putchar('\n');
```


Create a PDF File With Text and an Image
----------------------------------------

The `image2pdf.c` example code creates a PDF file containing a JPEG or PNG
image file and optional caption on a single page. The `create_pdf_image_file`
Expand Down
3 changes: 3 additions & 0 deletions examples/pdf2text.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,13 @@ main(int argc, // I - Number of command-line arguments
if ((st = pdfioPageOpenStream(obj, j, true)) == NULL)
continue;

// Read PDF tokens from the page stream...
first = true;
while (pdfioStreamGetToken(st, buffer, sizeof(buffer)))
{
if (buffer[0] == '(')
{
// Text string using an 8-bit encoding
if (first)
first = false;
else if (buffer[1] != ' ')
Expand All @@ -71,6 +73,7 @@ main(int argc, // I - Number of command-line arguments
}
else if (!strcmp(buffer, "Td") || !strcmp(buffer, "TD") || !strcmp(buffer, "T*") || !strcmp(buffer, "\'") || !strcmp(buffer, "\""))
{
// Text operators that advance to the next line in the block
putchar('\n');
first = true;
}
Expand Down

0 comments on commit fd8427d

Please sign in to comment.