Skip to content

Commit

Permalink
docs: Move examples to Discussions
Browse files Browse the repository at this point in the history
  • Loading branch information
adrienjoly committed Mar 25, 2022
1 parent 4afea2c commit 7dcf2e3
Showing 1 changed file with 19 additions and 140 deletions.
159 changes: 19 additions & 140 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,20 @@ new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
});
```

### Raw PDF reading from a PDF already in memory (buffer)
### Parsing a password-protected PDF file

```javascript
// Opens a PDF with the `password` option and prints each text item it contains.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  (err, item) => {
    if (err) {
      console.error(err);
      return;
    }
    if (!item) {
      // The callback receives no item once the whole file has been read.
      console.warn("end of file");
      return;
    }
    if (item.text) {
      console.log(item.text);
    }
  }
);
```

### Raw PDF reading from a PDF buffer

As above, but reading from a buffer in memory rather than from a file referenced by path. For example:

Expand All @@ -72,155 +85,21 @@ fs.readFile("test/sample.pdf", (err, pdfBuffer) => {
});
```

### Example: reading from a buffer of an online PDF

```javascript
/**
 * Downloads the resource at `url` with `https.get` and resolves with its body.
 *
 * @param {string|URL} url - Address handed as-is to `https.get`.
 * @returns {Promise<Buffer>} Resolves with the concatenation of every chunk
 *   received; rejects when the request or the response stream emits "error".
 */
const get = (url) =>
  new Promise((resolve, reject) =>
    https
      .get(url, (res) => {
        const data = [];
        res
          .on("data", (chunk) => data.push(chunk))
          .on("end", () => resolve(Buffer.concat(data)))
          // An "error" event without a listener is thrown by Node instead of
          // settling this promise, so forward response failures to `reject`.
          .on("error", reject);
      })
      .on("error", reject)
  );

/**
 * Adds a text item to the lines collected for a page.
 *
 * When a line with the same `y` is already present, the item's text is appended
 * to that line (separated by one space); otherwise the item itself becomes a
 * new line. `textLines` and the matching line object are modified in place.
 *
 * @param {Array<{y: number, text: string}>} textLines - Lines collected so far.
 * @param {{y: number, text: string}} item - Text item to add.
 */
function addTextToLines(textLines, item) {
  for (const line of textLines) {
    if (line.y === item.y) {
      line.text += " " + item.text;
      return;
    }
  }
  textLines.push(item);
}

/**
 * Parses a PDF held in memory and groups its text into lines, page by page.
 *
 * @param {Buffer} buffer - PDF content handed to `PdfReader#parseBuffer`.
 * @returns {Promise<string[][]>} One array per page, each holding the text of
 *   that page's lines; rejects with the error reported by the reader.
 */
const parseLinesPerPage = (buffer) =>
  new Promise((resolve, reject) => {
    const pages = [];
    let currentPage = 0;

    const onItem = (err, item) => {
      if (err) {
        reject(err);
        return;
      }
      if (!item) {
        // No item: the reader is done, so keep only the text of each line.
        resolve(pages.map((lines) => lines.map(({ text }) => text)));
        return;
      }
      if (item.page) {
        // Page numbers start at 1; store pages at zero-based indexes.
        currentPage = item.page - 1;
        pages[currentPage] = [];
        return;
      }
      if (item.text) {
        addTextToLines(pages[currentPage], item);
      }
    };

    new PdfReader().parseBuffer(buffer, onItem);
  });

// Sample PDF downloaded by this example.
const url = new URL(
  "https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/master/test/sample.pdf"
);

// Download the PDF, parse it, then print its lines page by page.
// The chain is not stored in a variable: it resolves to nothing useful, and a
// failing download or parse is reported instead of being left unhandled.
get(url)
  .then((buffer) => parseLinesPerPage(buffer))
  .then((linesPerPage) => console.log(linesPerPage))
  .catch((err) => console.error(err));
```

### Example: parsing lines of text from a PDF file
### Other examples of use

![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)

Here is the code required to convert this PDF file into text:

```js
const { PdfReader } = require("pdfreader");

// Text fragments collected for the current page, keyed by their y-position.
let rows = {};

/**
 * Prints every accumulated row in ascending y-position order, one line per
 * row, then resets `rows` so the next page starts from an empty state.
 */
function flushRows() {
  // Object keys are strings, so compare them as floats rather than as text.
  const byPosition = (y1, y2) => parseFloat(y1) - parseFloat(y2);
  const positions = Object.keys(rows).sort(byPosition);
  for (const y of positions) {
    const fragments = rows[y] || [];
    console.log(fragments.join(""));
  }
  rows = {};
}

// Reads the sample PDF and prints its text row by row, one page at a time.
new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
  if (err) {
    console.error({ err });
    return;
  }
  if (!item) {
    // No item: the file has been fully read, so print the last page's rows.
    flushRows();
    console.log("END OF FILE");
    return;
  }
  if (item.page) {
    // A new page starts: print the rows gathered for the previous one.
    flushRows();
    console.log("PAGE:", item.page);
    return;
  }
  if (item.text) {
    // Group text fragments that share the same y-position into one row.
    const row = rows[item.y] || (rows[item.y] = []);
    row.push(item.text);
  }
});
```

Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).

### Example: parsing a table from a PDF file

![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)

Here is the code required to convert this PDF file into a textual table:

```js
var pdfreader = require("pdfreader");

// Number of columns rendered for each row of the table.
const nbCols = 2;
// Width of a rendered cell: each cell is padded to fit 40 characters.
const cellPadding = 40;
// Tells whether an item's x-position, read as a float, is at least 20.
const columnQuantitizer = (item) => {
  const x = parseFloat(item.x);
  return x >= 20;
};

/**
 * Returns an array of exactly `nb` columns: column `i` is `array[i]`, or a new
 * empty array when that entry is missing or falsy.
 */
const padColumns = (array, nb) =>
  // Array.from visits every index, including those absent from `array`.
  Array.from({ length: nb }, (_, i) => array[i] || []);

/**
 * Renders the items of one table cell as a fixed-width string.
 *
 * The `text` of every item is concatenated, then the result is cut or
 * right-padded with spaces so that it is exactly `cellPadding` characters long.
 *
 * @param {Array<{text: string}>} [cells] - Items of the cell; a missing value
 *   is rendered as an empty (all-spaces) cell.
 * @returns {string} Cell content, `cellPadding` characters wide.
 */
const mergeCells = (cells) =>
  (cells || [])
    .map((cell) => cell.text)
    .join("") // merge cells
    .slice(0, cellPadding) // truncate; replaces the deprecated substr()
    .padEnd(cellPadding, " "); // padding

/**
 * Renders a matrix of cells as text: one line per row, with the row's
 * `nbCols` cells separated by " | ".
 *
 * @param {Array} [matrix] - Rows of cells; a missing value renders as "".
 * @returns {string} Rows joined by line breaks.
 */
const renderMatrix = (matrix) => {
  const renderRow = (row) =>
    padColumns(row, nbCols).map(mergeCells).join(" | ");
  return (matrix || []).map((row) => renderRow(row)).join("\n");
};

// Table holding the items of the page currently being read.
let table = new pdfreader.TableParser();

// Reads the PDF and prints each page as a textual table.
new pdfreader.PdfReader().parseFileItems(filename, function (err, item) {
  if (err) {
    // Report the failure rather than rendering as if the file had ended.
    console.error(err);
  } else if (!item || item.page) {
    // end of file, or page
    console.log(renderMatrix(table.getMatrix()));
    // `item` is falsy at end of file: only read `item.page` when it exists.
    if (item) console.log("PAGE:", item.page);
    table = new pdfreader.TableParser(); // new/clear table for next page
  } else if (item.text) {
    // accumulate text items into the table, column chosen by columnQuantitizer
    table.processItem(item, columnQuantitizer(item));
  }
});
```
Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).

Fork this example from [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).

## Example: opening a PDF file with a password

```javascript
// Opens a PDF with the `password` option and prints each text item it contains.
new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
  "test/sample-with-password.pdf",
  (err, item) => {
    if (err) {
      console.error(err);
      return;
    }
    if (!item) {
      // The callback receives no item once the whole file has been read.
      console.warn("end of file");
      return;
    }
    if (item.text) {
      console.log(item.text);
    }
  }
);
```
For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use).

## Rule-based data extraction

The Rule class can be used to define and process data extraction rules, while parsing a PDF document.
The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document.

Rule instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.
`Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.

Example:

Expand Down

0 comments on commit 7dcf2e3

Please sign in to comment.