Skip to content

Commit

Permalink
build: restore default export, also minify the output bundles (#341)
Browse files Browse the repository at this point in the history
* build: fix issues of v3.1.0, bring back the default export of PDFParser to avoid breaking changes

* build: minify the output bundles

* build: keep test as jest tests
  • Loading branch information
modesty authored May 1, 2024
1 parent e37a56e commit 3c703fe
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 150 deletions.
2 changes: 1 addition & 1 deletion .npmignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@
node_modules/
target/
.gitignore
test
test/
.vscode/

10 changes: 3 additions & 7 deletions lib/p2jcmd.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@ import nodeUtil from "util";
import fs from "fs";
import path from "path";

import {
PDFParser,
ParserStream,
StringifyStream,
pkInfo,
_PARSER_SIG as _PRO_TIMER,
} from "../dist/pdfparser.js";
import PDFParser from "../dist/pdfparser.js";

const { ParserStream, StringifyStream, pkInfo, _PARSER_SIG: _PRO_TIMER } = PDFParser;

import { yargs } from "./p2jcmdarg.js";

Expand Down
3 changes: 2 additions & 1 deletion lib/p2jcmdarg.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { pkInfo, _PARSER_SIG as _PRO_TIMER } from "../dist/pdfparser.js";
import PDFParser from "../dist/pdfparser.js";
const { pkInfo, _PARSER_SIG: _PRO_TIMER } = PDFParser;

class CLIArgParser {
args = [];
Expand Down
236 changes: 165 additions & 71 deletions package-lock.json

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "pdf2json",
"version": "3.1.0",
"version": "3.1.1",
"description": "PDF file parser that converts PDF binaries to text based JSON, powered by porting a fork of PDF.JS to Node.js",
"keywords": [
"pdf",
Expand Down Expand Up @@ -33,6 +33,7 @@
"typings": "./pdfparser.d.ts",
"scripts": {
"pretest": "npm run build",
"test:jest": "jest --config ./jest.config.json",
"test": "jest --config ./jest.config.json",
"test:forms": "cd ./test && sh p2j.forms.sh",
"test:misc": "cd ./test && sh p2j.one.sh misc . \"Expected: 7 success, 3 fail exception with stack trace\" ",
Expand Down Expand Up @@ -69,7 +70,8 @@
"@rollup/plugin-commonjs": "^25.0.4",
"@rollup/plugin-node-resolve": "^15.2.1",
"@rollup/plugin-replace": "^5.0.2",
"rollup": "^4.17.0",
"@rollup/plugin-terser": "^0.4.4",
"rollup": "^4.17.2",
"rollup-plugin-node-builtins": "^2.1.2",
"rollup-plugin-inject": "^3.0.2",
"rollup-plugin-sourcemaps": "^0.6.3",
Expand Down
50 changes: 38 additions & 12 deletions pdfparser.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,13 @@ import PDFJS from "./lib/pdf.js";
import { ParserStream, StringifyStream } from "./lib/parserstream.js";
import { kColors, kFontFaces, kFontStyles } from "./lib/pdfconst.js";
import { pkInfo, _PARSER_SIG } from "./lib/pkinfo.js";
import PDFUnit from "./lib/pdfunit.js";

/**
* Class representing a PDF Parser.
* @extends EventEmitter
*/
class PDFParser extends EventEmitter {
export default class PDFParser extends EventEmitter {
/**
* Static method to retrieve color dictionary.
* @returns {object} Color dictionary
Expand All @@ -37,6 +38,42 @@ class PDFParser extends EventEmitter {
return kFontStyles;
}

/**
* static property to expose PDFUnit class
* @returns {PDFUnit} PDFUnit class
*/
static get PDFUnit() {
return PDFUnit;
}

/**
* static property to expose ParserStream class
*/
static get ParserStream() {
return ParserStream;
}

/**
* static property to expose StringifyStream class
*/
static get StringifyStream() {
return StringifyStream;
}

/**
* static property to expose pkInfo function
*/
static get pkInfo() {
return pkInfo;
}

/**
* static property to expose _PARSER_SIG function
*/
static get _PARSER_SIG() {
return _PARSER_SIG;
}

static #maxBinBufferCount = 10;
static #binBuffer = {};

Expand Down Expand Up @@ -268,14 +305,3 @@ class PDFParser extends EventEmitter {
this.#PDFJS = null;
}
}

export {
PDFParser,
ParserStream,
StringifyStream,
kColors,
kFontFaces,
kFontStyles,
pkInfo,
_PARSER_SIG,
};
74 changes: 39 additions & 35 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,22 @@ To Run in RESTful Web Service or as command line Utility

After install, run command line:

> npm test
> npm run test:jest
It'll build bundles and source maps for both ES Module and CommonJS, output them to the `./dist` directory, and run the Jest test suite defined in `./test/_test_.cjs`.

The default test suits are eseential tests for all PRs. But it only covers a portion of all tesing PDFs, for more broader converage, run:
The default test suites are essential tests for all PRs, but they only cover a portion of all testing PDFs. For broader coverage, run:

> npm run test:forms
It'll scan and parse _260_ PDF AcroForm files under _*./test/pdf*_, run them with the _*-s -t -c -m*_ command line options, and generate primary output JSON, additional text content JSON, form fields JSON and a merged text file for each PDF. It usually takes ~20s on my MacBook Pro to complete; check _*./test/target/*_ for outputs.

_update on 4/27/2024_: parsing 260 PDFs by `npm run test:forms` on M2 Mac takes 7~8s

To run the full test suite:

> npm test
### Test Exception Handlings

After install, run command line:
Expand Down Expand Up @@ -80,7 +84,7 @@ To disable the first type, you could mock the console.log and console.warn APIs,

```javascript
import fs from "fs";
import { PDFParser } from "pdf2json"; // starting v3.1.0, PDFParser is no longer the default export
import PDFParser from "pdf2json";

const pdfParser = new PDFParser();

Expand Down Expand Up @@ -122,7 +126,7 @@ pdfParser.on("error", (err) => console.error("Parser Error", err));

```javascript
import fs from "fs";
import { PDFParser } from "pdf2json"; // starting v3.1.0, PDFParser is no longer the default export
import PDFParser from "pdf2json";

const pdfParser = new PDFParser(this, 1);

Expand All @@ -146,7 +150,7 @@ pdfParser.loadPDF("./pdf2json/test/pdf/fd/form/F1040EZ.pdf");

```javascript
import fs from "fs";
import { PDFParser } from "pdf2json"; // starting v3.1.0, PDFParser is no longer the default export
import PDFParser from "pdf2json";

const pdfParser = new PDFParser();

Expand All @@ -170,7 +174,7 @@ Alternatively, you can pipe input and output streams: (requires v1.1.4)

```javascript
import fs from "fs";
import { PDFParser } from "pdf2json"; // starting v3.1.0, no default export of PDFParser
import PDFParser from "pdf2json";

const inputStream = fs.createReadStream(
"./pdf2json/test/pdf/fd/form/F1040EZ.pdf",
Expand Down Expand Up @@ -248,8 +252,8 @@ See [p2jcmd.js](https://github.com/modesty/pdf2json/blob/master/lib/p2jcmd.js) f
- alternative events: (v2.0.0)

- readable: first event dispatched after PDF file metadata is parsed and before processing any page
- data: one parsed page succeeded, null means last page has been processed, signle end of data stream
- error: exception or error occured
- data: one parsed page succeeded; null means the last page has been processed, signaling the end of the data stream
- error: exception or error occurred

- start to parse PDF file from specified file path asynchronously:

Expand Down Expand Up @@ -293,33 +297,33 @@ Current parsed data has four main sub objects to describe the PDF document.
- Parent: parent name, default "unknown";
- _*v2.0.0*_: 'Agency' and 'Id' are replaced with full metadata, example: for `./test/pdf/fd/form/F1040.pdf`, full metadata is:

```json
Meta: {
PDFFormatVersion: '1.7',
IsAcroFormPresent: true,
IsXFAPresent: false,
Author: 'SE:W:CAR:MP',
Subject: 'U.S. Individual Income Tax Return',
Creator: 'Adobe Acrobat Pro 10.1.8',
Producer: 'Adobe Acrobat Pro 10.1.8',
CreationDate: "D:20131203133943-08'00'",
ModDate: "D:20140131180702-08'00'",
Metadata: {
'xmp:modifydate': '2014-01-31T18:07:02-08:00',
'xmp:createdate': '2013-12-03T13:39:43-08:00',
'xmp:metadatadate': '2014-01-31T18:07:02-08:00',
'xmp:creatortool': 'Adobe Acrobat Pro 10.1.8',
'dc:format': 'application/pdf',
'dc:description': 'U.S. Individual Income Tax Return',
'dc:creator': 'SE:W:CAR:MP',
'xmpmm:documentid': 'uuid:4d81e082-7ef2-4df7-b07b-8190e5d3eadf',
'xmpmm:instanceid': 'uuid:7ea96d1c-3d2f-284a-a469-f0f284a093de',
'pdf:producer': 'Adobe Acrobat Pro 10.1.8',
'adhocwf:state': '1',
'adhocwf:version': '1.1'
}
}
```
```javascript
Meta: {
PDFFormatVersion: '1.7',
IsAcroFormPresent: true,
IsXFAPresent: false,
Author: 'SE:W:CAR:MP',
Subject: 'U.S. Individual Income Tax Return',
Creator: 'Adobe Acrobat Pro 10.1.8',
Producer: 'Adobe Acrobat Pro 10.1.8',
CreationDate: "D:20131203133943-08'00'",
ModDate: "D:20140131180702-08'00'",
Metadata: {
'xmp:modifydate': '2014-01-31T18:07:02-08:00',
'xmp:createdate': '2013-12-03T13:39:43-08:00',
'xmp:metadatadate': '2014-01-31T18:07:02-08:00',
'xmp:creatortool': 'Adobe Acrobat Pro 10.1.8',
'dc:format': 'application/pdf',
'dc:description': 'U.S. Individual Income Tax Return',
'dc:creator': 'SE:W:CAR:MP',
'xmpmm:documentid': 'uuid:4d81e082-7ef2-4df7-b07b-8190e5d3eadf',
'xmpmm:instanceid': 'uuid:7ea96d1c-3d2f-284a-a469-f0f284a093de',
'pdf:producer': 'Adobe Acrobat Pro 10.1.8',
'adhocwf:state': '1',
'adhocwf:version': '1.1'
}
}
```

- 'Pages': array of 'Page' object that describes each page in the PDF, including sizes, lines, fills and texts within the page. More info about 'Page' object can be found at 'Page Object Reference' section
- 'Width': the PDF page width in page unit
Expand Down
38 changes: 18 additions & 20 deletions rollup.config.js
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import path from "path";
import nodeResolve from "@rollup/plugin-node-resolve";
import builtins from "rollup-plugin-node-builtins";
import path from "path";
import inject from "rollup-plugin-inject";
import terser from "@rollup/plugin-terser";
import sourcemaps from "rollup-plugin-sourcemaps";

const external = [
"fs",
"util",
"fs/promises",
"events",
"path",
"url",
"buffer",
"stream",
"@xmldom/xmldom",
];

export default [
{
input: "./pdfparser.js",
external: [
"fs",
"util",
"fs/promises",
"events",
"path",
"url",
"buffer",
"stream",
"@xmldom/xmldom",
],
external,
output: [
{
file: "dist/pdfparser.cjs",
Expand All @@ -41,15 +44,10 @@ export default [
path.resolve("lib/pdfcanvas.js"),
"createScratchCanvas",
],
PDFAnno: [
path.resolve("lib/pdfanno.js"),
"PDFAnno",
],
Image: [
path.resolve("lib/pdfimage.js"),
"Image",
],
PDFAnno: [path.resolve("lib/pdfanno.js"), "PDFAnno"],
Image: [path.resolve("lib/pdfimage.js"), "Image"],
}),
terser(),
sourcemaps(),
],
},
Expand Down
2 changes: 1 addition & 1 deletion test/_test_.cjs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
const assert = require("assert");
const fs = require("fs");

const { PDFParser } = require("../dist/pdfparser.cjs");
const PDFParser = require("../dist/pdfparser.cjs");

function pdfParserRunner(fileName, fromBuffer) {
const pdfParser = new PDFParser();
Expand Down

0 comments on commit 3c703fe

Please sign in to comment.