Skip to content

Commit 6d6dd05

Browse files
authored
Merge pull request #60 from ffalt/feature/provide-text-colors
Add option to extract font color for text items
2 parents 0d60b16 + f669860 commit 6d6dd05

197 files changed

Lines changed: 13051 additions & 11517 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

README.md

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ export interface PDFExtractOptions {
3232
disableCombineTextItems?: boolean; // default:`false` - do not attempt to combine same line {@link TextItem}'s.
3333
includeAttachments?: boolean; // include attachments as base64. The default value is `false`.
3434
includeImages?: boolean; // include images as base64. The default value is `false`.
35+
includeColors?: boolean; // default:`false` - include font fill color (best effort, possibly incomplete).
3536
}
3637
```
3738

@@ -281,8 +282,8 @@ interface PDFExtractImage {
281282
"horizontalCornerRadius": 0,
282283
"verticalCornerRadius": 0
283284
},
284-
"color": [0, 0, 0],
285-
"borderColor": [0, 0, 0],
285+
"color": "#000000",
286+
"borderColor": "#000000",
286287
"rotation": 0,
287288
"contentsObj": {
288289
"str": "",
@@ -312,7 +313,17 @@ interface PDFExtractImage {
312313
"dir": "ltr",
313314
"width": 64.656,
314315
"height": 12,
315-
"fontName": "Times"
316+
"transform": [12, 0, 0, 12, 70, 50],
317+
"font": {
318+
"size": 12,
319+
"name": "TimesNewRomanPSMT",
320+
"color": "#000000",
321+
"family": "serif",
322+
"vertical": false,
323+
"ascent": 0.891,
324+
"descent": -0.216
325+
},
326+
"hasEOL": false
316327
}
317328
],
318329
"images": [
@@ -340,3 +351,14 @@ interface PDFExtractImage {
340351
```
341352

342353
Note: The `images` and `attachments` arrays are optional and only included when they are detected in the PDF.
354+
355+
## Limitations
356+
357+
### Font Color
358+
359+
Font color extraction is enabled by setting `includeColors: true`.
360+
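The `font.color` value is extracted on a best-effort basis by correlating the rendering operator list with the text content items via position matching; a text item without a positional match reuses the color of the most recently matched item (initially the first recorded color, or `#000000` if there is none).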
361+
When pdf.js merges adjacent text runs with different colors into a single content item (e.g. differently colored words on the same line),
362+
only the first color is reported.
363+
This is an inherent limitation of how pdf.js combines text during extraction and cannot be resolved without upstream changes to pdf.js.
364+

lib/extraction/annotations.mjs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,17 @@
11
import { Buffer } from 'node:buffer';
22
import { compactObj } from "../utils.mjs";
33

4+
/**
 * Converts an RGB color triple into a `#rrggbb` hex string.
 *
 * Accepts plain arrays as well as typed arrays (e.g. `Uint8ClampedArray`);
 * only the first three entries are used. Each channel is truncated to an
 * integer and clamped to 0..255 so the result is always a well-formed
 * six-digit hex color.
 *
 * @param {ArrayLike<number>|*} arr - color components; any other value is passed through.
 * @returns {string|*} the hex color, or `arr` unchanged when it is not an
 *   array-like value with at least three entries.
 */
const toHexColor = (arr) => {
  // DataView is an ArrayBuffer view too, but it has no indexed elements.
  const isTypedArray = ArrayBuffer.isView(arr) && !(arr instanceof DataView);
  if (!(Array.isArray(arr) || isTypedArray) || arr.length < 3) return arr;
  const channels = Array.from(arr).slice(0, 3);
  return '#' + channels
    // `| 0` truncates (and maps NaN to 0); the clamp keeps each channel to exactly two hex digits.
    .map(c => Math.min(255, Math.max(0, c | 0)).toString(16).padStart(2, '0'))
    .join('');
};
8+
49
const getPageAnnotation = (viewport, annot) => {
510
const result = compactObj(annot);
11+
if (result.color) result.color = toHexColor(result.color);
12+
if (result.backgroundColor) result.backgroundColor = toHexColor(result.backgroundColor);
13+
if (result.borderColor) result.borderColor = toHexColor(result.borderColor);
14+
if (result.defaultAppearanceData?.fontColor) result.defaultAppearanceData.fontColor = toHexColor(result.defaultAppearanceData.fontColor);
615
if (annot.rect) {
716
if (viewport.rotation === 90) {
817
result.x = annot.rect[3];

lib/extraction/content.mjs

Lines changed: 124 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,108 @@
1-
import { Util } from "../pdfjs/pdf.mjs";
1+
import { OPS, Util } from "../pdfjs/pdf.mjs";
22

33
const normalizeTransform = transform => transform.map(v => v === 0 ? 0 : v);
44

5-
const getPageTextItem = (page, viewport, item, content) => {
5+
/**
 * Returns a new six-element matrix equal to `m` with a translation by
 * (`tx`, `ty`) applied in the matrix's own coordinate system: the first four
 * components are copied unchanged and only the offset pair is recomputed.
 * The input matrix is not modified.
 */
const translateMatrix = (m, tx, ty) => {
  const offsetX = m[0] * tx + m[2] * ty + m[4];
  const offsetY = m[1] * tx + m[3] * ty + m[5];
  return [m[0], m[1], m[2], m[3], offsetX, offsetY];
};
6+
7+
/**
 * Replays the page's operator list while tracking a minimal graphics/text
 * state, and records one entry per `showText` operator.
 *
 * Each entry holds the translation components (`x`, `y`) of the current
 * transformation matrix combined with the text matrix at the moment the text
 * is shown, together with the fill color that was active (`undefined` after a
 * `setFillTransparent` operator).
 *
 * @param page - object exposing `getOperatorList()` resolving to `{ fnArray, argsArray }`.
 * @returns {Promise<Array<{x: number, y: number, color: *}>>} entries in operator order.
 */
async function extractTextFillColors(page) {
  const operatorList = await page.getOperatorList();
  const { fnArray, argsArray } = operatorList;
  const identity = () => [1, 0, 0, 1, 0, 0];

  // Matrices are always replaced, never mutated in place, so a shallow copy
  // of this object is a complete snapshot.
  let state = {
    ctm: identity(),
    fillColor: "#000000",
    textMatrix: identity(),
    textLineMatrix: identity(),
    fontSize: 0,
    leading: 0
  };
  const snapshots = [];
  const recorded = [];

  const pushState = () => {
    snapshots.push({ ...state });
  };
  // An unbalanced restore is ignored and leaves the current state untouched.
  const popState = () => {
    if (snapshots.length > 0) {
      state = snapshots.pop();
    }
  };
  // Moves the line matrix by the given offset and resets the text matrix to it.
  const startLineAt = (tx, ty) => {
    state.textLineMatrix = translateMatrix(state.textLineMatrix, tx, ty);
    state.textMatrix = state.textLineMatrix.slice();
  };

  for (let index = 0; index < fnArray.length; index += 1) {
    const args = argsArray[index];
    switch (fnArray[index]) {
      case OPS.save:
        pushState();
        break;
      case OPS.restore:
        popState();
        break;
      case OPS.transform:
        state.ctm = Util.transform(state.ctm, args);
        break;
      case OPS.paintFormXObjectBegin:
        // The form is treated as an implicit save, plus an optional matrix.
        pushState();
        if (args[0]) {
          state.ctm = Util.transform(state.ctm, args[0]);
        }
        break;
      case OPS.paintFormXObjectEnd:
        popState();
        break;
      case OPS.setFillRGBColor:
        state.fillColor = args[0];
        break;
      case OPS.setFillTransparent:
        state.fillColor = undefined;
        break;
      case OPS.beginText:
        state.textMatrix = identity();
        state.textLineMatrix = identity();
        break;
      case OPS.setTextMatrix: {
        const src = args[0];
        state.textMatrix = [src[0], src[1], src[2], src[3], src[4], src[5]];
        state.textLineMatrix = state.textMatrix.slice();
        break;
      }
      case OPS.moveText:
        startLineAt(args[0], args[1]);
        break;
      case OPS.setLeadingMoveText:
        // Leading becomes the negated vertical offset before the move is applied.
        state.leading = -args[1];
        startLineAt(args[0], args[1]);
        break;
      case OPS.nextLine:
        startLineAt(0, -state.leading);
        break;
      case OPS.setFont:
        state.fontSize = args[1];
        break;
      case OPS.setLeading:
        state.leading = args[0];
        break;
      case OPS.showText: {
        const fontScale = [state.fontSize, 0, 0, state.fontSize, 0, 0];
        const textSpace = Util.transform(state.textMatrix, fontScale);
        const placed = Util.transform(state.ctm, textSpace);
        recorded.push({ x: placed[4], y: placed[5], color: state.fillColor });
        break;
      }
    }
  }
  return recorded;
}
88+
89+
function findFillColor(colorEntries, searchStart, itemX, itemY) {
90+
for (let i = searchStart; i < colorEntries.length; i++) {
91+
const entry = colorEntries[i];
92+
if (Math.abs(entry.x - itemX) < 5 && Math.abs(entry.y - itemY) < 5) {
93+
let nextIdx = i + 1;
94+
while (nextIdx < colorEntries.length &&
95+
Math.abs(colorEntries[nextIdx].x - itemX) < 5 &&
96+
Math.abs(colorEntries[nextIdx].y - itemY) < 5) {
97+
nextIdx++;
98+
}
99+
return { color: entry.color, nextIdx };
100+
}
101+
}
102+
return null;
103+
}
104+
105+
const getPageTextItem = (page, viewport, item, content, fillColor) => {
6106
const tx = Util.transform(viewport.transform, item.transform);
7107
const style = content.styles[item.fontName] ?? {};
8108
const fontSize = Math.sqrt(tx[2] * tx[2] + tx[3] * tx[3]);
@@ -19,6 +119,7 @@ const getPageTextItem = (page, viewport, item, content) => {
19119
size: fontSize,
20120
name: font?.name,
21121
family: style.fontFamily,
122+
color: fillColor,
22123
vertical: style.vertical,
23124
ascent: isNaN(style.ascent) || style.ascent === null ? undefined : style.ascent,
24125
descent: isNaN(style.descent) || style.descent === null ? undefined : style.descent
@@ -28,10 +129,27 @@ const getPageTextItem = (page, viewport, item, content) => {
28129
};
29130
};
30131

31-
export async function getPageContent(page, textExtractOptions) {
132+
export async function getPageContent(page, textExtractOptions, includeColors) {
32133
const viewport = page.getViewport({ scale: 1.0 });
33-
const content = await page.getTextContent(textExtractOptions);
34-
return content.items
35-
.map(item => getPageTextItem(page, viewport, item, content));
134+
if (!includeColors) {
135+
const content = await page.getTextContent(textExtractOptions);
136+
return content.items.map(item => getPageTextItem(page, viewport, item, content));
137+
}
138+
const [content, colorEntries] = await Promise.all([
139+
page.getTextContent(textExtractOptions),
140+
extractTextFillColors(page)
141+
]);
142+
let entryIdx = 0;
143+
let lastColor = colorEntries[0]?.color ?? "#000000";
144+
return content.items.map(item => {
145+
if (item.str && item.str.trim()) {
146+
const result = findFillColor(colorEntries, entryIdx, item.transform[4], item.transform[5]);
147+
if (result) {
148+
lastColor = result.color;
149+
entryIdx = result.nextIdx;
150+
}
151+
}
152+
return getPageTextItem(page, viewport, item, content, lastColor);
153+
});
36154
}
37155

lib/index.d.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ declare module "pdf.js-extract" {
3535
disableCombineTextItems?: boolean; // do not attempt to combine same line {@link TextItem}'s. The default value is `false`.
3636
includeAttachments?: boolean; // include attachments as base64. The default value is `false`.
3737
includeImages?: boolean; // include images as base64. The default value is `false`.
38+
includeColors?: boolean; // include font color in text content items. The default value is `false`.
3839
}
3940

4041
export interface PDFExtractResult {
@@ -133,6 +134,7 @@ declare module "pdf.js-extract" {
133134
name: string;
134135
family: string;
135136
size: number;
137+
color?: string;
136138
vertical?: boolean;
137139
ascent?: number;
138140
descent?: number;
@@ -333,7 +335,7 @@ declare module "pdf.js-extract" {
333335
export interface PDFExtractAnnotDefaultAppearanceData {
334336
fontSize: number;
335337
fontName: string;
336-
fontColor?: Uint8ClampedArray;
338+
fontColor?: string;
337339
}
338340

339341
export const enum PDFExtractAnnotFieldFlag {
@@ -384,10 +386,10 @@ declare module "pdf.js-extract" {
384386
x?: number;
385387
y?: number;
386388
annotationFlags: PDFExtractAnnotFlag;
387-
color?: Uint8ClampedArray;
388-
backgroundColor?: Uint8ClampedArray;
389+
color?: string;
390+
backgroundColor?: string;
389391
borderStyle: PDFExtractAnnotBorderStyle;
390-
borderColor?: Uint8ClampedArray;
392+
borderColor?: string;
391393
rotation: number;
392394
contentsObj: PDFExtractBidiText;
393395
richText?: PDFExtractAnnotRichText;

lib/index.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class PDFExtract {
119119
await page.getOperatorList();
120120
const resultPage = {
121121
info: getPageInfo(pageNum, page),
122-
content: await getPageContent(page, textExtractOptions)
122+
content: await getPageContent(page, textExtractOptions, options.includeColors === true)
123123
};
124124
const annotations = await getPageAnnotations(page);
125125
if (annotations) {

package-lock.json

Lines changed: 0 additions & 24 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)