1- import { Util } from "../pdfjs/pdf.mjs" ;
1+ import { OPS , Util } from "../pdfjs/pdf.mjs" ;
22
/**
 * Returns a copy of a transform in which every element that compares equal
 * to zero (which includes negative zero) is replaced by plain `0`.
 *
 * @param {number[]} transform - Matrix components to normalize.
 * @returns {number[]} A new array; all non-zero elements are unchanged.
 */
const normalizeTransform = transform => transform.map(v => (v === 0 ? 0 : v));
44
5- const getPageTextItem = ( page , viewport , item , content ) => {
/**
 * Returns a new six-element matrix equal to `m` with an extra translation of
 * (tx, ty) applied in the coordinate system of `m`: the four linear
 * components are copied and the offset is pushed through them before being
 * added to the existing translation.
 *
 * @param {number[]} m - Matrix in `[a, b, c, d, e, f]` order.
 * @param {number} tx - Horizontal offset.
 * @param {number} ty - Vertical offset.
 * @returns {number[]} A new matrix; `m` is not modified.
 */
const translateMatrix = (m, tx, ty) => {
  const [a, b, c, d, e, f] = m;
  return [a, b, c, d, a * tx + c * ty + e, b * tx + d * ty + f];
};
6+
/**
 * Walks a page's operator list and records, for every `showText` operation,
 * the fill color in effect and the position of the text origin.
 *
 * The position is the translation part of `ctm x textMatrix`. Font size is
 * not tracked: a pure scale matrix has no translation, so it cannot affect
 * the recorded origin.
 *
 * Known limitation: the text matrix is not advanced after a `showText`, so
 * several `showText` operations without an intervening positioning operator
 * are all recorded at the same position.
 *
 * NOTE(review): assumes `setFillRGBColor` carries the color to report as its
 * first argument and `setTextMatrix` carries the matrix as its first
 * argument — confirm against the bundled pdf.js version.
 *
 * @param {object} page - Object exposing an async `getOperatorList()` that
 *   resolves to `{ fnArray, argsArray }`.
 * @returns {Promise<Array<{x: number, y: number, color: (string|undefined)}>>}
 *   One entry per `showText`, in operator order. `color` is `undefined`
 *   while a transparent fill is active.
 */
async function extractTextFillColors(page) {
  const { fnArray, argsArray } = await page.getOperatorList();

  // All tracked state lives in one object so that save/restore is a single
  // shallow copy. Matrices are always replaced, never mutated in place, so
  // sharing array references between snapshots is safe.
  let state = {
    ctm: [1, 0, 0, 1, 0, 0],
    fillColor: "#000000",
    textMatrix: [1, 0, 0, 1, 0, 0],
    textLineMatrix: [1, 0, 0, 1, 0, 0],
    leading: 0,
  };
  const savedStates = [];
  const colorEntries = [];

  // Offsets the start of the current line and resets the text matrix to it.
  const startNewLine = (tx, ty) => {
    state.textLineMatrix = translateMatrix(state.textLineMatrix, tx, ty);
    state.textMatrix = [...state.textLineMatrix];
  };

  for (let i = 0; i < fnArray.length; i++) {
    const args = argsArray[i];
    switch (fnArray[i]) {
      case OPS.save:
        savedStates.push({ ...state });
        break;
      case OPS.paintFormXObjectBegin:
        // Entering a form behaves like a save followed by an optional
        // transform by the form matrix.
        savedStates.push({ ...state });
        if (args[0]) {
          state.ctm = Util.transform(state.ctm, args[0]);
        }
        break;
      case OPS.restore:
      case OPS.paintFormXObjectEnd:
        // An unbalanced restore is ignored rather than treated as an error.
        if (savedStates.length > 0) {
          state = savedStates.pop();
        }
        break;
      case OPS.transform:
        state.ctm = Util.transform(state.ctm, args);
        break;
      case OPS.setFillRGBColor:
        state.fillColor = args[0];
        break;
      case OPS.setFillTransparent:
        state.fillColor = undefined;
        break;
      case OPS.beginText:
        state.textMatrix = [1, 0, 0, 1, 0, 0];
        state.textLineMatrix = [1, 0, 0, 1, 0, 0];
        break;
      case OPS.setTextMatrix: {
        const m = args[0];
        // Copy by index so the stored matrix is always a plain array.
        state.textMatrix = [m[0], m[1], m[2], m[3], m[4], m[5]];
        state.textLineMatrix = [...state.textMatrix];
        break;
      }
      case OPS.moveText:
        startNewLine(args[0], args[1]);
        break;
      case OPS.setLeadingMoveText:
        // The leading becomes the negated vertical offset of the move.
        state.leading = -args[1];
        startNewLine(args[0], args[1]);
        break;
      case OPS.nextLine:
        startNewLine(0, -state.leading);
        break;
      case OPS.setLeading:
        state.leading = args[0];
        break;
      case OPS.showText: {
        const pos = Util.transform(state.ctm, state.textMatrix);
        colorEntries.push({ x: pos[4], y: pos[5], color: state.fillColor });
        break;
      }
    }
  }
  return colorEntries;
}
88+
/**
 * Finds the first color entry at or after `searchStart` whose position lies
 * strictly within `tolerance` of (itemX, itemY) on both axes.
 *
 * @param {Array<{x: number, y: number, color: *}>} colorEntries - Entries to
 *   search, scanned in array order.
 * @param {number} searchStart - Index at which the scan begins.
 * @param {number} itemX - Target x coordinate.
 * @param {number} itemY - Target y coordinate.
 * @param {number} [tolerance=5] - Maximum per-axis distance (exclusive) for
 *   an entry to count as being at the target position.
 * @returns {{color: *, nextIdx: number} | null} The matched entry's color and
 *   the index of the first entry after the run of consecutive entries that
 *   are also at the target position, or `null` when nothing matches.
 */
function findFillColor(colorEntries, searchStart, itemX, itemY, tolerance = 5) {
  const isAtTarget = entry =>
    Math.abs(entry.x - itemX) < tolerance && Math.abs(entry.y - itemY) < tolerance;

  for (let i = searchStart; i < colorEntries.length; i++) {
    if (!isAtTarget(colorEntries[i])) {
      continue;
    }
    // Step past every directly following entry at the same position so the
    // caller's next search starts after this run.
    let nextIdx = i + 1;
    while (nextIdx < colorEntries.length && isAtTarget(colorEntries[nextIdx])) {
      nextIdx++;
    }
    return { color: colorEntries[i].color, nextIdx };
  }
  return null;
}
104+
105+ const getPageTextItem = ( page , viewport , item , content , fillColor ) => {
6106 const tx = Util . transform ( viewport . transform , item . transform ) ;
7107 const style = content . styles [ item . fontName ] ?? { } ;
8108 const fontSize = Math . sqrt ( tx [ 2 ] * tx [ 2 ] + tx [ 3 ] * tx [ 3 ] ) ;
@@ -19,6 +119,7 @@ const getPageTextItem = (page, viewport, item, content) => {
19119 size : fontSize ,
20120 name : font ?. name ,
21121 family : style . fontFamily ,
122+ color : fillColor ,
22123 vertical : style . vertical ,
23124 ascent : isNaN ( style . ascent ) || style . ascent === null ? undefined : style . ascent ,
24125 descent : isNaN ( style . descent ) || style . descent === null ? undefined : style . descent
@@ -28,10 +129,27 @@ const getPageTextItem = (page, viewport, item, content) => {
28129 } ;
29130} ;
30131
/**
 * Extracts the text items of a page, optionally annotating each with the
 * fill color that was active when the text was drawn.
 *
 * Without colors, only the text content is fetched and items are built
 * without a color argument. With colors, the text content and the color
 * entries from the operator list are fetched concurrently, then each
 * non-blank item is matched to a color entry by position. Matching only
 * moves forward through the color entries. Blank items and items with no
 * positional match reuse the most recently matched color.
 *
 * @param {object} page - Page object providing `getViewport` and
 *   `getTextContent` (and whatever `extractTextFillColors` requires).
 * @param {object} [textExtractOptions] - Passed through to
 *   `page.getTextContent`.
 * @param {boolean} [includeColors=false] - Whether to resolve fill colors.
 * @returns {Promise<object[]>} One result of `getPageTextItem` per text item,
 *   in text-content order.
 */
export async function getPageContent(page, textExtractOptions, includeColors = false) {
  const viewport = page.getViewport({ scale: 1.0 });
  const [content, colorEntries] = await Promise.all([
    page.getTextContent(textExtractOptions),
    includeColors ? extractTextFillColors(page) : null,
  ]);

  if (colorEntries === null) {
    return content.items.map(item => getPageTextItem(page, viewport, item, content));
  }

  let searchStart = 0;
  // Until the first positional match, fall back to the first recorded color,
  // or black when there is none.
  let currentColor = colorEntries[0]?.color ?? "#000000";
  const pageItems = [];
  for (const item of content.items) {
    // Blank or whitespace-only items are not matched; they keep the
    // current color.
    if (item.str?.trim()) {
      const match = findFillColor(colorEntries, searchStart, item.transform[4], item.transform[5]);
      if (match) {
        currentColor = match.color;
        searchStart = match.nextIdx;
      }
    }
    pageItems.push(getPageTextItem(page, viewport, item, content, currentColor));
  }
  return pageItems;
}
37155
0 commit comments