@@ -41,60 +41,146 @@ public function detect(DetectionContext $context): ?MimeTypeMatch
4141 }
4242 }
4343
44- if ($ buffer ->checkForBytes ([0xFF , 0xFE ]) && $ this ->checkUtf16LeSequence ($ buffer, ' <?xml ' , 2 )) {
44+ if ($ buffer ->checkForBytes ([0xFF , 0xFE ]) && $ this ->isUtf16LeXmlDeclaration ($ buffer , 2 )) {
4545 return $ this ->match ('xml ' , 'application/xml ' );
4646 }
4747
48- if ($ buffer ->checkForBytes ([0xFE , 0xFF ]) && $ this ->checkUtf16BeSequence ($ buffer, ' <?xml ' , 2 )) {
48+ if ($ buffer ->checkForBytes ([0xFE , 0xFF ]) && $ this ->isUtf16BeXmlDeclaration ($ buffer , 2 )) {
4949 return $ this ->match ('xml ' , 'application/xml ' );
5050 }
5151
52- if (
53- $ buffer ->checkString ('<!doctype html ' )
54- || $ buffer ->checkString ('<!DOCTYPE html ' )
55- || $ buffer ->checkString ('<html ' )
56- ) {
57- return $ this ->match ('html ' , 'text/html ' );
58- }
59-
6052 return null ;
6153 }
6254
/**
 * Detects XML-family content in the buffer text that starts at $offset.
 *
 * The text is stripped of leading ASCII whitespace and lower-cased, so the
 * prefix checks below and the markers searched by detectXmlFamily() are
 * case-insensitive.
 *
 * @param FileBuffer $buffer Buffer being inspected.
 * @param int        $offset Offset handed to FileBuffer::sliceAsString().
 *
 * @return MimeTypeMatch|null Match chosen by detectXmlFamily(), or null when
 *                            the text is empty, all whitespace, or does not
 *                            begin like XML markup.
 */
private function detectAsciiXml(FileBuffer $buffer, int $offset): ?MimeTypeMatch
{
    // Trimming an empty string yields an empty string, so a single check
    // covers both the empty and the whitespace-only case.
    $snippet = $this->ltrimAsciiWhitespace($buffer->sliceAsString($offset));

    if ($snippet === '') {
        return null;
    }

    $snippet = \strtolower($snippet);

    // The snippet was left-trimmed above, so the prefixes must not carry
    // leading padding or they could never match.
    $looksLikeXml = $this->startsWithXmlDeclaration($snippet)
        || \str_starts_with($snippet, '<?xpacket')
        || $this->startsWithXmlTag($snippet);

    if (!$looksLikeXml) {
        return null;
    }

    return $this->detectXmlFamily($snippet);
}
81+
/**
 * Picks the most specific XML-based type for a snippet already recognised as
 * XML-like markup.
 *
 * Markers are lower-case, so the snippet must be lower-cased by the caller.
 * Checks run in priority order (SVG, HTML, RDF/XMP, RSS 2.0) and the first
 * marker found anywhere in the snippet wins; generic XML is the fallback.
 *
 * @param string $snippet Lower-cased markup text.
 *
 * @return MimeTypeMatch Always a match; `application/xml` when nothing more
 *                       specific is found.
 */
private function detectXmlFamily(string $snippet): MimeTypeMatch
{
    // Markers carry no surrounding padding: a tag may directly follow `>` or
    // a newline and may be closed by `>` right after its name.
    if (\str_contains($snippet, '<!doctype svg') || \str_contains($snippet, '<svg')) {
        return $this->match('svg', 'image/svg+xml');
    }

    if (\str_contains($snippet, '<!doctype html') || \str_contains($snippet, '<html')) {
        return $this->match('html', 'text/html');
    }

    if (\str_contains($snippet, '<x:xmpmeta') || \str_contains($snippet, '<rdf:rdf')) {
        return $this->match('rdf', 'application/rdf+xml');
    }

    if (\str_contains($snippet, '<rss version="2.0"')) {
        return $this->match('rss', 'application/rss+xml');
    }

    return $this->match('xml', 'application/xml');
}
97102
/**
 * Tells whether the snippet opens with an XML declaration: the five bytes
 * `<?xml` immediately followed by one ASCII whitespace byte.
 *
 * The whitespace requirement keeps longer targets that merely share the
 * prefix (for example `<?xml-stylesheet`) from counting as a declaration.
 *
 * @param string $snippet Text to inspect; the comparison is case-sensitive.
 *
 * @return bool True when the snippet starts with `<?xml` plus whitespace.
 */
private function startsWithXmlDeclaration(string $snippet): bool
{
    // Index 5 is the byte right after the 5-byte `<?xml` prefix; the prefix
    // itself must not include the separator, or tab/CR/LF would be rejected.
    return \str_starts_with($snippet, '<?xml')
        && $this->isAsciiWhitespaceByte($snippet, 5);
}
111+
/**
 * Tells whether the snippet begins like XML markup: a `<` followed by `?`,
 * `!`, an ASCII letter, `_` or `:`.
 *
 * @param string $snippet Text to inspect (no trimming is done here).
 *
 * @return bool False when the snippet is shorter than two bytes, does not
 *              start with `<`, or the second byte is none of the above.
 */
private function startsWithXmlTag(string $snippet): bool
{
    // Single bytes are compared against single-character literals; a padded
    // literal such as '< ' could never equal one byte.
    if (!isset($snippet[0], $snippet[1]) || $snippet[0] !== '<') {
        return false;
    }

    $next = $snippet[1];

    // `<?` opens a processing instruction, `<!` a doctype/comment/CDATA.
    if ($next === '?' || $next === '!' || $next === '_' || $next === ':') {
        return true;
    }

    $byte = \ord($next);

    // ASCII a-z or A-Z.
    return ($byte >= 0x61 && $byte <= 0x7A) || ($byte >= 0x41 && $byte <= 0x5A);
}
128+
/**
 * Tells whether the byte at $offset in $snippet is ASCII whitespace
 * (tab, line feed, carriage return or space).
 *
 * @param string $snippet Text to inspect.
 * @param int    $offset  Byte index to test.
 *
 * @return bool False when the index does not exist in the string.
 */
private function isAsciiWhitespaceByte(string $snippet, int $offset): bool
{
    if (isset($snippet[$offset])) {
        $code = \ord($snippet[$offset]);

        return $code === 0x09 || $code === 0x0A || $code === 0x0D || $code === 0x20;
    }

    return false;
}
139+
/**
 * Strips leading space, tab, carriage-return and line-feed bytes.
 *
 * Unlike ltrim()'s default character list, vertical tab and NUL bytes are
 * left in place.
 *
 * @param string $snippet Text to trim.
 *
 * @return string The text without its leading ASCII whitespace.
 */
private function ltrimAsciiWhitespace(string $snippet): string
{
    $strippedCharacters = " \t\r\n";

    return \ltrim($snippet, $strippedCharacters);
}
144+
/**
 * Tells whether a UTF-16LE XML declaration starts at $offset: the characters
 * `<?xml` followed by one ASCII whitespace code unit.
 *
 * @param FileBuffer $buffer Buffer being inspected.
 * @param int        $offset Byte offset where the declaration should start.
 *
 * @return bool True when both the prefix and the trailing whitespace match.
 */
private function isUtf16LeXmlDeclaration(FileBuffer $buffer, int $offset): bool
{
    // `<?xml` is five characters, i.e. ten bytes at two bytes per character,
    // so the separator is probed at $offset + 10. The prefix must not include
    // the separator itself, or the probe would only ever see a space.
    // NOTE(review): assumes checkUtf16LeSequence() compares two bytes per
    // character of the given string - confirm against its implementation.
    return $this->checkUtf16LeSequence($buffer, '<?xml', $offset)
        && $this->isUtf16LeWhitespace($buffer, $offset + 10);
}
153+
/**
 * Tells whether a UTF-16BE XML declaration starts at $offset: the characters
 * `<?xml` followed by one ASCII whitespace code unit.
 *
 * @param FileBuffer $buffer Buffer being inspected.
 * @param int        $offset Byte offset where the declaration should start.
 *
 * @return bool True when both the prefix and the trailing whitespace match.
 */
private function isUtf16BeXmlDeclaration(FileBuffer $buffer, int $offset): bool
{
    // `<?xml` is five characters, i.e. ten bytes at two bytes per character,
    // so the separator is probed at $offset + 10. The prefix must not include
    // the separator itself, or the probe would only ever see a space.
    // NOTE(review): assumes checkUtf16BeSequence() compares two bytes per
    // character of the given string - confirm against its implementation.
    return $this->checkUtf16BeSequence($buffer, '<?xml', $offset)
        && $this->isUtf16BeWhitespace($buffer, $offset + 10);
}
162+
/**
 * Tells whether the two bytes at $offset form an ASCII whitespace character
 * in little-endian order: whitespace byte first, then 0x00.
 *
 * @param FileBuffer $buffer Buffer being inspected.
 * @param int        $offset Offset of the low (first) byte.
 *
 * @return bool False when the first byte is null, the second is not 0x00,
 *              or the first byte is not ASCII whitespace.
 */
private function isUtf16LeWhitespace(FileBuffer $buffer, int $offset): bool
{
    $first = $buffer->get($offset);
    $second = $buffer->get($offset + 1);

    // NOTE(review): get() presumably yields null past the end of the buffer - confirm.
    if ($first === null || $second !== 0x00) {
        return false;
    }

    return $this->isAsciiWhitespace($first);
}
170+
/**
 * Tells whether the two bytes at $offset form an ASCII whitespace character
 * in big-endian order: 0x00 first, then the whitespace byte.
 *
 * @param FileBuffer $buffer Buffer being inspected.
 * @param int        $offset Offset of the high (first) byte.
 *
 * @return bool False when the first byte is not 0x00, the second is null,
 *              or the second byte is not ASCII whitespace.
 */
private function isUtf16BeWhitespace(FileBuffer $buffer, int $offset): bool
{
    $first = $buffer->get($offset);
    $second = $buffer->get($offset + 1);

    // NOTE(review): get() presumably yields null past the end of the buffer - confirm.
    if ($first !== 0x00 || $second === null) {
        return false;
    }

    return $this->isAsciiWhitespace($second);
}
178+
/**
 * Tells whether a byte value is ASCII whitespace: tab (0x09), line feed
 * (0x0A), carriage return (0x0D) or space (0x20).
 *
 * @param int $byte Byte value to classify.
 *
 * @return bool True for exactly those four values.
 */
private function isAsciiWhitespace(int $byte): bool
{
    return match ($byte) {
        0x09, 0x0A, 0x0D, 0x20 => true,
        default => false,
    };
}
183+
98184 private function checkUtf16LeSequence (FileBuffer $ buffer , string $ value , int $ offset ): bool
99185 {
100186 $ bytes = [];
0 commit comments