Skip to content

Commit 240d74e

Browse files
committed
xml detector: accept XMP/whitespace + refine UTF-16 checks
- tolerate leading whitespace/BOM and XML without declaration
- detect XMP/RDF via xpacket and xmpmeta
- add tests for new XML/XMP cases and UTF-16 guard paths

Fixes FriendsOfFlarum/upload#455
1 parent 307b7c2 commit 240d74e

2 files changed

Lines changed: 155 additions & 22 deletions

File tree

src/SoftCreatR/MimeDetector/Detector/XmlSignatureDetector.php

Lines changed: 108 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -41,60 +41,146 @@ public function detect(DetectionContext $context): ?MimeTypeMatch
4141
}
4242
}
4343

44-
if ($buffer->checkForBytes([0xFF, 0xFE]) && $this->checkUtf16LeSequence($buffer, '<?xml ', 2)) {
44+
if ($buffer->checkForBytes([0xFF, 0xFE]) && $this->isUtf16LeXmlDeclaration($buffer, 2)) {
4545
return $this->match('xml', 'application/xml');
4646
}
4747

48-
if ($buffer->checkForBytes([0xFE, 0xFF]) && $this->checkUtf16BeSequence($buffer, '<?xml ', 2)) {
48+
if ($buffer->checkForBytes([0xFE, 0xFF]) && $this->isUtf16BeXmlDeclaration($buffer, 2)) {
4949
return $this->match('xml', 'application/xml');
5050
}
5151

52-
if (
53-
$buffer->checkString('<!doctype html')
54-
|| $buffer->checkString('<!DOCTYPE html')
55-
|| $buffer->checkString('<html')
56-
) {
57-
return $this->match('html', 'text/html');
58-
}
59-
6052
return null;
6153
}
6254

6355
private function detectAsciiXml(FileBuffer $buffer, int $offset): ?MimeTypeMatch
6456
{
65-
if (!$buffer->checkString('<?xml ', $offset)) {
57+
$snippet = $buffer->sliceAsString($offset);
58+
59+
if ($snippet === '') {
60+
return null;
61+
}
62+
63+
$snippet = $this->ltrimAsciiWhitespace($snippet);
64+
65+
if ($snippet === '') {
6666
return null;
6767
}
6868

69-
$searchOffset = $offset + 6;
69+
$snippet = \strtolower($snippet);
7070

7171
if (
72-
$buffer->searchForBytes($buffer->toBytes('<!doctype svg'), $searchOffset) !== -1
73-
|| $buffer->searchForBytes($buffer->toBytes('<!DOCTYPE svg'), $searchOffset) !== -1
74-
|| $buffer->searchForBytes($buffer->toBytes('<svg'), $searchOffset) !== -1
72+
!$this->startsWithXmlDeclaration($snippet)
73+
&& !\str_starts_with($snippet, '<?xpacket')
74+
&& !$this->startsWithXmlTag($snippet)
7575
) {
76+
return null;
77+
}
78+
79+
return $this->detectXmlFamily($snippet);
80+
}
81+
82+
private function detectXmlFamily(string $snippet): MimeTypeMatch
83+
{
84+
if (\str_contains($snippet, '<!doctype svg') || \str_contains($snippet, '<svg')) {
7685
return $this->match('svg', 'image/svg+xml');
7786
}
7887

79-
if (
80-
$buffer->searchForBytes($buffer->toBytes('<!doctype html'), $searchOffset) !== -1
81-
|| $buffer->searchForBytes($buffer->toBytes('<!DOCTYPE html'), $searchOffset) !== -1
82-
|| $buffer->searchForBytes($buffer->toBytes('<html'), $searchOffset) !== -1
83-
) {
88+
if (\str_contains($snippet, '<!doctype html') || \str_contains($snippet, '<html')) {
8489
return $this->match('html', 'text/html');
8590
}
8691

87-
if ($buffer->searchForBytes($buffer->toBytes('<rdf:RDF'), $searchOffset) !== -1) {
92+
if (\str_contains($snippet, '<x:xmpmeta') || \str_contains($snippet, '<rdf:rdf')) {
8893
return $this->match('rdf', 'application/rdf+xml');
8994
}
9095

91-
if ($buffer->searchForBytes($buffer->toBytes('<rss version="2.0"'), $searchOffset) !== -1) {
96+
if (\str_contains($snippet, '<rss version="2.0"')) {
9297
return $this->match('rss', 'application/rss+xml');
9398
}
9499

95100
return $this->match('xml', 'application/xml');
96101
}
97102

103+
private function startsWithXmlDeclaration(string $snippet): bool
104+
{
105+
if (!\str_starts_with($snippet, '<?xml')) {
106+
return false;
107+
}
108+
109+
return $this->isAsciiWhitespaceByte($snippet, 5);
110+
}
111+
112+
private function startsWithXmlTag(string $snippet): bool
113+
{
114+
if (!isset($snippet[0], $snippet[1]) || $snippet[0] !== '<') {
115+
return false;
116+
}
117+
118+
$next = $snippet[1];
119+
120+
if ($next === '?' || $next === '!') {
121+
return true;
122+
}
123+
124+
$byte = \ord($next);
125+
126+
return ($byte >= 0x61 && $byte <= 0x7A) || ($byte >= 0x41 && $byte <= 0x5A) || $next === '_' || $next === ':';
127+
}
128+
129+
private function isAsciiWhitespaceByte(string $snippet, int $offset): bool
130+
{
131+
if (!isset($snippet[$offset])) {
132+
return false;
133+
}
134+
135+
$byte = \ord($snippet[$offset]);
136+
137+
return \in_array($byte, [0x09, 0x0A, 0x0D, 0x20], true);
138+
}
139+
140+
private function ltrimAsciiWhitespace(string $snippet): string
141+
{
142+
return \ltrim($snippet, " \t\r\n");
143+
}
144+
145+
private function isUtf16LeXmlDeclaration(FileBuffer $buffer, int $offset): bool
146+
{
147+
if (!$this->checkUtf16LeSequence($buffer, '<?xml', $offset)) {
148+
return false;
149+
}
150+
151+
return $this->isUtf16LeWhitespace($buffer, $offset + 10);
152+
}
153+
154+
private function isUtf16BeXmlDeclaration(FileBuffer $buffer, int $offset): bool
155+
{
156+
if (!$this->checkUtf16BeSequence($buffer, '<?xml', $offset)) {
157+
return false;
158+
}
159+
160+
return $this->isUtf16BeWhitespace($buffer, $offset + 10);
161+
}
162+
163+
private function isUtf16LeWhitespace(FileBuffer $buffer, int $offset): bool
164+
{
165+
$lowByte = $buffer->get($offset);
166+
$highByte = $buffer->get($offset + 1);
167+
168+
return $lowByte !== null && $highByte === 0x00 && $this->isAsciiWhitespace($lowByte);
169+
}
170+
171+
private function isUtf16BeWhitespace(FileBuffer $buffer, int $offset): bool
172+
{
173+
$highByte = $buffer->get($offset);
174+
$lowByte = $buffer->get($offset + 1);
175+
176+
return $highByte === 0x00 && $lowByte !== null && $this->isAsciiWhitespace($lowByte);
177+
}
178+
179+
private function isAsciiWhitespace(int $byte): bool
180+
{
181+
return \in_array($byte, [0x09, 0x0A, 0x0D, 0x20], true);
182+
}
183+
98184
private function checkUtf16LeSequence(FileBuffer $buffer, string $value, int $offset): bool
99185
{
100186
$bytes = [];

tests/SoftCreatR/MimeDetector/Detector/XmlSignatureDetectorTest.php

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,15 @@ public static function provideXmlSamples(): iterable
4343
yield 'rdf' => ['<?xml <rdf:RDF></rdf:RDF>', 'rdf', 'application/rdf+xml'];
4444
yield 'rss' => ['<?xml <rss version="2.0"></rss>', 'rss', 'application/rss+xml'];
4545
yield 'generic xml' => ['<?xml <root/>', 'xml', 'application/xml'];
46+
yield 'xml with leading whitespace' => ["\n\t<?xml <root/>", 'xml', 'application/xml'];
47+
yield 'xml without declaration' => ['<root/>', 'xml', 'application/xml'];
48+
yield 'truncated xml declaration' => ['<?xml', 'xml', 'application/xml'];
49+
yield 'xpacket xmp' => [
50+
'<?xpacket begin="id"?>'
51+
. '<x:xmpmeta><rdf:RDF></rdf:RDF></x:xmpmeta>',
52+
'rdf',
53+
'application/rdf+xml',
54+
];
4655
yield 'utf8 bom xml' => ["\xEF\xBB\xBF<?xml <root/>", 'xml', 'application/xml'];
4756
yield 'utf16-le xml' => ["\xFF\xFE" . self::toUtf16Le('<?xml <root/>'), 'xml', 'application/xml'];
4857
yield 'utf16-be xml' => ["\xFE\xFF" . self::toUtf16Be('<?xml <root/>'), 'xml', 'application/xml'];
@@ -74,6 +83,44 @@ public function testReturnsNullForNonXmlContent(): void
7483
$this->assertNull($match);
7584
}
7685

86+
public function testReturnsNullForEmptyPayload(): void
87+
{
88+
$detector = new XmlSignatureDetector();
89+
90+
$match = $this->detect($detector, '');
91+
92+
$this->assertNull($match);
93+
}
94+
95+
public function testReturnsNullForWhitespaceOnlyPayload(): void
96+
{
97+
$detector = new XmlSignatureDetector();
98+
99+
$match = $this->detect($detector, " \n\t");
100+
101+
$this->assertNull($match);
102+
}
103+
104+
public function testReturnsNullForUtf16LeWithoutXmlDeclaration(): void
105+
{
106+
$detector = new XmlSignatureDetector();
107+
108+
$payload = "\xFF\xFE" . self::toUtf16Le('not xml');
109+
$match = $this->detect($detector, $payload);
110+
111+
$this->assertNull($match);
112+
}
113+
114+
public function testReturnsNullForUtf16BeWithoutXmlDeclaration(): void
115+
{
116+
$detector = new XmlSignatureDetector();
117+
118+
$payload = "\xFE\xFF" . self::toUtf16Be('not xml');
119+
$match = $this->detect($detector, $payload);
120+
121+
$this->assertNull($match);
122+
}
123+
77124
/**
78125
* @throws MimeDetectorException
79126
*/

0 commit comments

Comments
 (0)