Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
cfx_xmlparser.cpp
Go to the documentation of this file.
// Copyright 2016 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

7#include "core/fxcrt/xml/cfx_xmlparser.h"
8
9#include <stdint.h>
10
11#include <algorithm>
12#include <iterator>
13#include <stack>
14#include <utility>
15
16#include "core/fxcrt/autorestorer.h"
17#include "core/fxcrt/cfx_seekablestreamproxy.h"
18#include "core/fxcrt/data_vector.h"
19#include "core/fxcrt/fx_codepage.h"
20#include "core/fxcrt/fx_extension.h"
21#include "core/fxcrt/fx_safe_types.h"
22#include "core/fxcrt/xml/cfx_xmlchardata.h"
23#include "core/fxcrt/xml/cfx_xmldocument.h"
24#include "core/fxcrt/xml/cfx_xmlelement.h"
25#include "core/fxcrt/xml/cfx_xmlinstruction.h"
26#include "core/fxcrt/xml/cfx_xmlnode.h"
27#include "core/fxcrt/xml/cfx_xmltext.h"
28#include "third_party/base/check.h"
29#include "third_party/base/notreached.h"
30
namespace {

// Capacity reserved for the pending-text buffer each time it is (re)started.
constexpr size_t kCurrentTextReserve = 128;

// Largest code point accepted from a numeric character reference; larger
// values are replaced with a space by the parser.
constexpr uint32_t kMaxCharRange = 0x10ffff;

// Returns true for the four characters treated as XML whitespace here:
// space, line feed, carriage return and horizontal tab.
bool IsXMLWhiteSpace(wchar_t ch) {
  return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09;
}

// One inclusive range [wStart, wEnd] of characters permitted in an XML name.
struct FX_XMLNAMECHAR {
  uint16_t wStart;   // First character of the range (inclusive).
  uint16_t wEnd;     // Last character of the range (inclusive).
  bool bStartChar;   // True if the range may also begin a name.
};

// Name-character ranges, kept sorted by ascending code point and
// non-overlapping so that they can be binary-searched on |wEnd|.
// Note that ':' is only accepted as a non-leading character here.
constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
    {L'-', L'.', false},    {L'0', L'9', false},     {L':', L':', false},
    {L'A', L'Z', true},     {L'_', L'_', true},      {L'a', L'z', true},
    {0xB7, 0xB7, false},    {0xC0, 0xD6, true},      {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},   {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true},  {0x203F, 0x2040, false},
    {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true},  {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
};

// Compile-time check of the ordering invariant the binary search relies on:
// every range is well-formed and strictly after the previous one.
constexpr bool IsXMLNameCharTableSorted() {
  constexpr size_t kCount = sizeof(kXMLNameChars) / sizeof(kXMLNameChars[0]);
  for (size_t i = 0; i < kCount; ++i) {
    if (kXMLNameChars[i].wStart > kXMLNameChars[i].wEnd)
      return false;
    if (i > 0 && kXMLNameChars[i - 1].wEnd >= kXMLNameChars[i].wStart)
      return false;
  }
  return true;
}
static_assert(IsXMLNameCharTableSorted(),
              "kXMLNameChars must be sorted and non-overlapping");

}  // namespace
57
58// static
59bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
60 auto* it = std::lower_bound(
61 std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
62 [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
63 return it != std::end(kXMLNameChars) && ch >= it->wStart &&
64 (!bFirstChar || it->bStartChar);
65}
66
67CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
68 DCHECK(pStream);
69
70 auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
71 FX_CodePage wCodePage = proxy->GetCodePage();
72 if (wCodePage != FX_CodePage::kUTF16LE &&
73 wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
74 proxy->SetCodePage(FX_CodePage::kUTF8);
75 }
76 stream_ = proxy;
77
78 xml_plane_size_ = std::min(
79 xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
80
81 current_text_.reserve(kCurrentTextReserve);
82}
83
84CFX_XMLParser::~CFX_XMLParser() = default;
85
86std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
87 auto doc = std::make_unique<CFX_XMLDocument>();
88 AutoRestorer<UnownedPtr<CFX_XMLNode>> restorer(&current_node_);
89 current_node_ = doc->GetRoot();
90 return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
91}
92
93bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
94 if (xml_plane_size_ <= 0)
95 return false;
96
97 FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
98 alloc_size_safe += 1; // For NUL.
99 if (!alloc_size_safe.IsValid())
100 return false;
101
102 size_t current_buffer_idx = 0;
103 size_t buffer_size = 0;
104
105 DataVector<wchar_t> buffer;
106 buffer.resize(alloc_size_safe.ValueOrDie());
107
108 std::stack<wchar_t> character_to_skip_too_stack;
109 std::stack<CFX_XMLNode::Type> node_type_stack;
110 WideString current_attribute_name;
111 FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
112 wchar_t current_quote_character = 0;
113 wchar_t current_character_to_skip_to = 0;
114
115 while (true) {
116 if (current_buffer_idx >= buffer_size) {
117 if (stream_->IsEOF())
118 return true;
119
120 size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
121 if (buffer_chars == 0)
122 return true;
123
124 current_buffer_idx = 0;
125 buffer_size = buffer_chars;
126 }
127
128 while (current_buffer_idx < buffer_size) {
129 wchar_t ch = buffer[current_buffer_idx];
130 switch (current_parser_state) {
131 case FDE_XmlSyntaxState::Text:
132 if (ch == L'<') {
133 if (!current_text_.empty()) {
134 current_node_->AppendLastChild(
135 doc->CreateNode<CFX_XMLText>(GetTextData()));
136 } else {
137 current_buffer_idx++;
138 current_parser_state = FDE_XmlSyntaxState::Node;
139 }
140 } else {
141 // Fail if there is text outside of the root element, ignore
142 // whitespace/null.
143 if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
144 return false;
145 ProcessTextChar(ch);
146 current_buffer_idx++;
147 }
148 break;
149 case FDE_XmlSyntaxState::Node:
150 if (ch == L'!') {
151 current_buffer_idx++;
152 current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
153 } else if (ch == L'/') {
154 current_buffer_idx++;
155 current_parser_state = FDE_XmlSyntaxState::CloseElement;
156 } else if (ch == L'?') {
157 node_type_stack.push(CFX_XMLNode::Type::kInstruction);
158 current_buffer_idx++;
159 current_parser_state = FDE_XmlSyntaxState::Target;
160 } else {
161 node_type_stack.push(CFX_XMLNode::Type::kElement);
162 current_parser_state = FDE_XmlSyntaxState::Tag;
163 }
164 break;
165 case FDE_XmlSyntaxState::Target:
166 if (!IsXMLNameChar(ch, current_text_.empty())) {
167 if (current_text_.empty())
168 return false;
169
170 current_parser_state = FDE_XmlSyntaxState::TargetData;
171
172 WideString target_name = GetTextData();
173 if (target_name.EqualsASCII("originalXFAVersion") ||
174 target_name.EqualsASCII("acrobat")) {
175 auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
176 current_node_->AppendLastChild(node);
177 current_node_ = node;
178 }
179 } else {
180 current_text_.push_back(ch);
181 current_buffer_idx++;
182 }
183 break;
184 case FDE_XmlSyntaxState::Tag:
185 if (!IsXMLNameChar(ch, current_text_.empty())) {
186 if (current_text_.empty())
187 return false;
188
189 current_parser_state = FDE_XmlSyntaxState::AttriName;
190
191 auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
192 current_node_->AppendLastChild(child);
193 current_node_ = child;
194 } else {
195 current_text_.push_back(ch);
196 current_buffer_idx++;
197 }
198 break;
199 case FDE_XmlSyntaxState::AttriName:
200 if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
201 current_buffer_idx++;
202 break;
203 }
204 if (!IsXMLNameChar(ch, current_text_.empty())) {
205 if (current_text_.empty()) {
206 if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
207 if (ch == L'>' || ch == L'/') {
208 current_parser_state = FDE_XmlSyntaxState::BreakElement;
209 break;
210 }
211 } else if (node_type_stack.top() ==
213 if (ch == L'?') {
214 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
215 current_buffer_idx++;
216 } else {
217 current_parser_state = FDE_XmlSyntaxState::TargetData;
218 }
219 break;
220 }
221 return false;
222 } else {
223 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
224 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
225 current_parser_state = FDE_XmlSyntaxState::TargetData;
226 break;
227 }
228 }
229 current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
230 current_attribute_name = GetTextData();
231 }
232 } else {
233 current_text_.push_back(ch);
234 current_buffer_idx++;
235 }
236 break;
237 case FDE_XmlSyntaxState::AttriEqualSign:
238 if (IsXMLWhiteSpace(ch)) {
239 current_buffer_idx++;
240 break;
241 }
242 if (ch != L'=') {
243 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
244 current_parser_state = FDE_XmlSyntaxState::TargetData;
245 break;
246 }
247 return false;
248 } else {
249 current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
250 current_buffer_idx++;
251 }
252 break;
253 case FDE_XmlSyntaxState::AttriQuotation:
254 if (IsXMLWhiteSpace(ch)) {
255 current_buffer_idx++;
256 break;
257 }
258 if (ch != L'\"' && ch != L'\'') {
259 return false;
260 }
261
262 current_quote_character = ch;
263 current_parser_state = FDE_XmlSyntaxState::AttriValue;
264 current_buffer_idx++;
265 break;
266 case FDE_XmlSyntaxState::AttriValue:
267 if (ch == current_quote_character) {
268 if (entity_start_.has_value())
269 return false;
270
271 current_quote_character = 0;
272 current_buffer_idx++;
273 current_parser_state = FDE_XmlSyntaxState::AttriName;
274
275 CFX_XMLElement* elem = ToXMLElement(current_node_);
276 if (elem)
277 elem->SetAttribute(current_attribute_name, GetTextData());
278
279 current_attribute_name.clear();
280 } else {
281 ProcessTextChar(ch);
282 current_buffer_idx++;
283 }
284 break;
285 case FDE_XmlSyntaxState::CloseInstruction:
286 if (ch != L'>') {
287 current_text_.push_back(ch);
288 current_parser_state = FDE_XmlSyntaxState::TargetData;
289 } else if (!current_text_.empty()) {
290 ProcessTargetData();
291 } else {
292 current_buffer_idx++;
293 if (node_type_stack.empty())
294 return false;
295
296 node_type_stack.pop();
297 current_parser_state = FDE_XmlSyntaxState::Text;
298
299 if (current_node_ &&
300 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
301 current_node_ = current_node_->GetParent();
302 }
303 break;
304 case FDE_XmlSyntaxState::BreakElement:
305 if (ch == L'>') {
306 current_parser_state = FDE_XmlSyntaxState::Text;
307 } else if (ch == L'/') {
308 current_parser_state = FDE_XmlSyntaxState::CloseElement;
309 } else {
310 return false;
311 }
312 current_buffer_idx++;
313 break;
314 case FDE_XmlSyntaxState::CloseElement:
315 if (!IsXMLNameChar(ch, current_text_.empty())) {
316 if (ch == L'>') {
317 if (node_type_stack.empty())
318 return false;
319
320 node_type_stack.pop();
321 current_parser_state = FDE_XmlSyntaxState::Text;
322
323 CFX_XMLElement* element = ToXMLElement(current_node_);
324 if (!element)
325 return false;
326
327 WideString element_name = GetTextData();
328 if (element_name.GetLength() > 0 &&
329 element_name != element->GetName()) {
330 return false;
331 }
332
333 current_node_ = current_node_->GetParent();
334 } else if (!IsXMLWhiteSpace(ch)) {
335 return false;
336 }
337 } else {
338 current_text_.push_back(ch);
339 }
340 current_buffer_idx++;
341 break;
342 case FDE_XmlSyntaxState::SkipCommentOrDecl: {
343 auto current_span =
344 pdfium::make_span(buffer).subspan(current_buffer_idx);
345 if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
346 current_buffer_idx += 2;
347 current_parser_state = FDE_XmlSyntaxState::SkipComment;
348 } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
349 current_buffer_idx += 7;
350 current_parser_state = FDE_XmlSyntaxState::SkipCData;
351 } else {
352 current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
353 current_character_to_skip_to = L'>';
354 character_to_skip_too_stack.push(L'>');
355 }
356 break;
357 }
358 case FDE_XmlSyntaxState::SkipCData: {
359 auto current_span =
360 pdfium::make_span(buffer).subspan(current_buffer_idx);
361 if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
362 current_buffer_idx += 3;
363 current_parser_state = FDE_XmlSyntaxState::Text;
364 current_node_->AppendLastChild(
365 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
366 } else {
367 current_text_.push_back(ch);
368 current_buffer_idx++;
369 }
370 break;
371 }
372 case FDE_XmlSyntaxState::SkipDeclNode:
373 if (current_character_to_skip_to == L'\'' ||
374 current_character_to_skip_to == L'\"') {
375 current_buffer_idx++;
376 if (ch != current_character_to_skip_to)
377 break;
378
379 character_to_skip_too_stack.pop();
380 if (character_to_skip_too_stack.empty())
381 current_parser_state = FDE_XmlSyntaxState::Text;
382 else
383 current_character_to_skip_to = character_to_skip_too_stack.top();
384 } else {
385 switch (ch) {
386 case L'<':
387 current_character_to_skip_to = L'>';
388 character_to_skip_too_stack.push(L'>');
389 break;
390 case L'[':
391 current_character_to_skip_to = L']';
392 character_to_skip_too_stack.push(L']');
393 break;
394 case L'(':
395 current_character_to_skip_to = L')';
396 character_to_skip_too_stack.push(L')');
397 break;
398 case L'\'':
399 current_character_to_skip_to = L'\'';
400 character_to_skip_too_stack.push(L'\'');
401 break;
402 case L'\"':
403 current_character_to_skip_to = L'\"';
404 character_to_skip_too_stack.push(L'\"');
405 break;
406 default:
407 if (ch == current_character_to_skip_to) {
408 character_to_skip_too_stack.pop();
409 if (character_to_skip_too_stack.empty()) {
410 current_parser_state = FDE_XmlSyntaxState::Text;
411 } else {
412 current_character_to_skip_to =
413 character_to_skip_too_stack.top();
414 }
415 }
416 break;
417 }
418 current_buffer_idx++;
419 }
420 break;
421 case FDE_XmlSyntaxState::SkipComment: {
422 auto current_span =
423 pdfium::make_span(buffer).subspan(current_buffer_idx);
424 if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
425 current_buffer_idx += 2;
426 current_parser_state = FDE_XmlSyntaxState::Text;
427 }
428 current_buffer_idx++;
429 break;
430 }
431 case FDE_XmlSyntaxState::TargetData:
432 if (IsXMLWhiteSpace(ch)) {
433 if (current_text_.empty()) {
434 current_buffer_idx++;
435 break;
436 }
437 if (current_quote_character == 0) {
438 current_buffer_idx++;
439 ProcessTargetData();
440 break;
441 }
442 }
443 if (ch == '?') {
444 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
445 current_buffer_idx++;
446 } else if (ch == '\"') {
447 if (current_quote_character == 0) {
448 current_quote_character = ch;
449 current_buffer_idx++;
450 } else if (ch == current_quote_character) {
451 current_quote_character = 0;
452 current_buffer_idx++;
453 ProcessTargetData();
454 } else {
455 return false;
456 }
457 } else {
458 current_text_.push_back(ch);
459 current_buffer_idx++;
460 }
461 break;
462 }
463 }
464 }
465
466 NOTREACHED_NORETURN();
467}
468
469void CFX_XMLParser::ProcessTextChar(wchar_t character) {
470 current_text_.push_back(character);
471
472 if (entity_start_.has_value() && character == L';') {
473 // Copy the entity out into a string and remove from the vector. When we
474 // copy the entity we don't want to copy out the & or the ; so we start
475 // shifted by one and want to copy 2 less characters in total.
476 WideString csEntity(current_text_.data() + entity_start_.value() + 1,
477 current_text_.size() - entity_start_.value() - 2);
478 current_text_.erase(current_text_.begin() + entity_start_.value(),
479 current_text_.end());
480
481 size_t iLen = csEntity.GetLength();
482 if (iLen > 0) {
483 if (csEntity[0] == L'#') {
484 uint32_t ch = 0;
485 if (iLen > 1 && csEntity[1] == L'x') {
486 for (size_t i = 2; i < iLen; i++) {
487 if (!FXSYS_IsHexDigit(csEntity[i]))
488 break;
489 ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
490 }
491 } else {
492 for (size_t i = 1; i < iLen; i++) {
493 if (!FXSYS_IsDecimalDigit(csEntity[i]))
494 break;
495 ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
496 }
497 }
498 if (ch > kMaxCharRange)
499 ch = ' ';
500
501 character = static_cast<wchar_t>(ch);
502 if (character != 0)
503 current_text_.push_back(character);
504 } else {
505 if (csEntity == L"amp") {
506 current_text_.push_back(L'&');
507 } else if (csEntity == L"lt") {
508 current_text_.push_back(L'<');
509 } else if (csEntity == L"gt") {
510 current_text_.push_back(L'>');
511 } else if (csEntity == L"apos") {
512 current_text_.push_back(L'\'');
513 } else if (csEntity == L"quot") {
514 current_text_.push_back(L'"');
515 }
516 }
517 }
518 entity_start_ = absl::nullopt;
519 } else if (!entity_start_.has_value() && character == L'&') {
520 entity_start_ = current_text_.size() - 1;
521 }
522}
523
524void CFX_XMLParser::ProcessTargetData() {
525 WideString target_data = GetTextData();
526 if (target_data.IsEmpty())
527 return;
528
529 CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
530 if (instruction)
531 instruction->AppendData(target_data);
532}
533
534WideString CFX_XMLParser::GetTextData() {
535 WideString ret(current_text_.data(), current_text_.size());
536 entity_start_ = absl::nullopt;
537 current_text_.clear();
538 current_text_.reserve(kCurrentTextReserve);
539 return ret;
540}
const WideString & GetName() const
void SetAttribute(const WideString &name, const WideString &value)
void AppendData(const WideString &wsData)
CFX_XMLParser(const RetainPtr< IFX_SeekableReadStream > &pStream)
static bool IsXMLNameChar(wchar_t ch, bool bFirstChar)
std::unique_ptr< CFX_XMLDocument > Parse()
WideString & operator=(WideString &&that) noexcept
bool operator==(const wchar_t *ptr) const
CharType operator[](const size_t index) const
Definition widestring.h:146
bool IsEmpty() const
Definition widestring.h:118
bool operator!=(const WideString &other) const
Definition widestring.h:140
bool EqualsASCII(ByteStringView that) const
Definition widestring.h:216
FX_CodePage
Definition fx_codepage.h:18
bool FXSYS_iswspace(wchar_t c)