7#include "core/fxcrt/xml/cfx_xmlparser.h"
16#include "core/fxcrt/autorestorer.h"
17#include "core/fxcrt/cfx_seekablestreamproxy.h"
18#include "core/fxcrt/check.h"
19#include "core/fxcrt/data_vector.h"
20#include "core/fxcrt/fx_codepage.h"
21#include "core/fxcrt/fx_extension.h"
22#include "core/fxcrt/fx_safe_types.h"
23#include "core/fxcrt/notreached.h"
24#include "core/fxcrt/xml/cfx_xmlchardata.h"
25#include "core/fxcrt/xml/cfx_xmldocument.h"
26#include "core/fxcrt/xml/cfx_xmlelement.h"
27#include "core/fxcrt/xml/cfx_xmlinstruction.h"
28#include "core/fxcrt/xml/cfx_xmlnode.h"
29#include "core/fxcrt/xml/cfx_xmltext.h"
// Capacity passed to current_text_.Reserve() wherever this file (re)initializes
// the text accumulator.
33constexpr size_t kCurrentTextReserve = 128;
// Upper bound that ProcessTextChar() compares a decoded numeric character
// reference against. 0x10ffff is the last Unicode code point.
34constexpr uint32_t kMaxCharRange = 0x10ffff;
// Returns true when `ch` is one of the four characters tested below: space,
// line feed (0x0A), carriage return (0x0D) or horizontal tab (0x09).
36bool IsXMLWhiteSpace(
wchar_t ch) {
37 return ch == L' ' || ch == 0x0A || ch == 0x0D || ch == 0x09;
// Entry type of the kXMLNameChars table. IsXMLNameChar() reads three members
// from it: wStart and wEnd (an inclusive character range) and bStartChar (a
// flag consulted only for the first character of a name).
40struct FX_XMLNAMECHAR {
// Character ranges accepted inside a name, listed in ascending order so that
// IsXMLNameChar() can binary-search the table with std::lower_bound.
// Each initializer is presumably {wStart, wEnd, bStartChar} -- confirm against
// the member order of FX_XMLNAMECHAR. Entries whose flag is false (e.g. '-'..'.',
// '0'..'9', ':') are therefore rejected as the first character of a name.
46constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
47 {L'-', L'.',
false}, {L'0', L'9',
false}, {L':', L':',
false},
48 {L'A', L'Z',
true}, {L'_', L'_',
true}, {L'a', L'z',
true},
49 {0xB7, 0xB7,
false}, {0xC0, 0xD6,
true}, {0xD8, 0xF6,
true},
50 {0xF8, 0x02FF,
true}, {0x0300, 0x036F,
false}, {0x0370, 0x037D,
true},
51 {0x037F, 0x1FFF,
true}, {0x200C, 0x200D,
true}, {0x203F, 0x2040,
false},
52 {0x2070, 0x218F,
true}, {0x2C00, 0x2FEF,
true}, {0x3001, 0xD7FF,
true},
53 {0xF900, 0xFDCF,
true}, {0xFDF0, 0xFFFD,
true},
// Binary search: find the first table entry whose wEnd is not below `ch`.
// Because the table is sorted, that is the only entry that can contain `ch`.
60 auto* it =
std::lower_bound(
61 std::begin(kXMLNameChars),
std::end(kXMLNameChars), ch,
62 [](
const FX_XMLNAMECHAR& arg,
wchar_t ch) {
return arg.wEnd < ch; });
// `ch` is a name character when an entry was found, `ch` is not below that
// entry's wStart, and -- only when bFirstChar is set -- the entry is flagged
// bStartChar.
63 return it !=
std::end(kXMLNameChars) && ch >= it->wStart &&
64 (!bFirstChar || it->bStartChar);
// Wrap the caller's stream in a CFX_SeekableStreamProxy.
// NOTE(review): how `proxy` becomes stream_ is not part of these lines -- confirm.
70 auto proxy =
pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
// Clamp the read-block size to the size reported by the stream; checked_cast
// guards the conversion of GetSize() to size_t.
78 xml_plane_size_ = std::min(xml_plane_size_,
79 pdfium::checked_cast<size_t>(stream_->GetSize()));
// Pre-reserve capacity in the text accumulator.
81 current_text_.Reserve(kCurrentTextReserve);
// New nodes are appended under current_node_, so start at the document root,
// then run the syntax parser over the stream. The branch below is taken when
// DoSyntaxParse() reports failure.
89 current_node_ = doc->GetRoot();
90 if (!DoSyntaxParse(doc.get())) {
// Character-driven state machine that builds the node tree under current_node_.
// The stream is read in blocks of at most xml_plane_size_ wide characters and
// every character is dispatched on current_parser_state.
//
// xml_plane_size_ is a size_t (it is passed to std::min together with a
// size_t), so `<= 0` can only mean `== 0`.
97 if (xml_plane_size_ <= 0)
// Overflow-checked xml_plane_size_ + 1: the buffer is allocated one element
// larger than a single read block.
100 FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
101 alloc_size_safe += 1;
102 if (!alloc_size_safe.IsValid())
// current_buffer_idx: next unread position in `buffer`.
// buffer_size: number of characters delivered by the most recent ReadBlock().
105 size_t current_buffer_idx = 0;
106 size_t buffer_size = 0;
108 DataVector<
wchar_t> buffer;
109 buffer.resize(alloc_size_safe.ValueOrDie());
// Terminator characters still awaited while skipping a declaration (see the
// SkipDeclNode state); the innermost one is on top.
111 std::stack<
wchar_t> character_to_skip_too_stack;
// Node types of the constructs currently open; popped when an instruction or
// element is closed. NOTE(review): the matching push() calls are not among
// these lines -- confirm every top()/pop() is guarded by empty().
112 std::stack<CFX_XMLNode::Type> node_type_stack;
114 FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
// 0 means "not inside a quoted section".
115 wchar_t current_quote_character = 0;
116 wchar_t current_character_to_skip_to = 0;
// Refill the buffer once the previous block is fully consumed.
119 if (current_buffer_idx >= buffer_size) {
120 if (stream_->IsEOF())
123 size_t buffer_chars =
124 stream_->ReadBlock(pdfium::make_span(buffer).first(xml_plane_size_));
125 if (buffer_chars == 0)
128 current_buffer_idx = 0;
129 buffer_size = buffer_chars;
// Consume the block one character at a time.
132 while (current_buffer_idx < buffer_size) {
133 wchar_t ch = buffer[current_buffer_idx];
134 switch (current_parser_state) {
// Text: in the branch shown, any accumulated text is flushed as a CFX_XMLText
// child of the current node and the parser moves to Node (presumably on '<' --
// confirm against the branch condition).
135 case FDE_XmlSyntaxState::Text:
137 if (!current_text_.IsEmpty()) {
138 current_node_->AppendLastChild(
139 doc->CreateNode<CFX_XMLText>(GetTextData()));
141 current_buffer_idx++;
142 current_parser_state = FDE_XmlSyntaxState::Node;
150 current_buffer_idx++;
// Node: classify the markup. '/' starts a closing tag and '?' an instruction
// target; the first branch leads to SkipCommentOrDecl (presumably on '!' --
// confirm); the remaining path is parsed as a tag name without consuming `ch`.
153 case FDE_XmlSyntaxState::Node:
155 current_buffer_idx++;
156 current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
157 }
else if (ch == L'/') {
158 current_buffer_idx++;
159 current_parser_state = FDE_XmlSyntaxState::CloseElement;
160 }
else if (ch == L'?') {
162 current_buffer_idx++;
163 current_parser_state = FDE_XmlSyntaxState::Target;
166 current_parser_state = FDE_XmlSyntaxState::Tag;
// Target: collect the instruction's target name. At the first non-name
// character a CFX_XMLInstruction is created, appended to the current node and
// made current; parsing continues in TargetData.
169 case FDE_XmlSyntaxState::Target:
170 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
171 if (current_text_.IsEmpty()) {
175 current_parser_state = FDE_XmlSyntaxState::TargetData;
180 auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
181 current_node_->AppendLastChild(node);
182 current_node_ = node;
186 current_buffer_idx++;
// Tag: collect the element name. At the first non-name character a
// CFX_XMLElement is created from the collected text, appended to the current
// node and made current; parsing continues in AttriName.
189 case FDE_XmlSyntaxState::Tag:
190 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
191 if (current_text_.IsEmpty()) {
195 current_parser_state = FDE_XmlSyntaxState::AttriName;
197 auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
198 current_node_->AppendLastChild(child);
199 current_node_ = child;
202 current_buffer_idx++;
// AttriName: skip whitespace before the name, then collect the attribute name.
// With no name collected, '>' or '/' ends the start tag (BreakElement); the
// node_type_stack branch leads to CloseInstruction, otherwise to TargetData.
// With a name collected, a character that is neither '=' nor whitespace sends
// the parser to TargetData; otherwise the name is stored in
// current_attribute_name and AttriEqualSign follows.
205 case FDE_XmlSyntaxState::AttriName:
206 if (current_text_.IsEmpty() && IsXMLWhiteSpace(ch)) {
207 current_buffer_idx++;
210 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
211 if (current_text_.IsEmpty()) {
213 if (ch == L'>' || ch == L'/') {
214 current_parser_state = FDE_XmlSyntaxState::BreakElement;
217 }
else if (node_type_stack.top() ==
220 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
221 current_buffer_idx++;
223 current_parser_state = FDE_XmlSyntaxState::TargetData;
230 if (ch !=
'=' && !IsXMLWhiteSpace(ch)) {
231 current_parser_state = FDE_XmlSyntaxState::TargetData;
235 current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
236 current_attribute_name
= GetTextData();
240 current_buffer_idx++;
// AttriEqualSign: skip whitespace. One path falls back to TargetData, the other
// consumes the character and expects the opening quote next (AttriQuotation).
243 case FDE_XmlSyntaxState::AttriEqualSign:
244 if (IsXMLWhiteSpace(ch)) {
245 current_buffer_idx++;
250 current_parser_state = FDE_XmlSyntaxState::TargetData;
255 current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
256 current_buffer_idx++;
// AttriQuotation: skip whitespace. A single or double quote is remembered in
// current_quote_character so that AttriValue knows which character ends the
// value.
259 case FDE_XmlSyntaxState::AttriQuotation:
260 if (IsXMLWhiteSpace(ch)) {
261 current_buffer_idx++;
264 if (ch != L'\"' && ch != L'\'') {
268 current_quote_character = ch;
269 current_parser_state = FDE_XmlSyntaxState::AttriValue;
270 current_buffer_idx++;
// AttriValue: the value ends at the quote character that opened it. A still
// open entity (entity_start_ set) at that point is handled by the first check.
// On completion the quote state and current_attribute_name are cleared and the
// parser returns to AttriName; presumably the attribute is stored on `elem`
// via SetAttribute -- confirm.
272 case FDE_XmlSyntaxState::AttriValue:
273 if (ch == current_quote_character) {
274 if (entity_start_.has_value())
277 current_quote_character = 0;
278 current_buffer_idx++;
279 current_parser_state = FDE_XmlSyntaxState::AttriName;
281 CFX_XMLElement* elem = ToXMLElement(current_node_);
285 current_attribute_name.clear();
288 current_buffer_idx++;
// CloseInstruction: one path falls back to TargetData. On the closing path the
// type stack is popped (after an empty() check), the parser returns to Text
// and, if the current node is an instruction, current_node_ moves back to its
// parent.
291 case FDE_XmlSyntaxState::CloseInstruction:
294 current_parser_state = FDE_XmlSyntaxState::TargetData;
295 }
else if (!current_text_.IsEmpty()) {
298 current_buffer_idx++;
299 if (node_type_stack.empty())
302 node_type_stack.pop();
303 current_parser_state = FDE_XmlSyntaxState::Text;
306 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
307 current_node_ = current_node_->GetParent();
// BreakElement: end of a start tag. One branch returns to Text (element stays
// open); '/' goes to CloseElement (self-closing form).
310 case FDE_XmlSyntaxState::BreakElement:
312 current_parser_state = FDE_XmlSyntaxState::Text;
313 }
else if (ch == L'/') {
314 current_parser_state = FDE_XmlSyntaxState::CloseElement;
318 current_buffer_idx++;
// CloseElement: collect the closing tag's name. On the closing path the type
// stack is popped (after an empty() check), the parser returns to Text and
// current_node_ moves to its parent. The element_name check presumably compares
// the closing name with the open element's name -- confirm.
320 case FDE_XmlSyntaxState::CloseElement:
321 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
323 if (node_type_stack.empty())
326 node_type_stack.pop();
327 current_parser_state = FDE_XmlSyntaxState::Text;
329 CFX_XMLElement* element = ToXMLElement(current_node_);
334 if (element_name.GetLength() > 0 &&
339 current_node_ = current_node_->GetParent();
340 }
else if (!IsXMLWhiteSpace(ch)) {
346 current_buffer_idx++;
// SkipCommentOrDecl: look ahead from the current position. "--" starts a
// comment, "[CDATA[" (matched case-insensitively) starts a CDATA section, and
// anything else is skipped as a declaration terminated by '>'.
// NOTE(review): the view below spans the rest of the allocation, not just the
// buffer_size valid characters, so the look-ahead can read past the data of
// the current block -- confirm this is intended and that First(n) cannot
// exceed the view.
348 case FDE_XmlSyntaxState::SkipCommentOrDecl: {
350 pdfium::make_span(buffer).subspan(current_buffer_idx));
351 if (current_view.First(2).EqualsASCII(
"--")) {
352 current_buffer_idx += 2;
353 current_parser_state = FDE_XmlSyntaxState::SkipComment;
354 }
else if (current_view.First(7).EqualsASCIINoCase(
"[CDATA[")) {
355 current_buffer_idx += 7;
356 current_parser_state = FDE_XmlSyntaxState::SkipCData;
358 current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
359 current_character_to_skip_to = L'>';
360 character_to_skip_too_stack.push(L'>');
// SkipCData: when "]]>" is seen, the accumulated text becomes a CFX_XMLCharData
// child of the current node and the parser returns to Text.
364 case FDE_XmlSyntaxState::SkipCData: {
366 pdfium::make_span(buffer).subspan(current_buffer_idx));
367 if (current_view.First(3).EqualsASCII(
"]]>")) {
368 current_buffer_idx += 3;
369 current_parser_state = FDE_XmlSyntaxState::Text;
370 current_node_->AppendLastChild(
371 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
374 current_buffer_idx++;
// SkipDeclNode: skip a declaration while tracking nesting. The awaited
// terminator is current_character_to_skip_to, mirrored on top of
// character_to_skip_too_stack. Inside quotes only the matching quote is
// significant. Outside quotes, nested openers push their own terminator
// ('>', ']', ')', or a quote). Popping the last terminator returns to Text.
378 case FDE_XmlSyntaxState::SkipDeclNode:
379 if (current_character_to_skip_to == L'\'' ||
380 current_character_to_skip_to == L'\"') {
381 current_buffer_idx++;
382 if (ch != current_character_to_skip_to)
385 character_to_skip_too_stack.pop();
386 if (character_to_skip_too_stack.empty())
387 current_parser_state = FDE_XmlSyntaxState::Text;
389 current_character_to_skip_to = character_to_skip_too_stack.top();
393 current_character_to_skip_to = L'>';
394 character_to_skip_too_stack.push(L'>');
397 current_character_to_skip_to = L']';
398 character_to_skip_too_stack.push(L']');
401 current_character_to_skip_to = L')';
402 character_to_skip_too_stack.push(L')');
405 current_character_to_skip_to = L'\'';
406 character_to_skip_too_stack.push(L'\'');
409 current_character_to_skip_to = L'\"';
410 character_to_skip_too_stack.push(L'\"');
413 if (ch == current_character_to_skip_to) {
414 character_to_skip_too_stack.pop();
415 if (character_to_skip_too_stack.empty()) {
416 current_parser_state = FDE_XmlSyntaxState::Text;
418 current_character_to_skip_to =
419 character_to_skip_too_stack.top();
424 current_buffer_idx++;
// SkipComment: when "-->" is seen the parser returns to Text.
427 case FDE_XmlSyntaxState::SkipComment: {
429 pdfium::make_span(buffer).subspan(current_buffer_idx));
430 if (current_view.First(3).EqualsASCII(
"-->")) {
431 current_buffer_idx += 2;
432 current_parser_state = FDE_XmlSyntaxState::Text;
434 current_buffer_idx++;
// TargetData: body of an instruction. Leading whitespace is skipped, a double
// quote toggles current_quote_character (open when 0, close when it matches),
// and one branch hands over to CloseInstruction (presumably on '?' -- confirm).
437 case FDE_XmlSyntaxState::TargetData:
438 if (IsXMLWhiteSpace(ch)) {
439 if (current_text_.IsEmpty()) {
440 current_buffer_idx++;
443 if (current_quote_character == 0) {
444 current_buffer_idx++;
450 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
451 current_buffer_idx++;
452 }
else if (ch ==
'\"') {
453 if (current_quote_character == 0) {
454 current_quote_character = ch;
455 current_buffer_idx++;
456 }
else if (ch == current_quote_character) {
457 current_quote_character = 0;
458 current_buffer_idx++;
465 current_buffer_idx++;
// Appends `character` to current_text_ and resolves entity references in place.
// A '&' records its position in entity_start_; the next ';' replaces the whole
// "&...;" run with the character it denotes.
475void CFX_XMLParser::ProcessTextChar(
wchar_t character) {
476 current_text_ += character;
478 if (entity_start_.has_value() && character == L';') {
// Entity name = text strictly between '&' (at entity_start_) and the ';' that
// was just appended, hence the "+ 1" start and "- 2" length.
483 entity_start_.value() + 1,
484 current_text_.GetLength() - entity_start_.value() - 2);
// Drop "&name;" from the accumulator before appending the replacement.
486 current_text_.Delete(entity_start_.value(),
487 current_text_.GetLength() - entity_start_.value());
489 size_t iLen = csEntity.GetLength();
// Numeric character reference: "#x" followed by hex digits, or "#" followed by
// decimal digits. NOTE(review): confirm csEntity cannot be empty here, since
// csEntity[0] is read before iLen is checked.
491 if (csEntity[0] == L'#') {
493 if (iLen > 1 && csEntity[1] == L'x') {
494 for (size_t i = 2; i < iLen; i++) {
495 if (!FXSYS_IsHexDigit(csEntity[i]))
497 ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
500 for (size_t i = 1; i < iLen; i++) {
501 if (!FXSYS_IsDecimalDigit(csEntity[i]))
503 ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
// Values above kMaxCharRange take the branch below.
506 if (ch > kMaxCharRange)
509 character =
static_cast<
wchar_t>(ch);
511 current_text_ += character;
// Named entities: each branch appends one literal character (presumably for
// amp, lt, gt, apos and quot respectively -- confirm against the comparisons).
514 current_text_ += L'&';
516 current_text_ += L'<';
518 current_text_ += L'>';
520 current_text_ += L'\'';
522 current_text_ += L'"';
// The entity is finished, whether or not it was recognized.
526 entity_start_ = std::nullopt;
527 }
else if (!entity_start_.has_value() && character == L'&') {
// Remember the index of the '&' that was just appended.
528 entity_start_ = current_text_.GetLength() - 1;
// Handles the text collected for an instruction's data.
532void CFX_XMLParser::ProcessTargetData() {
// Empty data takes the branch below.
534 if (target_data.IsEmpty())
// View the current node as an instruction; presumably the data is then appended
// to it via AppendData when the conversion succeeds -- confirm.
537 CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
// Reset the text accumulator state: re-reserve its working capacity and forget
// any entity that was still open.
// NOTE(review): presumably part of GetTextData() -- confirm the enclosing
// function.
544 current_text_.Reserve(kCurrentTextReserve);
545 entity_start_ = std::nullopt;
const WideString & GetName() const
void SetAttribute(const WideString &name, const WideString &value)
void AppendData(const WideString &wsData)
CFX_XMLParser(const RetainPtr< IFX_SeekableReadStream > &pStream)
static bool IsXMLNameChar(wchar_t ch, bool bFirstChar)
std::unique_ptr< CFX_XMLDocument > Parse()
WideString & operator=(WideString &&that) noexcept
bool operator!=(const WideString &other) const
bool EqualsASCII(ByteStringView that) const
bool FXSYS_iswspace(wchar_t c)
#define NOTREACHED_NORETURN()
fxcrt::WideStringView WideStringView
fxcrt::WideString WideString