Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
cpdf_syntax_parser.cpp
Go to the documentation of this file.
1// Copyright 2016 The PDFium Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdfapi/parser/cpdf_syntax_parser.h"
8
9#include <ctype.h>
10
11#include <algorithm>
12#include <utility>
13
14#include "core/fpdfapi/parser/cpdf_array.h"
15#include "core/fpdfapi/parser/cpdf_boolean.h"
16#include "core/fpdfapi/parser/cpdf_crypto_handler.h"
17#include "core/fpdfapi/parser/cpdf_dictionary.h"
18#include "core/fpdfapi/parser/cpdf_name.h"
19#include "core/fpdfapi/parser/cpdf_null.h"
20#include "core/fpdfapi/parser/cpdf_number.h"
21#include "core/fpdfapi/parser/cpdf_read_validator.h"
22#include "core/fpdfapi/parser/cpdf_reference.h"
23#include "core/fpdfapi/parser/cpdf_stream.h"
24#include "core/fpdfapi/parser/cpdf_string.h"
25#include "core/fpdfapi/parser/fpdf_parser_utility.h"
26#include "core/fxcrt/autorestorer.h"
27#include "core/fxcrt/cfx_read_only_vector_stream.h"
28#include "core/fxcrt/fixed_size_data_vector.h"
29#include "core/fxcrt/fx_extension.h"
30#include "core/fxcrt/fx_safe_types.h"
31#include "third_party/base/check.h"
32#include "third_party/base/check_op.h"
33
34namespace {
35
// Parser states used while decoding a literal "(...)" string and its
// backslash escapes.
enum class ReadStatus {
  kNormal,          // Outside of any escape sequence.
  kBackslash,       // The previous byte was a backslash.
  kOctal,           // One octal digit of an escape has been consumed.
  kFinishOctal,     // Two octal digits of an escape have been consumed.
  kCarriageReturn,  // A backslash was followed by '\r'.
};
43
44class ReadableSubStream final : public IFX_SeekableReadStream {
45 public:
46 ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
47 FX_FILESIZE part_offset,
48 FX_FILESIZE part_size)
49 : m_pFileRead(std::move(pFileRead)),
50 m_PartOffset(part_offset),
51 m_PartSize(part_size) {}
52
53 ~ReadableSubStream() override = default;
54
55 // IFX_SeekableReadStream overrides:
56 bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
57 FX_FILESIZE offset) override {
58 FX_SAFE_FILESIZE safe_end = offset;
59 safe_end += buffer.size();
60 // Check that requested range is valid, to prevent calling of ReadBlock
61 // of original m_pFileRead with incorrect params.
62 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
63 return false;
64
65 return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
66 }
67
68 FX_FILESIZE GetSize() override { return m_PartSize; }
69
70 private:
72 FX_FILESIZE m_PartOffset;
73 FX_FILESIZE m_PartSize;
74};
75
76} // namespace
77
78// static
79int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;
80
81// static
82std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
84 FX_FILESIZE HeaderOffset) {
85 return std::make_unique<CPDF_SyntaxParser>(
86 pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr),
87 HeaderOffset);
88}
89
93 pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
94 nullptr),
95 0) {}
96
98 FX_FILESIZE HeaderOffset)
100 m_HeaderOffset(HeaderOffset),
102 DCHECK(m_HeaderOffset <= m_FileLen);
103}
104
106
107bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
108 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
109 m_Pos = pos;
110 return GetNextChar(ch);
111}
112
113bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
114 if (read_pos >= m_FileLen)
115 return false;
116 size_t read_size = m_ReadBufferSize;
117 FX_SAFE_FILESIZE safe_end = read_pos;
118 safe_end += read_size;
119 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen)
120 read_size = m_FileLen - read_pos;
121
122 m_pFileBuf.resize(read_size);
123 if (!m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
124 m_pFileBuf.clear();
125 return false;
126 }
127
128 m_BufOffset = read_pos;
129 return true;
130}
131
132bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
133 FX_FILESIZE pos = m_Pos + m_HeaderOffset;
134 if (pos >= m_FileLen)
135 return false;
136
137 if (!IsPositionRead(pos) && !ReadBlockAt(pos))
138 return false;
139
140 ch = m_pFileBuf[pos - m_BufOffset];
141 m_Pos++;
142 return true;
143}
144
146 return m_FileLen - m_HeaderOffset;
147}
148
149bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
150 pos += m_HeaderOffset;
151 if (pos >= m_FileLen)
152 return false;
153
154 if (!IsPositionRead(pos)) {
155 FX_FILESIZE block_start = 0;
156 if (pos >= CPDF_Stream::kFileBufSize)
157 block_start = pos - CPDF_Stream::kFileBufSize + 1;
158 if (!ReadBlockAt(block_start) || !IsPositionRead(pos))
159 return false;
160 }
161 *ch = m_pFileBuf[pos - m_BufOffset];
162 return true;
163}
164
165bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
166 if (!m_pFileAccess->ReadBlockAtOffset(buffer, m_Pos + m_HeaderOffset))
167 return false;
168 m_Pos += buffer.size();
169 return true;
170}
171
172CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
173 m_WordSize = 0;
174 WordType word_type = WordType::kNumber;
175
177 uint8_t ch;
178 if (!GetNextChar(ch))
179 return word_type;
180
181 if (PDFCharIsDelimiter(ch)) {
182 word_type = WordType::kWord;
183
184 m_WordBuffer[m_WordSize++] = ch;
185 if (ch == '/') {
186 while (true) {
187 if (!GetNextChar(ch))
188 return word_type;
189
191 m_Pos--;
192 return word_type;
193 }
194
195 if (m_WordSize < sizeof(m_WordBuffer) - 1)
196 m_WordBuffer[m_WordSize++] = ch;
197 }
198 } else if (ch == '<') {
199 if (!GetNextChar(ch))
200 return word_type;
201
202 if (ch == '<')
203 m_WordBuffer[m_WordSize++] = ch;
204 else
205 m_Pos--;
206 } else if (ch == '>') {
207 if (!GetNextChar(ch))
208 return word_type;
209
210 if (ch == '>')
211 m_WordBuffer[m_WordSize++] = ch;
212 else
213 m_Pos--;
214 }
215 return word_type;
216 }
217
218 while (true) {
219 if (m_WordSize < sizeof(m_WordBuffer) - 1)
220 m_WordBuffer[m_WordSize++] = ch;
221
222 if (!PDFCharIsNumeric(ch))
223 word_type = WordType::kWord;
224
225 if (!GetNextChar(ch))
226 return word_type;
227
229 m_Pos--;
230 break;
231 }
232 }
233 return word_type;
234}
235
237 uint8_t ch;
238 if (!GetNextChar(ch))
239 return ByteString();
240
241 ByteString buf;
242 int32_t parlevel = 0;
243 ReadStatus status = ReadStatus::kNormal;
244 int32_t iEscCode = 0;
245 while (true) {
246 switch (status) {
247 case ReadStatus::kNormal:
248 if (ch == ')') {
249 if (parlevel == 0)
250 return ByteString(buf);
251 parlevel--;
252 } else if (ch == '(') {
253 parlevel++;
254 }
255 if (ch == '\\')
256 status = ReadStatus::kBackslash;
257 else
258 buf += static_cast<char>(ch);
259 break;
260 case ReadStatus::kBackslash:
261 if (FXSYS_IsOctalDigit(ch)) {
262 iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
263 status = ReadStatus::kOctal;
264 break;
265 }
266 if (ch == '\r') {
267 status = ReadStatus::kCarriageReturn;
268 break;
269 }
270 if (ch == 'n') {
271 buf += '\n';
272 } else if (ch == 'r') {
273 buf += '\r';
274 } else if (ch == 't') {
275 buf += '\t';
276 } else if (ch == 'b') {
277 buf += '\b';
278 } else if (ch == 'f') {
279 buf += '\f';
280 } else if (ch != '\n') {
281 buf += static_cast<char>(ch);
282 }
283 status = ReadStatus::kNormal;
284 break;
285 case ReadStatus::kOctal:
286 if (FXSYS_IsOctalDigit(ch)) {
287 iEscCode =
288 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
289 status = ReadStatus::kFinishOctal;
290 } else {
291 buf += static_cast<char>(iEscCode);
292 status = ReadStatus::kNormal;
293 continue;
294 }
295 break;
296 case ReadStatus::kFinishOctal:
297 status = ReadStatus::kNormal;
298 if (FXSYS_IsOctalDigit(ch)) {
299 iEscCode =
300 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
301 buf += static_cast<char>(iEscCode);
302 } else {
303 buf += static_cast<char>(iEscCode);
304 continue;
305 }
306 break;
307 case ReadStatus::kCarriageReturn:
308 status = ReadStatus::kNormal;
309 if (ch != '\n')
310 continue;
311 break;
312 }
313
314 if (!GetNextChar(ch))
315 break;
316 }
317
319 return buf;
320}
321
323 uint8_t ch;
324 if (!GetNextChar(ch))
325 return ByteString();
326
327 ByteString buf;
328 bool bFirst = true;
329 uint8_t code = 0;
330 while (true) {
331 if (ch == '>')
332 break;
333
334 if (isxdigit(ch)) {
335 int val = FXSYS_HexCharToInt(ch);
336 if (bFirst) {
337 code = val * 16;
338 } else {
339 code += val;
340 buf += static_cast<char>(code);
341 }
342 bFirst = !bFirst;
343 }
344
345 if (!GetNextChar(ch))
346 break;
347 }
348 if (!bFirst)
349 buf += static_cast<char>(code);
350
351 return buf;
352}
353
355 uint8_t ch;
356 while (GetNextChar(ch)) {
357 if (ch == '\n')
358 break;
359
360 if (ch == '\r') {
362 if (ch != '\n')
363 --m_Pos;
364 break;
365 }
366 }
367}
368
370 if (m_TrailerEnds) {
372 return;
373 }
374
375 uint8_t ch;
376 if (!GetNextChar(ch))
377 return;
378
379 while (true) {
380 while (PDFCharIsWhitespace(ch)) {
381 if (!GetNextChar(ch))
382 return;
383 }
384
385 if (ch != '%')
386 break;
387
388 while (true) {
389 if (!GetNextChar(ch))
390 return;
392 break;
393 }
394 }
395 m_Pos--;
396}
397
// A state machine which goes % -> E -> O -> F -> line ending.
enum class EofState {
  kInitial = 0,  // Nothing significant seen yet.
  kNonPercent,   // First significant byte was not '%'.
  kPercent,      // Inside a run of '%' characters.
  kE,            // Seen "%E".
  kO,            // Seen "%EO".
  kF,            // Seen "%EOF"; a line ending is expected next.
  kInvalid,      // The comment is not an EOF marker.
};
408
410 DCHECK(m_TrailerEnds);
411
412 EofState eof_state = EofState::kInitial;
413 // Find the first character which is neither whitespace, nor part of a
414 // comment.
415 while (true) {
416 uint8_t ch;
417 if (!GetNextChar(ch))
418 return;
419 switch (eof_state) {
422 eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
423 break;
425 break;
427 if (ch == 'E')
428 eof_state = EofState::kE;
429 else if (ch != '%')
430 eof_state = EofState::kInvalid;
431 break;
432 case EofState::kE:
433 eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
434 break;
435 case EofState::kO:
436 eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
437 break;
438 case EofState::kF:
439 if (ch == '\r') {
440 // See if \r has to be combined with a \n that follows it
441 // immediately.
442 if (GetNextChar(ch) && ch != '\n') {
443 ch = '\r';
444 m_Pos--;
445 }
446 }
447 // If we now have a \r, that's not followed by a \n, so both are OK.
448 if (ch == '\r' || ch == '\n')
449 m_TrailerEnds->push_back(m_Pos);
450 eof_state = EofState::kInvalid;
451 break;
453 break;
454 }
456 eof_state = EofState::kInitial;
457 if (eof_state == EofState::kNonPercent)
458 break;
459 }
460 m_Pos--;
461}
462
465 WordType word_type = GetNextWordInternal();
466 ByteString word;
467 if (!GetValidator()->has_read_problems())
468 word = ByteString(m_WordBuffer, m_WordSize);
469 return {word, word_type == WordType::kNumber};
470}
471
473 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
474 return GetNextWord().word;
475}
476
478 return GetNextWord().word;
479}
480
482 DCHECK_GE(pos, 0);
483 m_Pos = std::min(pos, m_FileLen);
484}
485
487 CPDF_IndirectObjectHolder* pObjList) {
489 auto result = GetObjectBodyInternal(pObjList, ParseType::kLoose);
490 if (GetValidator()->has_read_problems())
491 return nullptr;
492 return result;
493}
494
495RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
497 ParseType parse_type) {
498 AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
499 if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
500 return nullptr;
501
502 FX_FILESIZE SavedObjPos = m_Pos;
503 WordResult word_result = GetNextWord();
504 const ByteString& word = word_result.word;
505 if (word.IsEmpty())
506 return nullptr;
507
508 if (word_result.is_number) {
509 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
510 WordResult nextword = GetNextWord();
511 if (!nextword.is_number)
512 return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
513
514 WordResult nextword2 = GetNextWord();
515 if (nextword2.word != "R")
516 return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
517
518 pos_restorer.AbandonRestoration();
519 uint32_t refnum = FXSYS_atoui(word.c_str());
520 if (refnum == CPDF_Object::kInvalidObjNum)
521 return nullptr;
522
523 return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
524 }
525
526 if (word == "true" || word == "false")
527 return pdfium::MakeRetain<CPDF_Boolean>(word == "true");
528
529 if (word == "null")
530 return pdfium::MakeRetain<CPDF_Null>();
531
532 if (word == "(") {
533 ByteString str = ReadString();
534 return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
535 }
536 if (word == "<") {
537 ByteString str = ReadHexString();
538 return pdfium::MakeRetain<CPDF_String>(m_pPool, str, true);
539 }
540 if (word == "[") {
541 auto pArray = pdfium::MakeRetain<CPDF_Array>();
542 while (RetainPtr<CPDF_Object> pObj =
543 GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
544 pArray->Append(std::move(pObj));
545 }
546 return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
547 ? std::move(pArray)
548 : nullptr;
549 }
550 if (word[0] == '/') {
551 return pdfium::MakeRetain<CPDF_Name>(
552 m_pPool,
553 PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1)));
554 }
555 if (word == "<<") {
556 RetainPtr<CPDF_Dictionary> pDict =
557 pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
558 while (true) {
559 WordResult inner_word_result = GetNextWord();
560 const ByteString& inner_word = inner_word_result.word;
561 if (inner_word.IsEmpty())
562 return nullptr;
563
564 FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
565 if (inner_word == ">>")
566 break;
567
568 if (inner_word == "endobj") {
569 m_Pos = SavedPos;
570 break;
571 }
572 if (inner_word[0] != '/')
573 continue;
574
575 ByteString key = PDF_NameDecode(inner_word.AsStringView());
576 if (key.IsEmpty() && parse_type == ParseType::kLoose)
577 continue;
578
579 RetainPtr<CPDF_Object> pObj =
580 GetObjectBodyInternal(pObjList, ParseType::kLoose);
581 if (!pObj) {
582 if (parse_type == ParseType::kLoose)
583 continue;
584
586 return nullptr;
587 }
588
589 // `key` has to be "/X" at the minimum.
590 // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
591 if (key.GetLength() > 1 && !pObj->IsStream()) {
592 pDict->SetFor(key.Substr(1), std::move(pObj));
593 }
594 }
595
596 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
597 if (GetNextWord().word != "stream")
598 return pDict;
599 pos_restorer.AbandonRestoration();
600 return ReadStream(std::move(pDict));
601 }
602 if (word == ">>")
603 m_Pos = SavedObjPos;
604
605 return nullptr;
606}
607
610 ParseType parse_type) {
612 const FX_FILESIZE saved_pos = GetPos();
613
614 WordResult objnum_word_result = GetNextWord();
615 if (!objnum_word_result.is_number || objnum_word_result.word.IsEmpty()) {
616 SetPos(saved_pos);
617 return nullptr;
618 }
619 const uint32_t parser_objnum = FXSYS_atoui(objnum_word_result.word.c_str());
620
621 WordResult gennum_word_result = GetNextWord();
622 const ByteString& gennum_word = gennum_word_result.word;
623 if (!gennum_word_result.is_number || gennum_word.IsEmpty()) {
624 SetPos(saved_pos);
625 return nullptr;
626 }
627 const uint32_t parser_gennum = FXSYS_atoui(gennum_word.c_str());
628
629 if (GetKeyword() != "obj") {
630 SetPos(saved_pos);
631 return nullptr;
632 }
633
634 RetainPtr<CPDF_Object> pObj = GetObjectBodyInternal(pObjList, parse_type);
635 if (pObj) {
636 pObj->SetObjNum(parser_objnum);
637 pObj->SetGenNum(parser_gennum);
638 }
639
640 return GetValidator()->has_read_problems() ? nullptr : std::move(pObj);
641}
642
643unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
644 unsigned char byte1 = 0;
645 unsigned char byte2 = 0;
646
647 GetCharAt(pos, byte1);
648 GetCharAt(pos + 1, byte2);
649
650 if (byte1 == '\r' && byte2 == '\n')
651 return 2;
652
653 if (byte1 == '\r' || byte1 == '\n')
654 return 1;
655
656 return 0;
657}
658
659FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
660 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
661 FX_FILESIZE end_offset = FindTag(word);
662 while (end_offset >= 0) {
663 // Stop searching when word is found.
664 if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true))
665 return GetPos() - word.GetLength();
666
667 end_offset = FindTag(word);
668 }
669 return -1;
670}
671
672FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
673 const ByteStringView kEndStreamStr("endstream");
674 const ByteStringView kEndObjStr("endobj");
675
676 FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr);
677 FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr);
678
679 // Can't find "endstream" or "endobj".
680 if (endStreamWordOffset < 0 && endObjWordOffset < 0) {
681 return -1;
682 }
683
684 if (endStreamWordOffset < 0 && endObjWordOffset >= 0) {
685 // Correct the position of end stream.
686 endStreamWordOffset = endObjWordOffset;
687 } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) {
688 // Correct the position of end obj.
689 endObjWordOffset = endStreamWordOffset;
690 } else if (endStreamWordOffset > endObjWordOffset) {
691 endStreamWordOffset = endObjWordOffset;
692 }
693
694 int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2);
695 if (numMarkers == 2) {
696 endStreamWordOffset -= 2;
697 } else {
698 numMarkers = ReadEOLMarkers(endStreamWordOffset - 1);
699 if (numMarkers == 1) {
700 endStreamWordOffset -= 1;
701 }
702 }
703 if (endStreamWordOffset < GetPos()) {
704 return -1;
705 }
706 return endStreamWordOffset;
707}
708
709RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
710 RetainPtr<CPDF_Dictionary> pDict) {
711 RetainPtr<const CPDF_Number> pLenObj =
712 ToNumber(pDict->GetDirectObjectFor("Length"));
713 FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;
714
715 // Locate the start of stream.
717 const FX_FILESIZE streamStartPos = GetPos();
718
719 if (len > 0) {
720 FX_SAFE_FILESIZE pos = GetPos();
721 pos += len;
722 if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
723 len = -1;
724 }
725
727 if (len > 0) {
728 // Check data availability first to allow the Validator to request data
729 // smoothly, without jumps.
730 if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
731 m_HeaderOffset + GetPos(), len)) {
732 return nullptr;
733 }
734
735 substream = pdfium::MakeRetain<ReadableSubStream>(
736 GetValidator(), m_HeaderOffset + GetPos(), len);
737 SetPos(GetPos() + len);
738 }
739
740 const ByteStringView kEndStreamStr("endstream");
741 const ByteStringView kEndObjStr("endobj");
742
743 // Note, we allow zero length streams as we need to pass them through when we
744 // are importing pages into a new document.
745 if (len >= 0) {
747 m_Pos += ReadEOLMarkers(GetPos());
748 memset(m_WordBuffer, 0, kEndStreamStr.GetLength() + 1);
749 GetNextWordInternal();
750 if (GetValidator()->has_read_problems())
751 return nullptr;
752
753 // Earlier version of PDF specification doesn't require EOL marker before
754 // 'endstream' keyword. If keyword 'endstream' follows the bytes in
755 // specified length, it signals the end of stream.
756 if (memcmp(m_WordBuffer, kEndStreamStr.raw_str(),
757 kEndStreamStr.GetLength()) != 0) {
758 substream.Reset();
759 len = -1;
760 SetPos(streamStartPos);
761 }
762 }
763
764 if (len < 0) {
765 // If len is not available or incorrect, len needs to be calculated
766 // by searching the keywords "endstream" or "endobj".
767 const FX_FILESIZE streamEndPos = FindStreamEndPos();
768 if (streamEndPos < 0)
769 return nullptr;
770
771 len = streamEndPos - streamStartPos;
772 DCHECK_GE(len, 0);
773 if (len > 0) {
774 SetPos(streamStartPos);
775 // Check data availability first to allow the Validator to request data
776 // smoothly, without jumps.
777 if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
778 m_HeaderOffset + GetPos(), len)) {
779 return nullptr;
780 }
781
782 substream = pdfium::MakeRetain<ReadableSubStream>(
783 GetValidator(), m_HeaderOffset + GetPos(), len);
784 SetPos(GetPos() + len);
785 }
786 }
787
788 RetainPtr<CPDF_Stream> pStream;
789 if (substream) {
790 // It is unclear from CPDF_SyntaxParser's perspective what object
791 // `substream` is ultimately holding references to. To avoid unexpectedly
792 // changing object lifetimes by handing `substream` to `pStream`, make a
793 // copy of the data here.
794 auto data = FixedSizeDataVector<uint8_t>::Uninit(substream->GetSize());
795 bool did_read = substream->ReadBlockAtOffset(data.span(), 0);
796 CHECK(did_read);
797 auto data_as_stream =
798 pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));
799
800 pStream = pdfium::MakeRetain<CPDF_Stream>();
801 pStream->InitStreamFromFile(std::move(data_as_stream), std::move(pDict));
802 } else {
803 DCHECK(!len);
804 pStream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
805 }
806 const FX_FILESIZE end_stream_offset = GetPos();
807 memset(m_WordBuffer, 0, kEndObjStr.GetLength() + 1);
808 GetNextWordInternal();
809
810 // Allow whitespace after endstream and before a newline.
811 unsigned char ch = 0;
812 while (GetNextChar(ch)) {
814 break;
815 }
817
818 int numMarkers = ReadEOLMarkers(GetPos());
819 if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
820 numMarkers != 0 &&
821 memcmp(m_WordBuffer, kEndObjStr.raw_str(), kEndObjStr.GetLength()) == 0) {
822 SetPos(end_stream_offset);
823 }
824 return pStream;
825}
826
828 if (GetNextWordInternal() != WordType::kNumber)
829 return 0;
830
831 m_WordBuffer[m_WordSize] = 0;
832 return FXSYS_atoui(reinterpret_cast<const char*>(m_WordBuffer));
833}
834
836 return m_pFileAccess;
837}
838
839bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
840 FX_FILESIZE limit,
841 ByteStringView tag,
842 bool checkKeyword) {
843 const uint32_t taglen = tag.GetLength();
844
845 bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
846 bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
847 !PDFCharIsWhitespace(tag[taglen - 1]);
848
849 uint8_t ch;
850 if (bCheckRight && startpos + static_cast<int32_t>(taglen) <= limit &&
851 GetCharAt(startpos + static_cast<int32_t>(taglen), ch)) {
853 (checkKeyword && PDFCharIsDelimiter(ch))) {
854 return false;
855 }
856 }
857
858 if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
860 (checkKeyword && PDFCharIsDelimiter(ch))) {
861 return false;
862 }
863 }
864 return true;
865}
866
867bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
868 FX_FILESIZE limit) {
869 int32_t taglen = word.GetLength();
870 if (taglen == 0)
871 return false;
872
873 FX_FILESIZE pos = m_Pos;
874 int32_t offset = taglen - 1;
875 while (true) {
876 if (limit && pos <= m_Pos - limit)
877 return false;
878
879 uint8_t byte;
880 if (!GetCharAtBackward(pos, &byte))
881 return false;
882
883 if (byte == word[offset]) {
884 offset--;
885 if (offset >= 0) {
886 pos--;
887 continue;
888 }
889 if (IsWholeWord(pos, limit, word, false)) {
890 m_Pos = pos;
891 return true;
892 }
893 }
894 offset = byte == word[taglen - 1] ? taglen - 2 : taglen - 1;
895 pos--;
896 if (pos < 0)
897 return false;
898 }
899}
900
902 const FX_FILESIZE startpos = GetPos();
903 const int32_t taglen = tag.GetLength();
904 DCHECK_GT(taglen, 0);
905
906 int32_t match = 0;
907 while (true) {
908 uint8_t ch;
909 if (!GetNextChar(ch))
910 return -1;
911
912 if (ch == tag[match]) {
913 match++;
914 if (match == taglen)
915 return GetPos() - startpos - taglen;
916 } else {
917 match = ch == tag[0] ? 1 : 0;
918 }
919 }
920}
921
922bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
923 return m_BufOffset <= pos &&
924 pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
925}
static constexpr uint32_t kInvalidObjNum
Definition cpdf_object.h:52
static constexpr int kFileBufSize
Definition cpdf_stream.h:25
RetainPtr< CPDF_Object > GetIndirectObject(CPDF_IndirectObjectHolder *pObjList, ParseType parse_type)
bool BackwardsSearchToWord(ByteStringView word, FX_FILESIZE limit)
RetainPtr< CPDF_ReadValidator > GetValidator() const
CPDF_SyntaxParser(RetainPtr< IFX_SeekableReadStream > pFileAccess)
FX_FILESIZE FindTag(ByteStringView tag)
FX_FILESIZE GetPos() const
FX_FILESIZE GetDocumentSize() const
CPDF_SyntaxParser(RetainPtr< CPDF_ReadValidator > pValidator, FX_FILESIZE HeaderOffset)
bool GetNextChar(uint8_t &ch)
bool ReadBlock(pdfium::span< uint8_t > buffer)
void SetPos(FX_FILESIZE pos)
RetainPtr< CPDF_Object > GetObjectBody(CPDF_IndirectObjectHolder *pObjList)
bool GetCharAt(FX_FILESIZE pos, uint8_t &ch)
ByteString & operator+=(char ch)
bool operator==(const char *ptr) const
ByteString(const ByteString &other)
const char * c_str() const
Definition bytestring.h:76
ByteString & operator=(ByteString &&that) noexcept
bool IsEmpty() const
Definition bytestring.h:119
CharType operator[](const size_t index) const
Definition bytestring.h:150
bool operator!=(const char *ptr) const
Definition bytestring.h:130
bool PDFCharIsWhitespace(uint8_t c)
bool PDFCharIsOther(uint8_t c)
bool PDFCharIsNumeric(uint8_t c)
bool PDFCharIsDelimiter(uint8_t c)
bool PDFCharIsLineEnding(uint8_t c)
bool FXSYS_IsOctalDigit(char c)
int FXSYS_DecimalCharToInt(wchar_t c)
int FXSYS_HexCharToInt(char c)
uint32_t FXSYS_atoui(const char *str)
#define FX_FILESIZE
Definition fx_types.h:19
#define CHECK(cvref)