Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
cpdf_cmap.cpp
Go to the documentation of this file.
1// Copyright 2017 The PDFium Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdfapi/font/cpdf_cmap.h"
8
9#include <utility>
10#include <vector>
11
12#include "core/fpdfapi/cmaps/fpdf_cmaps.h"
13#include "core/fpdfapi/font/cpdf_cmapparser.h"
14#include "core/fpdfapi/font/cpdf_fontglobals.h"
15#include "core/fpdfapi/parser/cpdf_simple_parser.h"
16#include "third_party/base/check.h"
17
18namespace {
19
// A contiguous range of byte values. Both endpoints belong to the range.
20struct ByteRange {
// Lowest byte value in the range.
21 uint8_t m_First;
// Highest byte value in the range.
22 uint8_t m_Last; // Inclusive.
23};
24
// One row of the table of predefined CMaps. Kept as a plain aggregate so the
// table can be brace-initialized as a constexpr array.
25struct PredefinedCMap {
// Name that lookups compare against.
26 const char* m_pName; // Raw, POD struct.
// Character collection, coding, and coding scheme copied into a CPDF_CMap
// that is constructed from this entry.
27 CIDSet m_Charset;
28 CIDCoding m_Coding;
29 CPDF_CMap::CodingScheme m_CodingScheme;
// Number of entries of m_LeadingSegs that are in use.
30 uint8_t m_LeadingSegCount;
// Byte ranges whose values are flagged as leading bytes for this CMap.
31 ByteRange m_LeadingSegs[2];
32};
33
34constexpr PredefinedCMap kPredefinedCMaps[] = {
35 {"GB-EUC",
38 CPDF_CMap::MixedTwoBytes,
39 1,
40 {{0xa1, 0xfe}}},
41 {"GBpc-EUC",
44 CPDF_CMap::MixedTwoBytes,
45 1,
46 {{0xa1, 0xfc}}},
47 {"GBK-EUC",
50 CPDF_CMap::MixedTwoBytes,
51 1,
52 {{0x81, 0xfe}}},
53 {"GBKp-EUC",
56 CPDF_CMap::MixedTwoBytes,
57 1,
58 {{0x81, 0xfe}}},
59 {"GBK2K-EUC",
62 CPDF_CMap::MixedTwoBytes,
63 1,
64 {{0x81, 0xfe}}},
65 {"GBK2K",
68 CPDF_CMap::MixedTwoBytes,
69 1,
70 {{0x81, 0xfe}}},
71 {"UniGB-UCS2", CIDSET_GB1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
72 {"UniGB-UTF16", CIDSET_GB1, CIDCoding::kUTF16, CPDF_CMap::TwoBytes, 0, {}},
73 {"B5pc",
76 CPDF_CMap::MixedTwoBytes,
77 1,
78 {{0xa1, 0xfc}}},
79 {"HKscs-B5",
82 CPDF_CMap::MixedTwoBytes,
83 1,
84 {{0x88, 0xfe}}},
85 {"ETen-B5",
88 CPDF_CMap::MixedTwoBytes,
89 1,
90 {{0xa1, 0xfe}}},
91 {"ETenms-B5",
94 CPDF_CMap::MixedTwoBytes,
95 1,
96 {{0xa1, 0xfe}}},
97 {"UniCNS-UCS2", CIDSET_CNS1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
98 {"UniCNS-UTF16",
101 CPDF_CMap::TwoBytes,
102 0,
103 {}},
104 {"83pv-RKSJ",
107 CPDF_CMap::MixedTwoBytes,
108 2,
109 {{0x81, 0x9f}, {0xe0, 0xfc}}},
110 {"90ms-RKSJ",
113 CPDF_CMap::MixedTwoBytes,
114 2,
115 {{0x81, 0x9f}, {0xe0, 0xfc}}},
116 {"90msp-RKSJ",
119 CPDF_CMap::MixedTwoBytes,
120 2,
121 {{0x81, 0x9f}, {0xe0, 0xfc}}},
122 {"90pv-RKSJ",
125 CPDF_CMap::MixedTwoBytes,
126 2,
127 {{0x81, 0x9f}, {0xe0, 0xfc}}},
128 {"Add-RKSJ",
131 CPDF_CMap::MixedTwoBytes,
132 2,
133 {{0x81, 0x9f}, {0xe0, 0xfc}}},
134 {"EUC",
137 CPDF_CMap::MixedTwoBytes,
138 2,
139 {{0x8e, 0x8e}, {0xa1, 0xfe}}},
140 {"H",
143 CPDF_CMap::TwoBytes,
144 1,
145 {{0x21, 0x7e}}},
146 {"V",
149 CPDF_CMap::TwoBytes,
150 1,
151 {{0x21, 0x7e}}},
152 {"Ext-RKSJ",
155 CPDF_CMap::MixedTwoBytes,
156 2,
157 {{0x81, 0x9f}, {0xe0, 0xfc}}},
158 {"UniJIS-UCS2",
161 CPDF_CMap::TwoBytes,
162 0,
163 {}},
164 {"UniJIS-UCS2-HW",
167 CPDF_CMap::TwoBytes,
168 0,
169 {}},
170 {"UniJIS-UTF16",
173 CPDF_CMap::TwoBytes,
174 0,
175 {}},
176 {"KSC-EUC",
179 CPDF_CMap::MixedTwoBytes,
180 1,
181 {{0xa1, 0xfe}}},
182 {"KSCms-UHC",
185 CPDF_CMap::MixedTwoBytes,
186 1,
187 {{0x81, 0xfe}}},
188 {"KSCms-UHC-HW",
191 CPDF_CMap::MixedTwoBytes,
192 1,
193 {{0x81, 0xfe}}},
194 {"KSCpc-EUC",
197 CPDF_CMap::MixedTwoBytes,
198 1,
199 {{0xa1, 0xfd}}},
200 {"UniKS-UCS2", CIDSET_KOREA1, CIDCoding::kUCS2, CPDF_CMap::TwoBytes, 0, {}},
201 {"UniKS-UTF16",
204 CPDF_CMap::TwoBytes,
205 0,
206 {}},
207};
208
209const PredefinedCMap* GetPredefinedCMap(ByteStringView cmapid) {
210 if (cmapid.GetLength() > 2)
211 cmapid = cmapid.First(cmapid.GetLength() - 2);
212 for (const auto& map : kPredefinedCMaps) {
213 if (cmapid == map.m_pName)
214 return &map;
215 }
216 return nullptr;
217}
218
219std::vector<bool> LoadLeadingSegments(const PredefinedCMap& map) {
220 std::vector<bool> segments(256);
221 for (uint32_t i = 0; i < map.m_LeadingSegCount; ++i) {
222 const ByteRange& seg = map.m_LeadingSegs[i];
223 for (int b = seg.m_First; b <= seg.m_Last; ++b)
224 segments[b] = true;
225 }
226 return segments;
227}
228
229int CheckFourByteCodeRange(uint8_t* codes,
230 size_t size,
231 const std::vector<CPDF_CMap::CodeRange>& ranges) {
232 for (size_t i = ranges.size(); i > 0; i--) {
233 size_t seg = i - 1;
234 if (ranges[seg].m_CharSize < size)
235 continue;
236 size_t iChar = 0;
237 while (iChar < size) {
238 if (codes[iChar] < ranges[seg].m_Lower[iChar] ||
239 codes[iChar] > ranges[seg].m_Upper[iChar]) {
240 break;
241 }
242 ++iChar;
243 }
244 if (iChar == ranges[seg].m_CharSize)
245 return 2;
246 if (iChar)
247 return (size == ranges[seg].m_CharSize) ? 2 : 1;
248 }
249 return 0;
250}
251
252size_t GetFourByteCharSizeImpl(
253 uint32_t charcode,
254 const std::vector<CPDF_CMap::CodeRange>& ranges) {
255 if (ranges.empty())
256 return 1;
257
258 uint8_t codes[4];
259 codes[0] = codes[1] = 0x00;
260 codes[2] = static_cast<uint8_t>(charcode >> 8 & 0xFF);
261 codes[3] = static_cast<uint8_t>(charcode);
262 for (size_t offset = 0; offset < 4; offset++) {
263 size_t size = 4 - offset;
264 for (size_t j = 0; j < ranges.size(); j++) {
265 size_t iSeg = (ranges.size() - 1) - j;
266 if (ranges[iSeg].m_CharSize < size)
267 continue;
268 size_t iChar = 0;
269 while (iChar < size) {
270 if (codes[offset + iChar] < ranges[iSeg].m_Lower[iChar] ||
271 codes[offset + iChar] > ranges[iSeg].m_Upper[iChar]) {
272 break;
273 }
274 ++iChar;
275 }
276 if (iChar == ranges[iSeg].m_CharSize)
277 return size;
278 }
279 }
280 return 1;
281}
282
283const fxcmap::CMap* FindEmbeddedCMap(pdfium::span<const fxcmap::CMap> pCMaps,
284 ByteStringView bsName) {
285 for (size_t i = 0; i < pCMaps.size(); i++) {
286 if (bsName == pCMaps[i].m_Name)
287 return &pCMaps[i];
288 }
289 return nullptr;
290}
291
292} // namespace
293
294CPDF_CMap::CPDF_CMap(ByteStringView bsPredefinedName)
295 : m_bVertical(bsPredefinedName.Back() == 'V') {
296 if (bsPredefinedName == "Identity-H" || bsPredefinedName == "Identity-V") {
297 m_Coding = CIDCoding::kCID;
298 m_bLoaded = true;
299 return;
300 }
301
302 const PredefinedCMap* map = GetPredefinedCMap(bsPredefinedName);
303 if (!map)
304 return;
305
306 m_Charset = map->m_Charset;
307 m_Coding = map->m_Coding;
308 m_CodingScheme = map->m_CodingScheme;
309 if (m_CodingScheme == MixedTwoBytes)
310 m_MixedTwoByteLeadingBytes = LoadLeadingSegments(*map);
311 m_pEmbedMap = FindEmbeddedCMap(
312 CPDF_FontGlobals::GetInstance()->GetEmbeddedCharset(m_Charset),
313 bsPredefinedName);
314 if (!m_pEmbedMap)
315 return;
316
317 m_bLoaded = true;
318}
319
320CPDF_CMap::CPDF_CMap(pdfium::span<const uint8_t> spEmbeddedData)
321 : m_DirectCharcodeToCIDTable(
322 FixedSizeDataVector<uint16_t>::Zeroed(kDirectMapTableSize)) {
323 CPDF_CMapParser parser(this);
324 CPDF_SimpleParser syntax(spEmbeddedData);
325 while (true) {
326 ByteStringView word = syntax.GetWord();
327 if (word.IsEmpty()) {
328 break;
329 }
330 parser.ParseWord(word);
331 }
332}
333
334CPDF_CMap::~CPDF_CMap() = default;
335
336uint16_t CPDF_CMap::CIDFromCharCode(uint32_t charcode) const {
337 if (m_Coding == CIDCoding::kCID)
338 return static_cast<uint16_t>(charcode);
339
340 if (m_pEmbedMap)
341 return fxcmap::CIDFromCharCode(m_pEmbedMap, charcode);
342
343 if (m_DirectCharcodeToCIDTable.empty())
344 return static_cast<uint16_t>(charcode);
345
346 auto table_span = m_DirectCharcodeToCIDTable.span();
347 if (charcode < table_span.size())
348 return table_span[charcode];
349
350 auto it = std::lower_bound(m_AdditionalCharcodeToCIDMappings.begin(),
351 m_AdditionalCharcodeToCIDMappings.end(), charcode,
352 [](const CPDF_CMap::CIDRange& arg, uint32_t val) {
353 return arg.m_EndCode < val;
354 });
355 if (it == m_AdditionalCharcodeToCIDMappings.end() ||
356 it->m_StartCode > charcode) {
357 return 0;
358 }
359 return it->m_StartCID + charcode - it->m_StartCode;
360}
361
362uint32_t CPDF_CMap::GetNextChar(ByteStringView pString, size_t* pOffset) const {
363 size_t& offset = *pOffset;
364 auto pBytes = pString.raw_span();
365 switch (m_CodingScheme) {
366 case OneByte: {
367 return offset < pBytes.size() ? pBytes[offset++] : 0;
368 }
369 case TwoBytes: {
370 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
371 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
372 return 256 * byte1 + byte2;
373 }
374 case MixedTwoBytes: {
375 uint8_t byte1 = offset < pBytes.size() ? pBytes[offset++] : 0;
376 if (!m_MixedTwoByteLeadingBytes[byte1])
377 return byte1;
378 uint8_t byte2 = offset < pBytes.size() ? pBytes[offset++] : 0;
379 return 256 * byte1 + byte2;
380 }
381 case MixedFourBytes: {
382 uint8_t codes[4];
383 int char_size = 1;
384 codes[0] = offset < pBytes.size() ? pBytes[offset++] : 0;
385 while (true) {
386 int ret = CheckFourByteCodeRange(codes, char_size,
387 m_MixedFourByteLeadingRanges);
388 if (ret == 0)
389 return 0;
390 if (ret == 2) {
391 uint32_t charcode = 0;
392 for (int i = 0; i < char_size; i++)
393 charcode = (charcode << 8) + codes[i];
394 return charcode;
395 }
396 if (char_size == 4 || offset == pBytes.size())
397 return 0;
398 codes[char_size++] = pBytes[offset++];
399 }
400 }
401 }
402 return 0;
403}
404
405int CPDF_CMap::GetCharSize(uint32_t charcode) const {
406 switch (m_CodingScheme) {
407 case OneByte:
408 return 1;
409 case TwoBytes:
410 return 2;
411 case MixedTwoBytes:
412 if (charcode < 0x100)
413 return 1;
414 return 2;
415 case MixedFourBytes:
416 if (charcode < 0x100)
417 return 1;
418 if (charcode < 0x10000)
419 return 2;
420 if (charcode < 0x1000000)
421 return 3;
422 return 4;
423 }
424 return 1;
425}
426
427size_t CPDF_CMap::CountChar(ByteStringView pString) const {
428 switch (m_CodingScheme) {
429 case OneByte:
430 return pString.GetLength();
431 case TwoBytes:
432 return (pString.GetLength() + 1) / 2;
433 case MixedTwoBytes: {
434 size_t count = 0;
435 for (size_t i = 0; i < pString.GetLength(); i++) {
436 count++;
437 if (m_MixedTwoByteLeadingBytes[pString[i]])
438 i++;
439 }
440 return count;
441 }
442 case MixedFourBytes: {
443 size_t count = 0;
444 size_t offset = 0;
445 while (offset < pString.GetLength()) {
446 GetNextChar(pString, &offset);
447 count++;
448 }
449 return count;
450 }
451 }
452 return pString.GetLength();
453}
454
455int CPDF_CMap::AppendChar(char* str, uint32_t charcode) const {
456 switch (m_CodingScheme) {
457 case OneByte:
458 str[0] = static_cast<char>(charcode);
459 return 1;
460 case TwoBytes:
461 str[0] = static_cast<char>(charcode / 256);
462 str[1] = static_cast<char>(charcode % 256);
463 return 2;
464 case MixedTwoBytes:
465 if (charcode < 0x100 && !m_MixedTwoByteLeadingBytes[charcode]) {
466 str[0] = static_cast<char>(charcode);
467 return 1;
468 }
469 str[0] = static_cast<char>(charcode >> 8);
470 str[1] = static_cast<char>(charcode);
471 return 2;
472 case MixedFourBytes:
473 if (charcode < 0x100) {
474 int iSize = static_cast<int>(
475 GetFourByteCharSizeImpl(charcode, m_MixedFourByteLeadingRanges));
476 if (iSize == 0)
477 iSize = 1;
478 str[iSize - 1] = static_cast<char>(charcode);
479 if (iSize > 1)
480 memset(str, 0, iSize - 1);
481 return iSize;
482 }
483 if (charcode < 0x10000) {
484 str[0] = static_cast<char>(charcode >> 8);
485 str[1] = static_cast<char>(charcode);
486 return 2;
487 }
488 if (charcode < 0x1000000) {
489 str[0] = static_cast<char>(charcode >> 16);
490 str[1] = static_cast<char>(charcode >> 8);
491 str[2] = static_cast<char>(charcode);
492 return 3;
493 }
494 str[0] = static_cast<char>(charcode >> 24);
495 str[1] = static_cast<char>(charcode >> 16);
496 str[2] = static_cast<char>(charcode >> 8);
497 str[3] = static_cast<char>(charcode);
498 return 4;
499 }
500 return 0;
501}
502
503void CPDF_CMap::SetAdditionalMappings(std::vector<CIDRange> mappings) {
504 DCHECK(m_AdditionalCharcodeToCIDMappings.empty());
505 if (m_CodingScheme != MixedFourBytes || mappings.empty())
506 return;
507
508 std::sort(
509 mappings.begin(), mappings.end(),
510 [](const CPDF_CMap::CIDRange& arg1, const CPDF_CMap::CIDRange& arg2) {
511 return arg1.m_EndCode < arg2.m_EndCode;
512 });
513 m_AdditionalCharcodeToCIDMappings = std::move(mappings);
514}
515
// Replaces the stored mixed-four-byte leading ranges with `ranges`. The
// argument is taken by value and moved into the member, so no extra copy is
// made here.
516void CPDF_CMap::SetMixedFourByteLeadingRanges(std::vector<CodeRange> ranges) {
517 m_MixedFourByteLeadingRanges = std::move(ranges);
518}
519
520void CPDF_CMap::SetDirectCharcodeToCIDTableRange(uint32_t start_code,
521 uint32_t end_code,
522 uint16_t start_cid) {
523 pdfium::span<uint16_t> span = m_DirectCharcodeToCIDTable.span();
524 for (uint32_t code = start_code; code <= end_code; ++code) {
525 span[code] = static_cast<uint16_t>(start_cid + code - start_code);
526 }
527}
CPDF_CMapParser(CPDF_CMap *pCMap)
void ParseWord(ByteStringView word)
~CPDF_CMap() override
size_t CountChar(ByteStringView pString) const
uint32_t GetNextChar(ByteStringView pString, size_t *pOffset) const
int AppendChar(char *str, uint32_t charcode) const
@ MixedTwoBytes
Definition cpdf_cmap.h:42
@ MixedFourBytes
Definition cpdf_cmap.h:43
uint16_t CIDFromCharCode(uint32_t charcode) const
void SetAdditionalMappings(std::vector< CIDRange > mappings)
int GetCharSize(uint32_t charcode) const
void SetMixedFourByteLeadingRanges(std::vector< CodeRange > ranges)
void SetDirectCharcodeToCIDTableRange(uint32_t start_code, uint32_t end_code, uint16_t start_cid)
CIDSet
@ CIDSET_KOREA1
@ CIDSET_JAPAN1
@ CIDSET_GB1
@ CIDSET_CNS1
CIDCoding
Definition cpdf_cmap.h:24