Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
cpdf_tounicodemap.cpp
Go to the documentation of this file.
1// Copyright 2017 The PDFium Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdfapi/font/cpdf_tounicodemap.h"
8
9#include <limits>
10#include <set>
11#include <utility>
12
13#include "core/fpdfapi/font/cpdf_cid2unicodemap.h"
14#include "core/fpdfapi/font/cpdf_fontglobals.h"
15#include "core/fpdfapi/parser/cpdf_simple_parser.h"
16#include "core/fpdfapi/parser/cpdf_stream.h"
17#include "core/fpdfapi/parser/fpdf_parser_utility.h"
18#include "core/fxcrt/fx_extension.h"
19#include "core/fxcrt/fx_safe_types.h"
20#include "third_party/base/containers/contains.h"
21
22namespace {
23
24WideString StringDataAdd(WideString str) {
25 WideString ret;
26 wchar_t value = 1;
27 for (size_t i = str.GetLength(); i > 0; --i) {
28 wchar_t ch = str[i - 1] + value;
29 if (ch < str[i - 1]) {
30 ret.InsertAtFront(0);
31 } else {
32 ret.InsertAtFront(ch);
33 value = 0;
34 }
35 }
36 if (value)
37 ret.InsertAtFront(value);
38 return ret;
39}
40
41} // namespace
42
// Constructs the map by handing `pStream` straight to Load(), which parses
// it and populates this object.
CPDF_ToUnicodeMap::CPDF_ToUnicodeMap(RetainPtr<const CPDF_Stream> pStream) {
  Load(std::move(pStream));
}
46
48
49WideString CPDF_ToUnicodeMap::Lookup(uint32_t charcode) const {
50 auto it = m_Multimap.find(charcode);
51 if (it == m_Multimap.end()) {
52 if (!m_pBaseMap)
53 return WideString();
54 return WideString(
55 m_pBaseMap->UnicodeFromCID(static_cast<uint16_t>(charcode)));
56 }
57
58 uint32_t value = *it->second.begin();
59 wchar_t unicode = static_cast<wchar_t>(value & 0xffff);
60 if (unicode != 0xffff)
61 return WideString(unicode);
62
63 size_t index = value >> 16;
64 return index < m_MultiCharVec.size() ? m_MultiCharVec[index] : WideString();
65}
66
67uint32_t CPDF_ToUnicodeMap::ReverseLookup(wchar_t unicode) const {
68 for (const auto& pair : m_Multimap) {
69 if (pdfium::Contains(pair.second, static_cast<uint32_t>(unicode)))
70 return pair.first;
71 }
72 return 0;
73}
74
76 uint32_t charcode) const {
77 auto it = m_Multimap.find(charcode);
78 return it != m_Multimap.end() ? it->second.size() : 0u;
79}
80
81// static
82absl::optional<uint32_t> CPDF_ToUnicodeMap::StringToCode(ByteStringView input) {
83 // Ignore whitespaces within `input`. See https://crbug.com/pdfium/2065.
84 std::set<char> seen_whitespace_chars;
85 for (char c : input) {
86 if (PDFCharIsWhitespace(c)) {
87 seen_whitespace_chars.insert(c);
88 }
89 }
90 ByteString str_without_whitespace_chars; // Must outlive `str`.
91 ByteStringView str;
92 if (seen_whitespace_chars.empty()) {
93 str = input;
94 } else {
95 str_without_whitespace_chars.Reserve(input.GetLength());
96 for (char c : input) {
97 if (!pdfium::Contains(seen_whitespace_chars, c)) {
98 str_without_whitespace_chars += c;
99 }
100 }
101 str = str_without_whitespace_chars.AsStringView();
102 }
103
104 size_t len = str.GetLength();
105 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
106 return absl::nullopt;
107
108 FX_SAFE_UINT32 code = 0;
109 for (char c : str.Substr(1, len - 2)) {
110 if (!FXSYS_IsHexDigit(c))
111 return absl::nullopt;
112
113 code = code * 16 + FXSYS_HexCharToInt(c);
114 if (!code.IsValid())
115 return absl::nullopt;
116 }
117 return absl::optional<uint32_t>(code.ValueOrDie());
118}
119
120// static
121WideString CPDF_ToUnicodeMap::StringToWideString(ByteStringView str) {
122 size_t len = str.GetLength();
123 if (len <= 2 || str[0] != '<' || str[len - 1] != '>')
124 return WideString();
125
126 WideString result;
127 int byte_pos = 0;
128 wchar_t ch = 0;
129 for (char c : str.Substr(1, len - 2)) {
130 if (!FXSYS_IsHexDigit(c))
131 break;
132
133 ch = ch * 16 + FXSYS_HexCharToInt(c);
134 byte_pos++;
135 if (byte_pos == 4) {
136 result += ch;
137 byte_pos = 0;
138 ch = 0;
139 }
140 }
141 return result;
142}
143
144void CPDF_ToUnicodeMap::Load(RetainPtr<const CPDF_Stream> pStream) {
145 CIDSet cid_set = CIDSET_UNKNOWN;
146 auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(std::move(pStream));
147 pAcc->LoadAllDataFiltered();
148 CPDF_SimpleParser parser(pAcc->GetSpan());
149 while (true) {
150 ByteStringView word = parser.GetWord();
151 if (word.IsEmpty())
152 break;
153
154 if (word == "beginbfchar")
155 HandleBeginBFChar(&parser);
156 else if (word == "beginbfrange")
157 HandleBeginBFRange(&parser);
158 else if (word == "/Adobe-Korea1-UCS2")
159 cid_set = CIDSET_KOREA1;
160 else if (word == "/Adobe-Japan1-UCS2")
161 cid_set = CIDSET_JAPAN1;
162 else if (word == "/Adobe-CNS1-UCS2")
163 cid_set = CIDSET_CNS1;
164 else if (word == "/Adobe-GB1-UCS2")
165 cid_set = CIDSET_GB1;
166 }
167 if (cid_set != CIDSET_UNKNOWN) {
168 m_pBaseMap = CPDF_FontGlobals::GetInstance()->GetCID2UnicodeMap(cid_set);
169 }
170}
171
172void CPDF_ToUnicodeMap::HandleBeginBFChar(CPDF_SimpleParser* pParser) {
173 while (true) {
174 ByteStringView word = pParser->GetWord();
175 if (word.IsEmpty() || word == "endbfchar")
176 return;
177
178 absl::optional<uint32_t> code = StringToCode(word);
179 if (!code.has_value())
180 return;
181
182 SetCode(code.value(), StringToWideString(pParser->GetWord()));
183 }
184}
185
// Consumes the body of a "beginbfrange" section from `pParser`. Each entry
// is a low code token and a high code token, followed by either a "["
// introducing one destination string per code, or a single destination
// string from which the destinations for the whole range are derived.
// Parsing ends at "endbfrange", at an empty word, or as soon as a code token
// fails to parse.
void CPDF_ToUnicodeMap::HandleBeginBFRange(CPDF_SimpleParser* pParser) {
  while (true) {
    ByteStringView lowcode_str = pParser->GetWord();
    if (lowcode_str.IsEmpty() || lowcode_str == "endbfrange")
      return;

    absl::optional<uint32_t> lowcode_opt = StringToCode(lowcode_str);
    if (!lowcode_opt.has_value())
      return;

    ByteStringView highcode_str = pParser->GetWord();
    absl::optional<uint32_t> highcode_opt = StringToCode(highcode_str);
    if (!highcode_opt.has_value())
      return;

    uint32_t lowcode = lowcode_opt.value();
    // Only the low byte of the parsed high code is used; the upper 24 bits
    // are copied from `lowcode`, so one entry covers at most 256 codes.
    uint32_t highcode = (lowcode & 0xffffff00) | (highcode_opt.value() & 0xff);

    ByteStringView start = pParser->GetWord();
    if (start == "[") {
      // Array form: read one destination word per code in the range.
      for (uint32_t code = lowcode; code <= highcode; ++code) {
        SetCode(code, StringToWideString(pParser->GetWord()));
        // Stop before ++code can wrap to 0 and make the loop endless.
        if (code == std::numeric_limits<uint32_t>::max()) {
          break;
        }
      }
      // Consume one more word after the list, presumably the closing "]".
      pParser->GetWord();
      continue;
    }

    WideString destcode = StringToWideString(start);
    if (destcode.GetLength() == 1) {
      // Single-character destination: re-parse `start` as a number and hand
      // out consecutive values, one per code in the range.
      absl::optional<uint32_t> value_or_error = StringToCode(start);
      if (!value_or_error.has_value())
        return;

      uint32_t value = value_or_error.value();
      for (uint32_t code = lowcode; code <= highcode; ++code) {
        InsertIntoMultimap(code, value++);
        // Same wrap-around guard as above.
        if (code == std::numeric_limits<uint32_t>::max()) {
          break;
        }
      }
    } else {
      // Any other length (including an empty string): the first code gets
      // `destcode` itself and every later code gets the previous string
      // advanced by StringDataAdd(). Each string is appended to
      // m_MultiCharVec, and the map stores an indicator computed just before
      // the append.
      for (uint32_t code = lowcode; code <= highcode; ++code) {
        WideString retcode =
            code == lowcode ? destcode : StringDataAdd(destcode);
        InsertIntoMultimap(code, GetMultiCharIndexIndicator());
        m_MultiCharVec.push_back(retcode);
        destcode = std::move(retcode);
        // Same wrap-around guard as above.
        if (code == std::numeric_limits<uint32_t>::max()) {
          break;
        }
      }
    }
  }
}
243
244uint32_t CPDF_ToUnicodeMap::GetMultiCharIndexIndicator() const {
245 FX_SAFE_UINT32 uni = m_MultiCharVec.size();
246 uni = uni * 0x10000 + 0xffff;
247 return uni.ValueOrDefault(0);
248}
249
250void CPDF_ToUnicodeMap::SetCode(uint32_t srccode, WideString destcode) {
251 size_t len = destcode.GetLength();
252 if (len == 0)
253 return;
254
255 if (len == 1) {
256 InsertIntoMultimap(srccode, destcode[0]);
257 } else {
258 InsertIntoMultimap(srccode, GetMultiCharIndexIndicator());
259 m_MultiCharVec.push_back(destcode);
260 }
261}
262
263void CPDF_ToUnicodeMap::InsertIntoMultimap(uint32_t code, uint32_t destcode) {
264 auto it = m_Multimap.find(code);
265 if (it == m_Multimap.end()) {
266 m_Multimap.emplace(code, std::set<uint32_t>{destcode});
267 return;
268 }
269
270 it->second.emplace(destcode);
271}
ByteStringView GetWord()
size_t GetUnicodeCountByCharcodeForTesting(uint32_t charcode) const
uint32_t ReverseLookup(wchar_t unicode) const
CPDF_ToUnicodeMap(RetainPtr< const CPDF_Stream > pStream)
WideString Lookup(uint32_t charcode) const
WideString(wchar_t ch)
CharType operator[](const size_t index) const
Definition widestring.h:146
CIDSet
@ CIDSET_KOREA1
@ CIDSET_JAPAN1
@ CIDSET_UNKNOWN
@ CIDSET_GB1
@ CIDSET_CNS1