Qt
Internal/Contributor docs for the Qt SDK. Note: These are NOT official API docs; those are found at https://doc.qt.io/
Loading...
Searching...
No Matches
cpdf_linkextract.cpp
Go to the documentation of this file.
1// Copyright 2016 The PDFium Authors
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7#include "core/fpdftext/cpdf_linkextract.h"
8
9#include <vector>
10
11#include "core/fpdftext/cpdf_textpage.h"
12#include "core/fxcrt/fx_extension.h"
13#include "core/fxcrt/fx_string.h"
14#include "core/fxcrt/fx_system.h"
15
16namespace {
17
18// Find the end of a web link starting from offset |start| and ending at offset
19// |end|. The purpose of this function is to separate url from the surrounding
20// context characters, we do not intend to fully validate the url. |str|
21// contains lower case characters only.
22size_t FindWebLinkEnding(const WideString& str, size_t start, size_t end) {
23 if (str.Contains(L'/', start)) {
24 // When there is a path and query after '/', most ASCII chars are allowed.
25 // We don't sanitize in this case.
26 return end;
27 }
28
29 // When there is no path, it only has IP address or host name.
30 // Port is optional at the end.
31 if (str[start] == L'[') {
32 // IPv6 reference.
33 // Find the end of the reference.
34 auto result = str.Find(L']', start + 1);
35 if (result.has_value()) {
36 end = result.value();
37 if (end > start + 1) { // Has content inside brackets.
38 size_t len = str.GetLength();
39 size_t off = end + 1;
40 if (off < len && str[off] == L':') {
41 off++;
42 while (off < len && FXSYS_IsDecimalDigit(str[off]))
43 off++;
44 if (off > end + 2 &&
45 off <= len) // At least one digit in port number.
46 end = off - 1; // |off| is offset of the first invalid char.
47 }
48 }
49 }
50 return end;
51 }
52
53 // According to RFC1123, host name only has alphanumeric chars, hyphens,
54 // and periods. Hyphen should not at the end though.
55 // Non-ASCII chars are ignored during checking.
56 while (end > start && str[end] < 0x80) {
57 if (FXSYS_IsDecimalDigit(str[end]) ||
58 (str[end] >= L'a' && str[end] <= L'z') || str[end] == L'.') {
59 break;
60 }
61 end--;
62 }
63 return end;
64}
65
66// Remove characters from the end of |str|, delimited by |start| and |end|, up
67// to and including |charToFind|. No-op if |charToFind| is not present. Updates
68// |end| if characters were removed.
69void TrimBackwardsToChar(const WideString& str,
70 wchar_t charToFind,
71 size_t start,
72 size_t* end) {
73 for (size_t pos = *end; pos >= start; pos--) {
74 if (str[pos] == charToFind) {
75 *end = pos - 1;
76 break;
77 }
78 }
79}
80
81// Finds opening brackets ()[]{}<> and quotes "' before the URL delimited by
82// |start| and |end| in |str|. Matches a closing bracket or quote for each
83// opening character and, if present, removes everything afterwards. Returns the
84// new end position for the string.
85size_t TrimExternalBracketsFromWebLink(const WideString& str,
86 size_t start,
87 size_t end) {
88 for (size_t pos = 0; pos < start; pos++) {
89 if (str[pos] == '(') {
90 TrimBackwardsToChar(str, ')', start, &end);
91 } else if (str[pos] == '[') {
92 TrimBackwardsToChar(str, ']', start, &end);
93 } else if (str[pos] == '{') {
94 TrimBackwardsToChar(str, '}', start, &end);
95 } else if (str[pos] == '<') {
96 TrimBackwardsToChar(str, '>', start, &end);
97 } else if (str[pos] == '"') {
98 TrimBackwardsToChar(str, '"', start, &end);
99 } else if (str[pos] == '\'') {
100 TrimBackwardsToChar(str, '\'', start, &end);
101 }
102 }
103 return end;
104}
105
106} // namespace
107
110
112
114 m_LinkArray.clear();
115 size_t start = 0;
116 size_t pos = 0;
117 bool bAfterHyphen = false;
118 bool bLineBreak = false;
119 const size_t nTotalChar = m_pTextPage->CountChars();
120 const WideString page_text = m_pTextPage->GetAllPageText();
121 while (pos < nTotalChar) {
122 const CPDF_TextPage::CharInfo& char_info = m_pTextPage->GetCharInfo(pos);
124 char_info.m_Unicode != L' ' && pos != nTotalChar - 1) {
125 bAfterHyphen =
128 char_info.m_Unicode == L'-'));
129 ++pos;
130 continue;
131 }
132
133 size_t nCount = pos - start;
134 if (pos == nTotalChar - 1) {
135 ++nCount;
136 } else if (bAfterHyphen &&
137 (char_info.m_Unicode == L'\n' || char_info.m_Unicode == L'\r')) {
138 // Handle text breaks with a hyphen to the next line.
139 bLineBreak = true;
140 ++pos;
141 continue;
142 }
143
144 WideString strBeCheck = page_text.Substr(start, nCount);
145 if (bLineBreak) {
146 strBeCheck.Remove(L'\n');
147 strBeCheck.Remove(L'\r');
148 bLineBreak = false;
149 }
150 // Replace the generated code with the hyphen char.
151 strBeCheck.Replace(L"\xfffe", L"-");
152
153 if (strBeCheck.GetLength() > 5) {
154 while (strBeCheck.GetLength() > 0) {
155 wchar_t ch = strBeCheck.Back();
156 if (ch != L')' && ch != L',' && ch != L'>' && ch != L'.')
157 break;
158
159 strBeCheck = strBeCheck.First(strBeCheck.GetLength() - 1);
160 nCount--;
161 }
162
163 // Check for potential web URLs and email addresses.
164 // Ftp address, file system links, data, blob etc. are not checked.
165 if (nCount > 5) {
166 auto maybe_link = CheckWebLink(strBeCheck);
167 if (maybe_link.has_value()) {
168 maybe_link.value().m_Start += start;
169 m_LinkArray.push_back(maybe_link.value());
170 } else if (CheckMailLink(&strBeCheck)) {
171 m_LinkArray.push_back(Link{{start, nCount}, strBeCheck});
172 }
173 }
174 }
175 start = ++pos;
176 }
177}
178
180 const WideString& strBeCheck) {
181 static const wchar_t kHttpScheme[] = L"http";
182 static const wchar_t kWWWAddrStart[] = L"www.";
183
184 const size_t kHttpSchemeLen = FXSYS_len(kHttpScheme);
185 const size_t kWWWAddrStartLen = FXSYS_len(kWWWAddrStart);
186
187 WideString str = strBeCheck;
188 str.MakeLower();
189
190 // First, try to find the scheme.
191 auto start = str.Find(kHttpScheme);
192 if (start.has_value()) {
193 size_t off = start.value() + kHttpSchemeLen; // move after "http".
194 if (str.GetLength() > off + 4) { // At least "://<char>" follows.
195 if (str[off] == L's') // "https" scheme is accepted.
196 off++;
197 if (str[off] == L':' && str[off + 1] == L'/' && str[off + 2] == L'/') {
198 off += 3;
199 const size_t end =
200 FindWebLinkEnding(str, off,
201 TrimExternalBracketsFromWebLink(
202 str, start.value(), str.GetLength() - 1));
203 if (end > off) { // Non-empty host name.
204 const size_t nStart = start.value();
205 const size_t nCount = end - nStart + 1;
206 return Link{{nStart, nCount}, strBeCheck.Substr(nStart, nCount)};
207 }
208 }
209 }
210 }
211
212 // When there is no scheme, try to find url starting with "www.".
213 start = str.Find(kWWWAddrStart);
214 if (start.has_value()) {
215 size_t off = start.value() + kWWWAddrStartLen;
216 if (str.GetLength() > off) {
217 const size_t end =
218 FindWebLinkEnding(str, start.value(),
219 TrimExternalBracketsFromWebLink(
220 str, start.value(), str.GetLength() - 1));
221 if (end > off) {
222 const size_t nStart = start.value();
223 const size_t nCount = end - nStart + 1;
224 return Link{{nStart, nCount},
225 L"http://" + strBeCheck.Substr(nStart, nCount)};
226 }
227 }
228 }
229
230 return absl::nullopt;
231}
232
233bool CPDF_LinkExtract::CheckMailLink(WideString* str) {
234 auto aPos = str->Find(L'@');
235 // Invalid when no '@' or when starts/ends with '@'.
236 if (!aPos.has_value() || aPos.value() == 0 || aPos == str->GetLength() - 1)
237 return false;
238
239 // Check the local part.
240 size_t pPos = aPos.value(); // Used to track the position of '@' or '.'.
241 for (size_t i = aPos.value(); i > 0; i--) {
242 wchar_t ch = (*str)[i - 1];
243 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
244 continue;
245
246 if (ch != L'.' || i == pPos || i == 1) {
247 if (i == aPos.value()) {
248 // There is '.' or invalid char before '@'.
249 return false;
250 }
251 // End extracting for other invalid chars, '.' at the beginning, or
252 // consecutive '.'.
253 size_t removed_len = i == pPos ? i + 1 : i;
254 *str = str->Last(str->GetLength() - removed_len);
255 break;
256 }
257 // Found a valid '.'.
258 pPos = i - 1;
259 }
260
261 // Check the domain name part.
262 aPos = str->Find(L'@');
263 if (!aPos.has_value() || aPos.value() == 0)
264 return false;
265
266 str->TrimRight(L'.');
267 // At least one '.' in domain name, but not at the beginning.
268 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
269 // Check whether we should remove this check.
270 auto ePos = str->Find(L'.', aPos.value() + 1);
271 if (!ePos.has_value() || ePos.value() == aPos.value() + 1)
272 return false;
273
274 // Validate all other chars in domain name.
275 size_t nLen = str->GetLength();
276 pPos = 0; // Used to track the position of '.'.
277 for (size_t i = aPos.value() + 1; i < nLen; i++) {
278 wchar_t wch = (*str)[i];
279 if (wch == L'-' || FXSYS_iswalnum(wch))
280 continue;
281
282 if (wch != L'.' || i == pPos + 1) {
283 // Domain name should end before invalid char.
284 size_t host_end = i == pPos + 1 ? i - 2 : i - 1;
285 if (pPos > 0 && host_end - aPos.value() >= 3) {
286 // Trim the ending invalid chars if there is at least one '.' and name.
287 *str = str->First(host_end + 1);
288 break;
289 }
290 return false;
291 }
292 pPos = i;
293 }
294
295 if (!str->Contains(L"mailto:"))
296 *str = L"mailto:" + *str;
297
298 return true;
299}
300
301WideString CPDF_LinkExtract::GetURL(size_t index) const {
302 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl
303 : WideString();
304}
305
307 if (index >= m_LinkArray.size())
308 return std::vector<CFX_FloatRect>();
309
310 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
311 m_LinkArray[index].m_Count);
312}
313
315 size_t index) const {
316 if (index >= m_LinkArray.size())
317 return absl::nullopt;
318 return m_LinkArray[index];
319}
std::vector< CFX_FloatRect > GetRects(size_t index) const
WideString GetURL(size_t index) const
absl::optional< Range > GetTextRange(size_t index) const
CPDF_LinkExtract(const CPDF_TextPage *pTextPage)
absl::optional< Link > CheckWebLink(const WideString &str)
bool CheckMailLink(WideString *str)
WideString & operator=(WideString &&that) noexcept
void TrimRight(wchar_t target)
CharType Back() const
Definition widestring.h:152
bool FXSYS_iswalnum(wchar_t c)
WideString operator+(const wchar_t *str1, const WideString &str2)
Definition widestring.h:281