qregexp.cpp

Absolute File Name:/home/qt/qt5_coco/qt5/qtbase/src/corelib/tools/qregexp.cpp
Source codeSwitch to Preprocessed file
LineSourceCount
1/****************************************************************************-
2**-
3** Copyright (C) 2016 The Qt Company Ltd.-
4** Contact: https://www.qt.io/licensing/-
5**-
6** This file is part of the QtCore module of the Qt Toolkit.-
7**-
8** $QT_BEGIN_LICENSE:LGPL$-
9** Commercial License Usage-
10** Licensees holding valid commercial Qt licenses may use this file in-
11** accordance with the commercial license agreement provided with the-
12** Software or, alternatively, in accordance with the terms contained in-
13** a written agreement between you and The Qt Company. For licensing terms-
14** and conditions see https://www.qt.io/terms-conditions. For further-
15** information use the contact form at https://www.qt.io/contact-us.-
16**-
17** GNU Lesser General Public License Usage-
18** Alternatively, this file may be used under the terms of the GNU Lesser-
19** General Public License version 3 as published by the Free Software-
20** Foundation and appearing in the file LICENSE.LGPL3 included in the-
21** packaging of this file. Please review the following information to-
22** ensure the GNU Lesser General Public License version 3 requirements-
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.-
24**-
25** GNU General Public License Usage-
26** Alternatively, this file may be used under the terms of the GNU-
27** General Public License version 2.0 or (at your option) the GNU General-
28** Public license version 3 or any later version approved by the KDE Free-
29** Qt Foundation. The licenses are as published by the Free Software-
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3-
31** included in the packaging of this file. Please review the following-
32** information to ensure the GNU General Public License requirements will-
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and-
34** https://www.gnu.org/licenses/gpl-3.0.html.-
35**-
36** $QT_END_LICENSE$-
37**-
38****************************************************************************/-
39-
40#include "qregexp.h"-
41-
42#include "qalgorithms.h"-
43#include "qbitarray.h"-
44#include "qcache.h"-
45#include "qdatastream.h"-
46#include "qdebug.h"-
47#include "qhashfunctions.h"-
48#include "qlist.h"-
49#include "qmap.h"-
50#include "qmutex.h"-
51#include "qstring.h"-
52#include "qstringlist.h"-
53#include "qstringmatcher.h"-
54#include "qvector.h"-
55-
56#include <limits.h>-
57#include <algorithm>-
58-
59QT_BEGIN_NAMESPACE-
60-
61int qFindString(const QChar *haystack, int haystackLen, int from,-
62 const QChar *needle, int needleLen, Qt::CaseSensitivity cs);-
63-
64// error strings for the regexp parser-
65#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")-
66#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")-
67#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")-
68#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")-
69#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")-
70#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")-
71#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")-
72#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")-
73#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")-
74#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")-
75#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")-
76#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")-
77-
78/*!-
79 \class QRegExp-
80 \inmodule QtCore-
81 \reentrant-
82 \brief The QRegExp class provides pattern matching using regular expressions.-
83-
84 \ingroup tools-
85 \ingroup shared-
86-
87 \keyword regular expression-
88-
89 A regular expression, or "regexp", is a pattern for matching-
90 substrings in a text. This is useful in many contexts, e.g.,-
91-
92 \table-
93 \row \li Validation-
94 \li A regexp can test whether a substring meets some criteria,-
95 e.g. is an integer or contains no whitespace.-
96 \row \li Searching-
97 \li A regexp provides more powerful pattern matching than-
98 simple substring matching, e.g., match one of the words-
99 \e{mail}, \e{letter} or \e{correspondence}, but none of the-
100 words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.-
101 \row \li Search and Replace-
102 \li A regexp can replace all occurrences of a substring with a-
103 different substring, e.g., replace all occurrences of \e{&}-
104 with \e{\&amp;} except where the \e{&} is already followed by-
105 an \e{amp;}.-
106 \row \li String Splitting-
107 \li A regexp can be used to identify where a string should be-
108 split apart, e.g. splitting tab-delimited strings.-
109 \endtable-
110-
111 This documentation presents a brief introduction to regexps, a-
112 description of Qt's regexp language, some examples, and the function-
113 documentation itself. QRegExp is modeled on Perl's regexp-
114 language. It fully supports Unicode. QRegExp can also be used in a-
115 simpler, \e{wildcard mode} that is similar to the functionality-
116 found in command shells. The syntax rules used by QRegExp can be-
117 changed with setPatternSyntax(). In particular, the pattern syntax-
118 can be set to QRegExp::FixedString, which means the pattern to be-
119 matched is interpreted as a plain string, i.e., special characters-
120 (e.g., backslash) are not escaped.-
121-
122 A good text on regexps is \e {Mastering Regular Expressions}-
123 (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.-
124-
125 \note In Qt 5, the new QRegularExpression class provides a Perl-
126 compatible implementation of regular expressions and is recommended-
127 in place of QRegExp.-
128-
129 \tableofcontents-
130-
131 \section1 Introduction-
132-
133 Regexps are built up from expressions, quantifiers, and-
134 assertions. The simplest expression is a character, e.g. \b{x}-
135 or \b{5}. An expression can also be a set of characters-
136 enclosed in square brackets. \b{[ABCD]} will match an \b{A}-
137 or a \b{B} or a \b{C} or a \b{D}. We can write this same-
138 expression as \b{[A-D]}, and an expression to match any-
139 capital letter in the English alphabet is written as-
140 \b{[A-Z]}.-
141-
142 A quantifier specifies the number of occurrences of an expression-
143 that must be matched. \b{x{1,1}} means match one and only one-
144 \b{x}. \b{x{1,5}} means match a sequence of \b{x}-
145 characters that contains at least one \b{x} but no more than-
146 five.-
147-
148 Note that in general regexps cannot be used to check for balanced-
149 brackets or tags. For example, a regexp can be written to match an-
150 opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags-
151 are not nested, but if the \c{<b>} tags are nested, that same-
152 regexp will match an opening \c{<b>} tag with the wrong closing-
153 \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the-
154 first \c{<b>} would be matched with the first \c{</b>}, which is-
155 not correct. However, it is possible to write a regexp that will-
156 match nested brackets or tags correctly, but only if the number of-
157 nesting levels is fixed and known. If the number of nesting levels-
158 is not fixed and known, it is impossible to write a regexp that-
159 will not fail.-
160-
161 Suppose we want a regexp to match integers in the range 0 to 99.-
162 At least one digit is required, so we start with the expression-
163 \b{[0-9]{1,1}}, which matches a single digit exactly once. This-
164 regexp matches integers in the range 0 to 9. To match integers up-
165 to 99, increase the maximum number of occurrences to 2, so the-
166 regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the-
167 original requirement to match integers from 0 to 99, but it will-
168 also match integers that occur in the middle of strings. If we-
169 want the matched integer to be the whole string, we must use the-
170 anchor assertions, \b{^} (caret) and \b{$} (dollar). When-
171 \b{^} is the first character in a regexp, it means the regexp-
172 must match from the beginning of the string. When \b{$} is the-
173 last character of the regexp, it means the regexp must match to-
174 the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.-
175 Note that assertions, e.g. \b{^} and \b{$}, do not match-
176 characters but locations in the string.-
177-
178 If you have seen regexps described elsewhere, they may have looked-
179 different from the ones shown here. This is because some sets of-
180 characters and some quantifiers are so common that they have been-
181 given special symbols to represent them. \b{[0-9]} can be-
182 replaced with the symbol \b{\\d}. The quantifier to match-
183 exactly one occurrence, \b{{1,1}}, can be replaced with the-
184 expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So-
185 our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can-
186 also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of-
187 the string, match a digit, followed immediately by 0 or 1 digits}.-
188 In practice, it would be written as \b{^\\d\\d?$}. The \b{?}-
189 is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1-
190 occurrences. \b{?} makes an expression optional. The regexp-
191 \b{^\\d\\d?$} means \e{From the beginning of the string, match-
192 one digit, followed immediately by 0 or 1 more digit, followed-
193 immediately by end of string}.-
194-
195 To write a regexp that matches one of the words 'mail' \e or-
196 'letter' \e or 'correspondence' but does not match words that-
197 contain these words, e.g., 'email', 'mailman', 'mailer', and-
198 'letterbox', start with a regexp that matches 'mail'. Expressed-
199 fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because-
200 a character expression is automatically quantified by-
201 \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an-
202 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now-
203 we can use the vertical bar \b{|}, which means \b{or}, to-
204 include the other two words, so our regexp for matching any of the-
205 three words becomes \b{mail|letter|correspondence}. Match-
206 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this-
207 regexp will match one of the three words we want to match, it will-
208 also match words we don't want to match, e.g., 'email'. To-
209 prevent the regexp from matching unwanted words, we must tell it-
210 to begin and end the match at word boundaries. First we enclose-
211 our regexp in parentheses, \b{(mail|letter|correspondence)}.-
212 Parentheses group expressions together, and they identify a part-
213 of the regexp that we wish to \l{capturing text}{capture}.-
214 Enclosing the expression in parentheses allows us to use it as a-
215 component in more complex regexps. It also allows us to examine-
216 which of the three words was actually matched. To force the match-
217 to begin and end on word boundaries, we enclose the regexp in-
218 \b{\\b} \e{word boundary} assertions:-
219 \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:-
220 \e{Match a word boundary, followed by the regexp in parentheses,-
221 followed by a word boundary}. The \b{\\b} assertion matches a-
222 \e position in the regexp, not a \e character. A word boundary is-
223 any non-word character, e.g., a space, newline, or the beginning-
224 or ending of a string.-
225-
226 If we want to replace ampersand characters with the HTML entity-
227 \b{\&amp;}, the regexp to match is simply \b{\&}. But this-
228 regexp will also match ampersands that have already been converted-
229 to HTML entities. We want to replace only ampersands that are not-
230 already followed by \b{amp;}. For this, we need the negative-
231 lookahead assertion, \b{(?!}__\b{)}. The regexp can then be-
232 written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}-
233 \b{not} \e{followed by} \b{amp;}.-
234-
235 If we want to count all the occurrences of 'Eric' and 'Eirik' in a-
236 string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and-
237 \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is-
238 required to avoid matching words that contain either name,-
239 e.g. 'Ericsson'. Note that the second regexp matches more-
240 spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.-
241-
242 Some of the examples discussed above are implemented in the-
243 \l{#code-examples}{code examples} section.-
244-
245 \target characters-and-abbreviations-for-sets-of-characters-
246 \section1 Characters and Abbreviations for Sets of Characters-
247-
248 \table-
249 \header \li Element \li Meaning-
250 \row \li \b{c}-
251 \li A character represents itself unless it has a special-
252 regexp meaning. e.g. \b{c} matches the character \e c.-
253 \row \li \b{\\c}-
254 \li A character that follows a backslash matches the character-
255 itself, except as specified below. e.g., To match a literal-
256 caret at the beginning of a string, write \b{\\^}.-
257 \row \li \b{\\a}-
258 \li Matches the ASCII bell (BEL, 0x07).-
259 \row \li \b{\\f}-
260 \li Matches the ASCII form feed (FF, 0x0C).-
261 \row \li \b{\\n}-
262 \li Matches the ASCII line feed (LF, 0x0A, Unix newline).-
263 \row \li \b{\\r}-
264 \li Matches the ASCII carriage return (CR, 0x0D).-
265 \row \li \b{\\t}-
266 \li Matches the ASCII horizontal tab (HT, 0x09).-
267 \row \li \b{\\v}-
268 \li Matches the ASCII vertical tab (VT, 0x0B).-
269 \row \li \b{\\x\e{hhhh}}-
270 \li Matches the Unicode character corresponding to the-
271 hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).-
272 \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})-
273 \li Matches the ASCII/Latin1 character for the octal number-
274 \e{ooo} (between 0 and 0377).-
275 \row \li \b{. (dot)}-
276 \li Matches any character (including newline).-
277 \row \li \b{\\d}-
278 \li Matches a digit (QChar::isDigit()).-
279 \row \li \b{\\D}-
280 \li Matches a non-digit.-
281 \row \li \b{\\s}-
282 \li Matches a whitespace character (QChar::isSpace()).-
283 \row \li \b{\\S}-
284 \li Matches a non-whitespace character.-
285 \row \li \b{\\w}-
286 \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').-
287 \row \li \b{\\W}-
288 \li Matches a non-word character.-
289 \row \li \b{\\\e{n}}-
290 \li The \e{n}-th backreference, e.g. \\1, \\2, etc.-
291 \endtable-
292-
293 \b{Note:} The C++ compiler transforms backslashes in strings.-
294 To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.-
295 To match the backslash character itself, enter it four times, i.e.-
296 \c{\\\\}.-
297-
298 \target sets-of-characters-
299 \section1 Sets of Characters-
300-
301 Square brackets mean match any character contained in the square-
302 brackets. The character set abbreviations described above can-
303 appear in a character set in square brackets. Except for the-
304 character set abbreviations and the following two exceptions,-
305 characters do not have special meanings in square brackets.-
306-
307 \table-
308 \row \li \b{^}-
309-
310 \li The caret negates the character set if it occurs as the-
311 first character (i.e. immediately after the opening square-
312 bracket). \b{[abc]} matches 'a' or 'b' or 'c', but-
313 \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.-
314-
315 \row \li \b{-}-
316-
317 \li The dash indicates a range of characters. \b{[W-Z]}-
318 matches 'W' or 'X' or 'Y' or 'Z'.-
319-
320 \endtable-
321-
322 Using the predefined character set abbreviations is more portable-
323 than using character ranges across platforms and languages. For-
324 example, \b{[0-9]} matches a digit in Western alphabets but-
325 \b{\\d} matches a digit in \e any alphabet.-
326-
327 Note: In other regexp documentation, sets of characters are often-
328 called "character classes".-
329-
330 \target quantifiers-
331 \section1 Quantifiers-
332-
333 By default, an expression is automatically quantified by-
334 \b{{1,1}}, i.e. it should occur exactly once. In the following-
335 list, \b{\e {E}} stands for expression. An expression is a-
336 character, or an abbreviation for a set of characters, or a set of-
337 characters in square brackets, or an expression in parentheses.-
338-
339 \table-
340 \row \li \b{\e {E}?}-
341-
342 \li Matches zero or one occurrence of \e E. This quantifier-
343 means \e{The previous expression is optional}, because it-
344 will match whether or not the expression is found. \b{\e-
345 {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}-
346 matches 'dent' or 'dents'.-
347-
348 \row \li \b{\e {E}+}-
349-
350 \li Matches one or more occurrences of \e E. \b{\e {E}+} is-
351 the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',-
352 '00', '000', etc.-
353-
354 \row \li \b{\e {E}*}-
355-
356 \li Matches zero or more occurrences of \e E. It is the same-
357 as \b{\e {E}{0,}}. The \b{*} quantifier is often used-
358 in error where \b{+} should be used. For example, if-
359 \b{\\s*$} is used in an expression to match strings that-
360 end in whitespace, it will match every string because-
361 \b{\\s*$} means \e{Match zero or more whitespaces followed-
362 by end of string}. The correct regexp to match strings that-
363 have at least one trailing whitespace character is-
364 \b{\\s+$}.-
365-
366 \row \li \b{\e {E}{n}}-
367-
368 \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}-
369 is the same as repeating \e E \e n times. For example,-
370 \b{x{5}} is the same as \b{xxxxx}. It is also the same-
371 as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.-
372-
373 \row \li \b{\e {E}{n,}}-
374 \li Matches at least \e n occurrences of \e E.-
375-
376 \row \li \b{\e {E}{,m}}-
377 \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}-
378 is the same as \b{\e {E}{0,m}}.-
379-
380 \row \li \b{\e {E}{n,m}}-
381 \li Matches at least \e n and at most \e m occurrences of \e E.-
382 \endtable-
383-
384 To apply a quantifier to more than just the preceding character,-
385 use parentheses to group characters together in an expression. For-
386 example, \b{tag+} matches a 't' followed by an 'a' followed by-
387 at least one 'g', whereas \b{(tag)+} matches at least one-
388 occurrence of 'tag'.-
389-
390 Note: Quantifiers are normally "greedy". They always match as much-
391 text as they can. For example, \b{0+} matches the first zero it-
392 finds and all the consecutive zeros after the first zero. Applied-
393 to '20005', it matches '2\underline{000}5'. Quantifiers can be made-
394 non-greedy, see setMinimal().-
395-
396 \target capturing parentheses-
397 \target backreferences-
398 \section1 Capturing Text-
399-
400 Parentheses allow us to group elements together so that we can-
401 quantify and capture them. For example if we have the expression-
402 \b{mail|letter|correspondence} that matches a string we know-
403 that \e one of the words matched but not which one. Using-
404 parentheses allows us to "capture" whatever is matched within-
405 their bounds, so if we used \b{(mail|letter|correspondence)}-
406 and matched this regexp against the string "I sent you some email"-
407 we can use the cap() or capturedTexts() functions to extract the-
408 matched characters, in this case 'mail'.-
409-
410 We can use captured text within the regexp itself. To refer to the-
411 captured text we use \e backreferences which are indexed from 1,-
412 the same as for cap(). For example we could search for duplicate-
413 words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a-
414 word boundary followed by one or more word characters followed by-
415 one or more non-word characters followed by the same text as the-
416 first parenthesized expression followed by a word boundary.-
417-
418 If we want to use parentheses purely for grouping and not for-
419 capturing we can use the non-capturing syntax, e.g.-
420 \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and-
421 end ')'. In this example we match either 'green' or 'blue' but we-
422 do not capture the match so we only know whether or not we matched-
423 but not which color we actually found. Using non-capturing-
424 parentheses is more efficient than using capturing parentheses-
425 since the regexp engine has to do less book-keeping.-
426-
427 Both capturing and non-capturing parentheses may be nested.-
428-
429 \target greedy quantifiers-
430-
431 For historical reasons, quantifiers (e.g. \b{*}) that apply to-
432 capturing parentheses are more "greedy" than other quantifiers.-
433 For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".-
434 This behavior is different from what other regexp engines do-
435 (notably, Perl). To obtain a more intuitive capturing behavior,-
436 specify QRegExp::RegExp2 to the QRegExp constructor or call-
437 setPatternSyntax(QRegExp::RegExp2).-
438-
439 \target cap_in_a_loop-
440-
441 When the number of matches cannot be determined in advance, a-
442 common idiom is to use cap() in a loop. For example:-
443-
444 \snippet code/src_corelib_tools_qregexp.cpp 0-
445-
446 \target assertions-
447 \section1 Assertions-
448-
449 Assertions make some statement about the text at the point where-
450 they occur in the regexp but they do not match any characters. In-
451 the following list \b{\e {E}} stands for any expression.-
452-
453 \table-
454 \row \li \b{^}-
455 \li The caret signifies the beginning of the string. If you-
456 wish to match a literal \c{^} you must escape it by-
457 writing \c{\\^}. For example, \b{^#include} will only-
458 match strings which \e begin with the characters '#include'.-
459 (When the caret is the first character of a character set it-
460 has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)-
461-
462 \row \li \b{$}-
463 \li The dollar signifies the end of the string. For example-
464 \b{\\d\\s*$} will match strings which end with a digit-
465 optionally followed by whitespace. If you wish to match a-
466 literal \c{$} you must escape it by writing-
467 \c{\\$}.-
468-
469 \row \li \b{\\b}-
470 \li A word boundary. For example the regexp-
471 \b{\\bOK\\b} means match immediately after a word-
472 boundary (e.g. start of string or whitespace) the letter 'O'-
473 then the letter 'K' immediately before another word boundary-
474 (e.g. end of string or whitespace). But note that the-
475 assertion does not actually match any whitespace so if we-
476 write \b{(\\bOK\\b)} and we have a match it will only-
477 contain 'OK' even if the string is "It's \underline{OK} now".-
478-
479 \row \li \b{\\B}-
480 \li A non-word boundary. This assertion is true wherever-
481 \b{\\b} is false. For example if we searched for-
482 \b{\\Bon\\B} in "Left on" the match would fail (space-
483 and end of string aren't non-word boundaries), but it would-
484 match in "t\underline{on}ne".-
485-
486 \row \li \b{(?=\e E)}-
487 \li Positive lookahead. This assertion is true if the-
488 expression matches at this point in the regexp. For example,-
489 \b{const(?=\\s+char)} matches 'const' whenever it is-
490 followed by 'char', as in 'static \underline{const} char *'.-
491 (Compare with \b{const\\s+char}, which matches 'static-
492 \underline{const char} *'.)-
493-
494 \row \li \b{(?!\e E)}-
495 \li Negative lookahead. This assertion is true if the-
496 expression does not match at this point in the regexp. For-
497 example, \b{const(?!\\s+char)} matches 'const' \e except-
498 when it is followed by 'char'.-
499 \endtable-
500-
501 \target QRegExp wildcard matching-
502 \section1 Wildcard Matching-
503-
504 Most command shells such as \e bash or \e cmd.exe support "file-
505 globbing", the ability to identify a group of files by using-
506 wildcards. The setPatternSyntax() function is used to switch-
507 between regexp and wildcard mode. Wildcard matching is much-
508 simpler than full regexps and has only four features:-
509-
510 \table-
511 \row \li \b{c}-
512 \li Any character represents itself apart from those mentioned-
513 below. Thus \b{c} matches the character \e c.-
514 \row \li \b{?}-
515 \li Matches any single character. It is the same as-
516 \b{.} in full regexps.-
517 \row \li \b{*}-
518 \li Matches zero or more of any characters. It is the-
519 same as \b{.*} in full regexps.-
520 \row \li \b{[...]}-
521 \li Sets of characters can be represented in square brackets,-
522 similar to full regexps. Within the character class, like-
523 outside, backslash has no special meaning.-
524 \endtable-
525-
526 In the mode Wildcard, the wildcard characters cannot be-
527 escaped. In the mode WildcardUnix, the character '\\' escapes the-
528 wildcard.-
529-
530 For example if we are in wildcard mode and have strings which-
531 contain filenames we could identify HTML files with \b{*.html}.-
532 This will match zero or more characters followed by a dot followed-
533 by 'h', 't', 'm' and 'l'.-
534-
535 To test a string against a wildcard expression, use exactMatch().-
536 For example:-
537-
538 \snippet code/src_corelib_tools_qregexp.cpp 1-
539-
540 \target perl-users-
541 \section1 Notes for Perl Users-
542-
543 Most of the character class abbreviations supported by Perl are-
544 supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}-
545 {characters and abbreviations for sets of characters}.-
546-
547 In QRegExp, apart from within character classes, \c{^} always-
548 signifies the start of the string, so carets must always be-
549 escaped unless used for that purpose. In Perl the meaning of caret-
550 varies automagically depending on where it occurs so escaping it-
551 is rarely necessary. The same applies to \c{$} which in-
552 QRegExp always signifies the end of the string.-
553-
554 QRegExp's quantifiers are the same as Perl's greedy quantifiers-
555 (but see the \l{greedy quantifiers}{note above}). Non-greedy-
556 matching cannot be applied to individual quantifiers, but can be-
557 applied to all the quantifiers in the pattern. For example, to-
558 match the Perl regexp \b{ro+?m} requires:-
559-
560 \snippet code/src_corelib_tools_qregexp.cpp 2-
561-
562 The equivalent of Perl's \c{/i} option is-
563 setCaseSensitivity(Qt::CaseInsensitive).-
564-
565 Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.-
566-
567 In QRegExp \b{.} matches any character, therefore all QRegExp-
568 regexps have the equivalent of Perl's \c{/s} option. QRegExp-
569 does not have an equivalent to Perl's \c{/m} option, but this-
570 can be emulated in various ways for example by splitting the input-
571 into lines or by looping with a regexp that searches for newlines.-
572-
573 Because QRegExp is string oriented, there are no \\A, \\Z, or \\z-
574 assertions. The \\G assertion is not supported but can be emulated-
575 in a loop.-
576-
577 Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp-
578 equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,-
579 ... correspond to cap(1) or capturedTexts()[1], cap(2) or-
580 capturedTexts()[2], etc.-
581-
582 To substitute a pattern use QString::replace().-
583-
584 Perl's extended \c{/x} syntax is not supported, nor are-
585 directives, e.g. (?i), or regexp comments, e.g. (?#comment). On-
586 the other hand, C++'s rules for literal strings can be used to-
587 achieve the same:-
588-
589 \snippet code/src_corelib_tools_qregexp.cpp 3-
590-
591 Both zero-width positive and zero-width negative lookahead-
592 assertions (?=pattern) and (?!pattern) are supported with the same-
593 syntax as Perl. Perl's lookbehind assertions, "independent"-
594 subexpressions and conditional expressions are not supported.-
595-
596 Non-capturing parentheses are also supported, with the same-
597 (?:pattern) syntax.-
598-
599 See QString::split() and QStringList::join() for equivalents-
600 to Perl's split and join functions.-
601-
602 Note: because C++ transforms \\'s they must be written \e twice in-
603 code, e.g. \b{\\b} must be written \b{\\\\b}.-
604-
605 \target code-examples-
606 \section1 Code Examples-
607-
608 \snippet code/src_corelib_tools_qregexp.cpp 4-
609-
610 The third string matches '\underline{6}'. This is a simple validation-
611 regexp for integers in the range 0 to 99.-
612-
613 \snippet code/src_corelib_tools_qregexp.cpp 5-
614-
615 The second string matches '\underline{This_is-OK}'. We've used the-
616 character set abbreviation '\\S' (non-whitespace) and the anchors-
617 to match strings which contain no whitespace.-
618-
619 In the following example we match strings containing 'mail' or-
620 'letter' or 'correspondence' but only match whole words i.e. not-
621 'email'.-
622-
623 \snippet code/src_corelib_tools_qregexp.cpp 6-
624-
625 The second string matches "Please write the \underline{letter}". The-
626 word 'letter' is also captured (because of the parentheses). We-
627 can see what text we've captured like this:-
628-
629 \snippet code/src_corelib_tools_qregexp.cpp 7-
630-
631 This will capture the text from the first set of capturing-
632 parentheses (counting capturing left parentheses from left to-
633 right). The parentheses are counted from 1 since cap(0) is the-
634 whole matched regexp (equivalent to '&' in most regexp engines).-
635-
636 \snippet code/src_corelib_tools_qregexp.cpp 8-
637-
638 Here we've passed the QRegExp to QString's replace() function to-
639 replace the matched text with new text.-
640-
641 \snippet code/src_corelib_tools_qregexp.cpp 9-
642-
643 We've used the indexIn() function to repeatedly match the regexp in-
644 the string. Note that instead of moving forward by one character-
645 at a time \c pos++ we could have written \c {pos +=-
646 rx.matchedLength()} to skip over the already matched string. The-
647 count will equal 3, matching 'One \underline{Eric} another-
648 \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it-
649 doesn't match 'Ericsson' or 'Eiriks' because they are not bounded-
650 by word boundaries.-
651-
652 One common use of regexps is to split lines of delimited data into-
653 their component fields.-
654-
655 \snippet code/src_corelib_tools_qregexp.cpp 10-
656-
657 In this example our input lines have the format company name, web-
658 address and country. Unfortunately the regexp is rather long and-
659 not very versatile -- the code will break if we add any more-
660 fields. A simpler and better solution is to look for the-
661 separator, '\\t' in this case, and take the surrounding text. The-
662 QString::split() function can take a separator string or regexp-
663 as an argument and split a string accordingly.-
664-
665 \snippet code/src_corelib_tools_qregexp.cpp 11-
666-
667 Here field[0] is the company, field[1] the web address and so on.-
668-
669 To imitate the matching of a shell we can use wildcard mode.-
670-
671 \snippet code/src_corelib_tools_qregexp.cpp 12-
672-
673 Wildcard matching can be convenient because of its simplicity, but-
674 any wildcard regexp can be defined using full regexps, e.g.-
675 \b{.*\\.html$}. Notice that we can't match both \c .html and \c-
676 .htm files with a wildcard unless we use \b{*.htm*} which will-
677 also match 'test.html.bak'. A full regexp gives us the precision-
678 we need, \b{.*\\.html?$}.-
679-
680 QRegExp can match case insensitively using setCaseSensitivity(),-
681 and can use non-greedy matching, see setMinimal(). By-
682 default QRegExp uses full regexps but this can be changed with-
683 setPatternSyntax(). Searching can be done forward with indexIn() or backward-
684 with lastIndexIn(). Captured text can be accessed using-
685 capturedTexts() which returns a string list of all captured-
686 strings, or using cap() which returns the captured string for the-
687 given index. The pos() function takes a match index and returns-
688 the position in the string where the match was made (or -1 if-
689 there was no match).-
690-
691 \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,-
692 {tools/regexp}{Regular Expression Example}-
693*/-
694-
695#if defined(Q_OS_VXWORKS) && defined(EOS)-
696# undef EOS-
697#endif-
698-
699const int NumBadChars = 64;-
700#define BadChar(ch) ((ch).unicode() % NumBadChars)-
701-
702const int NoOccurrence = INT_MAX;-
703const int EmptyCapture = INT_MAX;-
704const int InftyLen = INT_MAX;-
705const int InftyRep = 1025;-
706const int EOS = -1;-
707-
708static bool isWord(QChar ch)-
709{-
710 return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');-
711}-
712-
713/*-
714 Merges two vectors of ints and puts the result into the first-
715 one.-
716*/-
717static void mergeInto(QVector<int> *a, const QVector<int> &b)-
718{-
719 int asize = a->size();-
720 int bsize = b.size();-
721 if (asize == 0) {-
722 *a = b;-
723#ifndef QT_NO_REGEXP_OPTIM-
724 } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {-
725 a->resize(asize + 1);-
726 (*a)[asize] = b.at(0);-
727#endif-
728 } else if (bsize >= 1) {-
729 int csize = asize + bsize;-
730 QVector<int> c(csize);-
731 int i = 0, j = 0, k = 0;-
732 while (i < asize) {-
733 if (j < bsize) {-
734 if (a->at(i) == b.at(j)) {-
735 ++i;-
736 --csize;-
737 } else if (a->at(i) < b.at(j)) {-
738 c[k++] = a->at(i++);-
739 } else {-
740 c[k++] = b.at(j++);-
741 }-
742 } else {-
743 memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));-
744 break;-
745 }-
746 }-
747 c.resize(csize);-
748 if (j < bsize)-
749 memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));-
750 *a = c;-
751 }-
752}-
753-
754#ifndef QT_NO_REGEXP_WILDCARD-
755/*-
756 Translates a wildcard pattern to an equivalent regular expression-
757 pattern (e.g., *.cpp to .*\.cpp).-
758-
759 If enableEscaping is true, it is possible to escape the wildcard-
760 characters with \-
761*/-
762static QString wc2rx(const QString &wc_str, const bool enableEscaping)-
763{-
764 const int wclen = wc_str.length();-
765 QString rx;-
766 int i = 0;-
767 bool isEscaping = false; // the previous character is '\'-
768 const QChar *wc = wc_str.unicode();-
769-
770 while (i < wclen) {-
771 const QChar c = wc[i++];-
772 switch (c.unicode()) {-
773 case '\\':-
774 if (enableEscaping) {-
775 if (isEscaping) {-
776 rx += QLatin1String("\\\\");-
777 } // we insert the \\ later if necessary-
778 if (i == wclen) { // the end-
779 rx += QLatin1String("\\\\");-
780 }-
781 } else {-
782 rx += QLatin1String("\\\\");-
783 }-
784 isEscaping = true;-
785 break;-
786 case '*':-
787 if (isEscaping) {-
788 rx += QLatin1String("\\*");-
789 isEscaping = false;-
790 } else {-
791 rx += QLatin1String(".*");-
792 }-
793 break;-
794 case '?':-
795 if (isEscaping) {-
796 rx += QLatin1String("\\?");-
797 isEscaping = false;-
798 } else {-
799 rx += QLatin1Char('.');-
800 }-
801-
802 break;-
803 case '$':-
804 case '(':-
805 case ')':-
806 case '+':-
807 case '.':-
808 case '^':-
809 case '{':-
810 case '|':-
811 case '}':-
812 if (isEscaping) {-
813 isEscaping = false;-
814 rx += QLatin1String("\\\\");-
815 }-
816 rx += QLatin1Char('\\');-
817 rx += c;-
818 break;-
819 case '[':-
820 if (isEscaping) {-
821 isEscaping = false;-
822 rx += QLatin1String("\\[");-
823 } else {-
824 rx += c;-
825 if (wc[i] == QLatin1Char('^'))-
826 rx += wc[i++];-
827 if (i < wclen) {-
828 if (rx[i] == QLatin1Char(']'))-
829 rx += wc[i++];-
830 while (i < wclen && wc[i] != QLatin1Char(']')) {-
831 if (wc[i] == QLatin1Char('\\'))-
832 rx += QLatin1Char('\\');-
833 rx += wc[i++];-
834 }-
835 }-
836 }-
837 break;-
838-
839 case ']':-
840 if(isEscaping){-
841 isEscaping = false;-
842 rx += QLatin1String("\\");-
843 }-
844 rx += c;-
845 break;-
846-
847 default:-
848 if(isEscaping){-
849 isEscaping = false;-
850 rx += QLatin1String("\\\\");-
851 }-
852 rx += c;-
853 }-
854 }-
855 return rx;-
856}-
857#endif-
858-
859static int caretIndex(int offset, QRegExp::CaretMode caretMode)-
860{-
861 if (caretMode == QRegExp::CaretAtZero) {-
862 return 0;-
863 } else if (caretMode == QRegExp::CaretAtOffset) {-
864 return offset;-
865 } else { // QRegExp::CaretWontMatch-
866 return -1;-
867 }-
868}-
869-
870/*-
871 The QRegExpEngineKey struct uniquely identifies an engine.-
872*/-
873struct QRegExpEngineKey-
874{-
875 QString pattern;-
876 QRegExp::PatternSyntax patternSyntax;-
877 Qt::CaseSensitivity cs;-
878-
879 inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,-
880 Qt::CaseSensitivity cs)-
881 : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}-
882-
883 inline void clear() {-
884 pattern.clear();-
885 patternSyntax = QRegExp::RegExp;-
886 cs = Qt::CaseSensitive;-
887 }-
888};-
889-
890static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)-
891{-
892 return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax-
893 && key1.cs == key2.cs;-
894}-
895-
896static uint qHash(const QRegExpEngineKey &key, uint seed = 0) Q_DECL_NOTHROW-
897{-
898 QtPrivate::QHashCombine hash;-
899 seed = hash(seed, key.pattern);-
900 seed = hash(seed, key.patternSyntax);-
901 seed = hash(seed, key.cs);-
902 return seed;
executed 630297 times by 167 tests: return seed;
Executed by:
  • tst_Collections
  • tst_Lancelot
  • tst_ModelTest
  • tst_NetworkSelfTest
  • tst_QAbstractFileEngine
  • tst_QAbstractItemModel
  • tst_QAbstractItemView
  • tst_QAbstractNetworkCache
  • tst_QAbstractPrintDialog
  • tst_QAbstractScrollArea
  • tst_QAccessibility
  • tst_QApplication
  • tst_QBrush
  • tst_QButtonGroup
  • tst_QCalendarWidget
  • tst_QColorDialog
  • tst_QColumnView
  • tst_QComboBox
  • tst_QCommandLinkButton
  • tst_QCompleter
  • tst_QCssParser
  • tst_QDBusInterface
  • tst_QDataStream
  • tst_QDate
  • tst_QDateTime
  • ...
630297
903}-
904-
905class QRegExpEngine;-
906-
907//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);-
908-
909/*-
910 This is the engine state during matching.-
911*/-
912struct QRegExpMatchState-
913{-
914 const QChar *in; // a pointer to the input string data-
915 int pos; // the current position in the string-
916 int caretPos;-
917 int len; // the length of the input string-
918 bool minimal; // minimal matching?-
919 int *bigArray; // big array holding the data for the next pointers-
920 int *inNextStack; // is state is nextStack?-
921 int *curStack; // stack of current states-
922 int *nextStack; // stack of next states-
923 int *curCapBegin; // start of current states' captures-
924 int *nextCapBegin; // start of next states' captures-
925 int *curCapEnd; // end of current states' captures-
926 int *nextCapEnd; // end of next states' captures-
927 int *tempCapBegin; // start of temporary captures-
928 int *tempCapEnd; // end of temporary captures-
929 int *capBegin; // start of captures for a next state-
930 int *capEnd; // end of captures for a next state-
931 int *slideTab; // bump-along slide table for bad-character heuristic-
932 int *captured; // what match() returned last-
933 int slideTabSize; // size of slide table-
934 int capturedSize;-
935#ifndef QT_NO_REGEXP_BACKREF-
936 QList<QVector<int> > sleeping; // list of back-reference sleepers-
937#endif-
938 int matchLen; // length of match-
939 int oneTestMatchedLen; // length of partial match-
940-
941 const QRegExpEngine *eng;-
942-
943 inline QRegExpMatchState() : bigArray(0), captured(0) {}-
944 inline ~QRegExpMatchState() { free(bigArray); }-
945-
946 void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory-
947 void prepareForMatch(QRegExpEngine *eng);-
948 void match(const QChar *str, int len, int pos, bool minimal,-
949 bool oneTest, int caretIndex);-
950 bool matchHere();-
951 bool testAnchor(int i, int a, const int *capBegin);-
952};-
953-
954/*-
955 The struct QRegExpAutomatonState represents one state in a modified NFA. The-
956 input characters matched are stored in the state instead of on-
957 the transitions, something possible for an automaton-
958 constructed from a regular expression.-
959*/-
960struct QRegExpAutomatonState-
961{-
962#ifndef QT_NO_REGEXP_CAPTURE-
963 int atom; // which atom does this state belong to?-
964#endif-
965 int match; // what does it match? (see CharClassBit and BackRefBit)-
966 QVector<int> outs; // out-transitions-
967 QMap<int, int> reenter; // atoms reentered when transiting out-
968 QMap<int, int> anchors; // anchors met when transiting out-
969-
970 inline QRegExpAutomatonState() { }-
971#ifndef QT_NO_REGEXP_CAPTURE-
972 inline QRegExpAutomatonState(int a, int m)-
973 : atom(a), match(m) { }-
974#else-
975 inline QRegExpAutomatonState(int m)-
976 : match(m) { }-
977#endif-
978};-
979-
980Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);-
981-
982/*-
983 The struct QRegExpCharClassRange represents a range of characters (e.g.,-
984 [0-9] denotes range 48 to 57).-
985*/-
986struct QRegExpCharClassRange-
987{-
988 ushort from; // 48-
989 ushort len; // 10-
990};-
991-
992Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);-
993-
994#ifndef QT_NO_REGEXP_CAPTURE-
995/*-
996 The struct QRegExpAtom represents one node in the hierarchy of regular-
997 expression atoms.-
998*/-
999struct QRegExpAtom-
1000{-
1001 enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };-
1002-
1003 int parent; // index of parent in array of atoms-
1004 int capture; // index of capture, from 1 to ncap - 1-
1005};-
1006-
1007Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);-
1008#endif-
1009-
1010struct QRegExpLookahead;-
1011-
1012#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1013/*-
1014 The struct QRegExpAnchorAlternation represents a pair of anchors with-
1015 OR semantics.-
1016*/-
1017struct QRegExpAnchorAlternation-
1018{-
1019 int a; // this anchor...-
1020 int b; // ...or this one-
1021};-
1022-
1023Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);-
1024#endif-
1025-
1026#ifndef QT_NO_REGEXP_CCLASS-
1027-
1028#define FLAG(x) (1 << (x))-
1029/*-
1030 The class QRegExpCharClass represents a set of characters, such as can-
1031 be found in regular expressions (e.g., [a-z] denotes the set-
1032 {a, b, ..., z}).-
1033*/-
1034class QRegExpCharClass-
1035{-
1036public:-
1037 QRegExpCharClass();-
1038-
1039 void clear();-
1040 bool negative() const { return n; }-
1041 void setNegative(bool negative);-
1042 void addCategories(uint cats);-
1043 void addRange(ushort from, ushort to);-
1044 void addSingleton(ushort ch) { addRange(ch, ch); }-
1045-
1046 bool in(QChar ch) const;-
1047#ifndef QT_NO_REGEXP_OPTIM-
1048 const QVector<int> &firstOccurrence() const { return occ1; }-
1049#endif-
1050-
1051#if defined(QT_DEBUG)-
1052 void dump() const;-
1053#endif-
1054-
1055private:-
1056 QVector<QRegExpCharClassRange> r; // character ranges-
1057#ifndef QT_NO_REGEXP_OPTIM-
1058 QVector<int> occ1; // first-occurrence array-
1059#endif-
1060 uint c; // character classes-
1061 bool n; // negative?-
1062};-
1063#else-
1064struct QRegExpCharClass-
1065{-
1066 int dummy;-
1067-
1068#ifndef QT_NO_REGEXP_OPTIM-
1069 QRegExpCharClass() { occ1.fill(0, NumBadChars); }-
1070-
1071 const QVector<int> &firstOccurrence() const { return occ1; }-
1072 QVector<int> occ1;-
1073#endif-
1074};-
1075#endif-
1076-
1077Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);-
1078-
1079/*-
1080 The QRegExpEngine class encapsulates a modified nondeterministic-
1081 finite automaton (NFA).-
1082*/-
1083class QRegExpEngine-
1084{-
1085public:-
1086 QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)-
1087 : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }-
1088-
1089 QRegExpEngine(const QRegExpEngineKey &key);-
1090 ~QRegExpEngine();-
1091-
1092 bool isValid() const { return valid; }-
1093 const QString &errorString() const { return yyError; }-
1094 int captureCount() const { return officialncap; }-
1095-
1096 int createState(QChar ch);-
1097 int createState(const QRegExpCharClass &cc);-
1098#ifndef QT_NO_REGEXP_BACKREF-
1099 int createState(int bref);-
1100#endif-
1101-
1102 void addCatTransitions(const QVector<int> &from, const QVector<int> &to);-
1103#ifndef QT_NO_REGEXP_CAPTURE-
1104 void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);-
1105#endif-
1106-
1107#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1108 int anchorAlternation(int a, int b);-
1109 int anchorConcatenation(int a, int b);-
1110#else-
1111 int anchorAlternation(int a, int b) { return a & b; }-
1112 int anchorConcatenation(int a, int b) { return a | b; }-
1113#endif-
1114 void addAnchors(int from, int to, int a);-
1115-
1116#ifndef QT_NO_REGEXP_OPTIM-
1117 void heuristicallyChooseHeuristic();-
1118#endif-
1119-
1120#if defined(QT_DEBUG)-
1121 void dump() const;-
1122#endif-
1123-
1124 QAtomicInt ref;-
1125-
1126private:-
1127 enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };-
1128 enum { InitialState = 0, FinalState = 1 };-
1129-
1130 void setup();-
1131 int setupState(int match);-
1132-
1133 /*-
1134 Let's hope that 13 lookaheads and 14 back-references are-
1135 enough.-
1136 */-
1137 enum { MaxLookaheads = 13, MaxBackRefs = 14 };-
1138 enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,-
1139 Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,-
1140 Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,-
1141 Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,-
1142 Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,-
1143-
1144 Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^-
1145 ((Anchor_FirstLookahead << MaxLookaheads) - 1) };-
1146#ifndef QT_NO_REGEXP_CAPTURE-
1147 int startAtom(bool officialCapture);-
1148 void finishAtom(int atom, bool needCapture);-
1149#endif-
1150-
1151#ifndef QT_NO_REGEXP_LOOKAHEAD-
1152 int addLookahead(QRegExpEngine *eng, bool negative);-
1153#endif-
1154-
1155#ifndef QT_NO_REGEXP_OPTIM-
1156 bool goodStringMatch(QRegExpMatchState &matchState) const;-
1157 bool badCharMatch(QRegExpMatchState &matchState) const;-
1158#else-
1159 bool bruteMatch(QRegExpMatchState &matchState) const;-
1160#endif-
1161-
1162 QVector<QRegExpAutomatonState> s; // array of states-
1163#ifndef QT_NO_REGEXP_CAPTURE-
1164 QVector<QRegExpAtom> f; // atom hierarchy-
1165 int nf; // number of atoms-
1166 int cf; // current atom-
1167 QVector<int> captureForOfficialCapture;-
1168#endif-
1169 int officialncap; // number of captures, seen from the outside-
1170 int ncap; // number of captures, seen from the inside-
1171#ifndef QT_NO_REGEXP_CCLASS-
1172 QVector<QRegExpCharClass> cl; // array of character classes-
1173#endif-
1174#ifndef QT_NO_REGEXP_LOOKAHEAD-
1175 QVector<QRegExpLookahead *> ahead; // array of lookaheads-
1176#endif-
1177#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1178 QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors-
1179#endif-
1180#ifndef QT_NO_REGEXP_OPTIM-
1181 bool caretAnchored; // does the regexp start with ^?-
1182 bool trivial; // is the good-string all that needs to match?-
1183#endif-
1184 bool valid; // is the regular expression valid?-
1185 Qt::CaseSensitivity cs; // case sensitive?-
1186 bool greedyQuantifiers; // RegExp2?-
1187 bool xmlSchemaExtensions;-
1188#ifndef QT_NO_REGEXP_BACKREF-
1189 int nbrefs; // number of back-references-
1190#endif-
1191-
1192#ifndef QT_NO_REGEXP_OPTIM-
1193 bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch-
1194-
1195 int goodEarlyStart; // the index where goodStr can first occur in a match-
1196 int goodLateStart; // the index where goodStr can last occur in a match-
1197 QString goodStr; // the string that any match has to contain-
1198-
1199 int minl; // the minimum length of a match-
1200 QVector<int> occ1; // first-occurrence array-
1201#endif-
1202-
1203 /*-
1204 The class Box is an abstraction for a regular expression-
1205 fragment. It can also be seen as one node in the syntax tree of-
1206 a regular expression with synthetized attributes.-
1207-
1208 Its interface is ugly for performance reasons.-
1209 */-
1210 class Box-
1211 {-
1212 public:-
1213 Box(QRegExpEngine *engine);-
1214 Box(const Box &b) { operator=(b); }-
1215-
1216 Box &operator=(const Box &b);-
1217-
1218 void clear() { operator=(Box(eng)); }-
1219 void set(QChar ch);-
1220 void set(const QRegExpCharClass &cc);-
1221#ifndef QT_NO_REGEXP_BACKREF-
1222 void set(int bref);-
1223#endif-
1224-
1225 void cat(const Box &b);-
1226 void orx(const Box &b);-
1227 void plus(int atom);-
1228 void opt();-
1229 void catAnchor(int a);-
1230#ifndef QT_NO_REGEXP_OPTIM-
1231 void setupHeuristics();-
1232#endif-
1233-
1234#if defined(QT_DEBUG)-
1235 void dump() const;-
1236#endif-
1237-
1238 private:-
1239 void addAnchorsToEngine(const Box &to) const;-
1240-
1241 QRegExpEngine *eng; // the automaton under construction-
1242 QVector<int> ls; // the left states (firstpos)-
1243 QVector<int> rs; // the right states (lastpos)-
1244 QMap<int, int> lanchors; // the left anchors-
1245 QMap<int, int> ranchors; // the right anchors-
1246 int skipanchors; // the anchors to match if the box is skipped-
1247-
1248#ifndef QT_NO_REGEXP_OPTIM-
1249 int earlyStart; // the index where str can first occur-
1250 int lateStart; // the index where str can last occur-
1251 QString str; // a string that has to occur in any match-
1252 QString leftStr; // a string occurring at the left of this box-
1253 QString rightStr; // a string occurring at the right of this box-
1254 int maxl; // the maximum length of this box (possibly InftyLen)-
1255#endif-
1256-
1257 int minl; // the minimum length of this box-
1258#ifndef QT_NO_REGEXP_OPTIM-
1259 QVector<int> occ1; // first-occurrence array-
1260#endif-
1261 };-
1262-
1263 friend class Box;-
1264-
1265 /*-
1266 This is the lexical analyzer for regular expressions.-
1267 */-
1268 enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,-
1269 Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,-
1270 Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };-
1271 int getChar();-
1272 int getEscape();-
1273#ifndef QT_NO_REGEXP_INTERVAL-
1274 int getRep(int def);-
1275#endif-
1276#ifndef QT_NO_REGEXP_LOOKAHEAD-
1277 void skipChars(int n);-
1278#endif-
1279 void error(const char *msg);-
1280 void startTokenizer(const QChar *rx, int len);-
1281 int getToken();-
1282-
1283 const QChar *yyIn; // a pointer to the input regular expression pattern-
1284 int yyPos0; // the position of yyTok in the input pattern-
1285 int yyPos; // the position of the next character to read-
1286 int yyLen; // the length of yyIn-
1287 int yyCh; // the last character read-
1288 QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens-
1289 int yyMinRep; // attribute for Tok_Quantifier-
1290 int yyMaxRep; // ditto-
1291 QString yyError; // syntax error or overflow during parsing?-
1292-
1293 /*-
1294 This is the syntactic analyzer for regular expressions.-
1295 */-
1296 int parse(const QChar *rx, int len);-
1297 void parseAtom(Box *box);-
1298 void parseFactor(Box *box);-
1299 void parseTerm(Box *box);-
1300 void parseExpression(Box *box);-
1301-
1302 int yyTok; // the last token read-
1303 bool yyMayCapture; // set this to false to disable capturing-
1304-
1305 friend struct QRegExpMatchState;-
1306};-
1307-
1308#ifndef QT_NO_REGEXP_LOOKAHEAD-
1309/*-
1310 The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,-
1311 (?=foo) and (?!bar)).-
1312*/-
1313struct QRegExpLookahead-
1314{-
1315 QRegExpEngine *eng; // NFA representing the embedded regular expression-
1316 bool neg; // negative lookahead?-
1317-
1318 inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)-
1319 : eng(eng0), neg(neg0) { }-
1320 inline ~QRegExpLookahead() { delete eng; }-
1321};-
1322#endif-
1323-
1324/*!-
1325 \internal-
1326 convert the pattern string to the RegExp syntax.-
1327-
1328 This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan-
1329 */-
1330Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)-
1331{-
1332 switch (patternSyntax) {-
1333#ifndef QT_NO_REGEXP_WILDCARD-
1334 case QRegExp::Wildcard:-
1335 return wc2rx(pattern, false);-
1336 case QRegExp::WildcardUnix:-
1337 return wc2rx(pattern, true);-
1338#endif-
1339 case QRegExp::FixedString:-
1340 return QRegExp::escape(pattern);-
1341 case QRegExp::W3CXmlSchema11:-
1342 default:-
1343 return pattern;-
1344 }-
1345}-
1346-
1347QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)-
1348 : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),-
1349 xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)-
1350{-
1351 setup();-
1352-
1353 QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax);-
1354-
1355 valid = (parse(rx.unicode(), rx.length()) == rx.length());-
1356 if (!valid) {-
1357#ifndef QT_NO_REGEXP_OPTIM-
1358 trivial = false;-
1359#endif-
1360 error(RXERR_LEFTDELIM);-
1361 }-
1362}-
1363-
1364QRegExpEngine::~QRegExpEngine()-
1365{-
1366#ifndef QT_NO_REGEXP_LOOKAHEAD-
1367 qDeleteAll(ahead);-
1368#endif-
1369}-
1370-
1371void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)-
1372{-
1373 /*-
1374 We use one QVector<int> for all the big data used a lot in-
1375 matchHere() and friends.-
1376 */-
1377 int ns = eng->s.size(); // number of states-
1378 int ncap = eng->ncap;-
1379#ifndef QT_NO_REGEXP_OPTIM-
1380 int newSlideTabSize = qMax(eng->minl + 1, 16);-
1381#else-
1382 int newSlideTabSize = 0;-
1383#endif-
1384 int numCaptures = eng->captureCount();-
1385 int newCapturedSize = 2 + 2 * numCaptures;-
1386 bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));-
1387-
1388 // set all internal variables only _after_ bigArray is realloc'ed-
1389 // to prevent a broken regexp in oom case-
1390-
1391 slideTabSize = newSlideTabSize;-
1392 capturedSize = newCapturedSize;-
1393 inNextStack = bigArray;-
1394 memset(inNextStack, -1, ns * sizeof(int));-
1395 curStack = inNextStack + ns;-
1396 nextStack = inNextStack + 2 * ns;-
1397-
1398 curCapBegin = inNextStack + 3 * ns;-
1399 nextCapBegin = curCapBegin + ncap * ns;-
1400 curCapEnd = curCapBegin + 2 * ncap * ns;-
1401 nextCapEnd = curCapBegin + 3 * ncap * ns;-
1402-
1403 tempCapBegin = curCapBegin + 4 * ncap * ns;-
1404 tempCapEnd = tempCapBegin + ncap;-
1405 capBegin = tempCapBegin + 2 * ncap;-
1406 capEnd = tempCapBegin + 3 * ncap;-
1407-
1408 slideTab = tempCapBegin + 4 * ncap;-
1409 captured = slideTab + slideTabSize;-
1410 memset(captured, -1, capturedSize*sizeof(int));-
1411 this->eng = eng;-
1412}-
1413-
1414/*-
1415 Tries to match in str and returns an array of (begin, length) pairs-
1416 for captured text. If there is no match, all pairs are (-1, -1).-
1417*/-
1418void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,-
1419 bool minimal0, bool oneTest, int caretIndex)-
1420{-
1421 bool matched = false;-
1422 QChar char_null;-
1423-
1424#ifndef QT_NO_REGEXP_OPTIM-
1425 if (eng->trivial && !oneTest) {-
1426 pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);-
1427 matchLen = eng->goodStr.length();-
1428 matched = (pos != -1);-
1429 } else-
1430#endif-
1431 {-
1432 in = str0;-
1433 if (in == 0)-
1434 in = &char_null;-
1435 pos = pos0;-
1436 caretPos = caretIndex;-
1437 len = len0;-
1438 minimal = minimal0;-
1439 matchLen = 0;-
1440 oneTestMatchedLen = 0;-
1441-
1442 if (eng->valid && pos >= 0 && pos <= len) {-
1443#ifndef QT_NO_REGEXP_OPTIM-
1444 if (oneTest) {-
1445 matched = matchHere();-
1446 } else {-
1447 if (pos <= len - eng->minl) {-
1448 if (eng->caretAnchored) {-
1449 matched = matchHere();-
1450 } else if (eng->useGoodStringHeuristic) {-
1451 matched = eng->goodStringMatch(*this);-
1452 } else {-
1453 matched = eng->badCharMatch(*this);-
1454 }-
1455 }-
1456 }-
1457#else-
1458 matched = oneTest ? matchHere() : eng->bruteMatch(*this);-
1459#endif-
1460 }-
1461 }-
1462-
1463 if (matched) {-
1464 int *c = captured;-
1465 *c++ = pos;-
1466 *c++ = matchLen;-
1467-
1468 int numCaptures = (capturedSize - 2) >> 1;-
1469#ifndef QT_NO_REGEXP_CAPTURE-
1470 for (int i = 0; i < numCaptures; ++i) {-
1471 int j = eng->captureForOfficialCapture.at(i);-
1472 if (capBegin[j] != EmptyCapture) {-
1473 int len = capEnd[j] - capBegin[j];-
1474 *c++ = (len > 0) ? pos + capBegin[j] : 0;-
1475 *c++ = len;-
1476 } else {-
1477 *c++ = -1;-
1478 *c++ = -1;-
1479 }-
1480 }-
1481#endif-
1482 } else {-
1483 // we rely on 2's complement here-
1484 memset(captured, -1, capturedSize * sizeof(int));-
1485 }-
1486}-
1487-
1488/*-
1489 The three following functions add one state to the automaton and-
1490 return the number of the state.-
1491*/-
1492-
1493int QRegExpEngine::createState(QChar ch)-
1494{-
1495 return setupState(ch.unicode());-
1496}-
1497-
1498int QRegExpEngine::createState(const QRegExpCharClass &cc)-
1499{-
1500#ifndef QT_NO_REGEXP_CCLASS-
1501 int n = cl.size();-
1502 cl += QRegExpCharClass(cc);-
1503 return setupState(CharClassBit | n);-
1504#else-
1505 Q_UNUSED(cc);-
1506 return setupState(CharClassBit);-
1507#endif-
1508}-
1509-
1510#ifndef QT_NO_REGEXP_BACKREF-
1511int QRegExpEngine::createState(int bref)-
1512{-
1513 if (bref > nbrefs) {-
1514 nbrefs = bref;-
1515 if (nbrefs > MaxBackRefs) {-
1516 error(RXERR_LIMIT);-
1517 return 0;-
1518 }-
1519 }-
1520 return setupState(BackRefBit | bref);-
1521}-
1522#endif-
1523-
1524/*-
1525 The two following functions add a transition between all pairs of-
1526 states (i, j) where i is found in from, and j is found in to.-
1527-
1528 Cat-transitions are distinguished from plus-transitions for-
1529 capturing.-
1530*/-
1531-
1532void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)-
1533{-
1534 for (int i = 0; i < from.size(); i++)-
1535 mergeInto(&s[from.at(i)].outs, to);-
1536}-
1537-
1538#ifndef QT_NO_REGEXP_CAPTURE-
1539void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)-
1540{-
1541 for (int i = 0; i < from.size(); i++) {-
1542 QRegExpAutomatonState &st = s[from.at(i)];-
1543 const QVector<int> oldOuts = st.outs;-
1544 mergeInto(&st.outs, to);-
1545 if (f.at(atom).capture != QRegExpAtom::NoCapture) {-
1546 for (int j = 0; j < to.size(); j++) {-
1547 // ### st.reenter.contains(to.at(j)) check looks suspicious-
1548 if (!st.reenter.contains(to.at(j)) &&-
1549 !std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)))-
1550 st.reenter.insert(to.at(j), atom);-
1551 }-
1552 }-
1553 }-
1554}-
1555#endif-
1556-
1557#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1558/*-
1559 Returns an anchor that means a OR b.-
1560*/-
1561int QRegExpEngine::anchorAlternation(int a, int b)-
1562{-
1563 if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)-
1564 return a & b;-
1565-
1566 int n = aa.size();-
1567#ifndef QT_NO_REGEXP_OPTIM-
1568 if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)-
1569 return Anchor_Alternation | (n - 1);-
1570#endif-
1571-
1572 QRegExpAnchorAlternation element = {a, b};-
1573 aa.append(element);-
1574 return Anchor_Alternation | n;-
1575}-
1576-
1577/*-
1578 Returns an anchor that means a AND b.-
1579*/-
1580int QRegExpEngine::anchorConcatenation(int a, int b)-
1581{-
1582 if (((a | b) & Anchor_Alternation) == 0)-
1583 return a | b;-
1584 if ((b & Anchor_Alternation) != 0)-
1585 qSwap(a, b);-
1586-
1587 int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);-
1588 int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);-
1589 return anchorAlternation(aprime, bprime);-
1590}-
1591#endif-
1592-
1593/*-
1594 Adds anchor a on a transition caracterised by its from state and-
1595 its to state.-
1596*/-
1597void QRegExpEngine::addAnchors(int from, int to, int a)-
1598{-
1599 QRegExpAutomatonState &st = s[from];-
1600 if (st.anchors.contains(to))-
1601 a = anchorAlternation(st.anchors.value(to), a);-
1602 st.anchors.insert(to, a);-
1603}-
1604-
1605#ifndef QT_NO_REGEXP_OPTIM-
1606/*-
1607 This function chooses between the good-string and the bad-character-
1608 heuristics. It computes two scores and chooses the heuristic with-
1609 the highest score.-
1610-
1611 Here are some common-sense constraints on the scores that should be-
1612 respected if the formulas are ever modified: (1) If goodStr is-
1613 empty, the good-string heuristic scores 0. (2) If the regular-
1614 expression is trivial, the good-string heuristic should be used.-
1615 (3) If the search is case insensitive, the good-string heuristic-
1616 should be used, unless it scores 0. (Case insensitivity turns all-
1617 entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is-
1618 big, the good-string heuristic should score less.-
1619*/-
1620void QRegExpEngine::heuristicallyChooseHeuristic()-
1621{-
1622 if (minl == 0) {-
1623 useGoodStringHeuristic = false;-
1624 } else if (trivial) {-
1625 useGoodStringHeuristic = true;-
1626 } else {-
1627 /*-
1628 Magic formula: The good string has to constitute a good-
1629 proportion of the minimum-length string, and appear at a-
1630 more-or-less known index.-
1631 */-
1632 int goodStringScore = (64 * goodStr.length() / minl) --
1633 (goodLateStart - goodEarlyStart);-
1634 /*-
1635 Less magic formula: We pick some characters at random, and-
1636 check whether they are good or bad.-
1637 */-
1638 int badCharScore = 0;-
1639 int step = qMax(1, NumBadChars / 32);-
1640 for (int i = 1; i < NumBadChars; i += step) {-
1641 if (occ1.at(i) == NoOccurrence)-
1642 badCharScore += minl;-
1643 else-
1644 badCharScore += occ1.at(i);-
1645 }-
1646 badCharScore /= minl;-
1647 useGoodStringHeuristic = (goodStringScore > badCharScore);-
1648 }-
1649}-
1650#endif-
1651-
1652#if defined(QT_DEBUG)-
1653void QRegExpEngine::dump() const-
1654{-
1655 int i, j;-
1656 qDebug("Case %ssensitive engine", cs ? "" : "in");-
1657 qDebug(" States");-
1658 for (i = 0; i < s.size(); i++) {-
1659 qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");-
1660#ifndef QT_NO_REGEXP_CAPTURE-
1661 if (nf > 0)-
1662 qDebug(" in atom %d", s[i].atom);-
1663#endif-
1664 int m = s[i].match;-
1665 if ((m & CharClassBit) != 0) {-
1666 qDebug(" match character class %d", m ^ CharClassBit);-
1667#ifndef QT_NO_REGEXP_CCLASS-
1668 cl[m ^ CharClassBit].dump();-
1669#else-
1670 qDebug(" negative character class");-
1671#endif-
1672 } else if ((m & BackRefBit) != 0) {-
1673 qDebug(" match back-reference %d", m ^ BackRefBit);-
1674 } else if (m >= 0x20 && m <= 0x7e) {-
1675 qDebug(" match 0x%.4x (%c)", m, m);-
1676 } else {-
1677 qDebug(" match 0x%.4x", m);-
1678 }-
1679 for (j = 0; j < s[i].outs.size(); j++) {-
1680 int next = s[i].outs[j];-
1681 qDebug(" -> %d", next);-
1682 if (s[i].reenter.contains(next))-
1683 qDebug(" [reenter %d]", s[i].reenter[next]);-
1684 if (s[i].anchors.value(next) != 0)-
1685 qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);-
1686 }-
1687 }-
1688#ifndef QT_NO_REGEXP_CAPTURE-
1689 if (nf > 0) {-
1690 qDebug(" Atom Parent Capture");-
1691 for (i = 0; i < nf; i++) {-
1692 if (f[i].capture == QRegExpAtom::NoCapture) {-
1693 qDebug(" %6d %6d nil", i, f[i].parent);-
1694 } else {-
1695 int cap = f[i].capture;-
1696 bool official = captureForOfficialCapture.contains(cap);-
1697 qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,-
1698 official ? "official" : "");-
1699 }-
1700 }-
1701 }-
1702#endif-
1703#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1704 for (i = 0; i < aa.size(); i++)-
1705 qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);-
1706#endif-
1707}-
1708#endif-
1709-
1710void QRegExpEngine::setup()-
1711{-
1712 ref.store(1);-
1713#ifndef QT_NO_REGEXP_CAPTURE-
1714 f.resize(32);-
1715 nf = 0;-
1716 cf = -1;-
1717#endif-
1718 officialncap = 0;-
1719 ncap = 0;-
1720#ifndef QT_NO_REGEXP_OPTIM-
1721 caretAnchored = true;-
1722 trivial = true;-
1723#endif-
1724 valid = false;-
1725#ifndef QT_NO_REGEXP_BACKREF-
1726 nbrefs = 0;-
1727#endif-
1728#ifndef QT_NO_REGEXP_OPTIM-
1729 useGoodStringHeuristic = true;-
1730 minl = 0;-
1731 occ1.fill(0, NumBadChars);-
1732#endif-
1733}-
1734-
1735int QRegExpEngine::setupState(int match)-
1736{-
1737#ifndef QT_NO_REGEXP_CAPTURE-
1738 s += QRegExpAutomatonState(cf, match);-
1739#else-
1740 s += QRegExpAutomatonState(match);-
1741#endif-
1742 return s.size() - 1;-
1743}-
1744-
1745#ifndef QT_NO_REGEXP_CAPTURE-
1746/*-
1747 Functions startAtom() and finishAtom() should be called to delimit-
1748 atoms. When a state is created, it is assigned to the current atom.-
1749 The information is later used for capturing.-
1750*/-
1751int QRegExpEngine::startAtom(bool officialCapture)-
1752{-
1753 if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())-
1754 f.resize((nf + 1) << 1);-
1755 f[nf].parent = cf;-
1756 cf = nf++;-
1757 f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;-
1758 return cf;-
1759}-
1760-
1761void QRegExpEngine::finishAtom(int atom, bool needCapture)-
1762{-
1763 if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)-
1764 f[atom].capture = QRegExpAtom::UnofficialCapture;-
1765 cf = f.at(atom).parent;-
1766}-
1767#endif-
1768-
1769#ifndef QT_NO_REGEXP_LOOKAHEAD-
1770/*-
1771 Creates a lookahead anchor.-
1772*/-
1773int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)-
1774{-
1775 int n = ahead.size();-
1776 if (n == MaxLookaheads) {-
1777 error(RXERR_LIMIT);-
1778 return 0;-
1779 }-
1780 ahead += new QRegExpLookahead(eng, negative);-
1781 return Anchor_FirstLookahead << n;-
1782}-
1783#endif-
1784-
1785#ifndef QT_NO_REGEXP_CAPTURE-
1786/*-
1787 We want the longest leftmost captures.-
1788*/-
/*
    Compares two sets of \a ncap capture zones, zone by zone. Returns true
    if the first set (\a begin1, \a end1) wins at the first zone where the
    two sets differ: it starts earlier there or, for equal starts, ends
    later. Returns false if the second set wins or the sets are identical.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int zone = 0; zone < ncap; ++zone) {
        const int startGain = begin2[zone] - begin1[zone]; // positive: set 1 starts earlier
        if (startGain != 0)
            return startGain > 0;

        const int endGain = end1[zone] - end2[zone]; // positive: set 1 ends later
        if (endGain != 0)
            return endGain > 0;
    }
    return false;
}
1802#endif-
1803-
1804/*-
1805 Returns \c true if anchor a matches at position pos + i in the input-
1806 string, otherwise false.-
1807*/-
1808bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)-
1809{-
1810 int j;-
1811-
1812#ifndef QT_NO_REGEXP_ANCHOR_ALT-
1813 if ((a & QRegExpEngine::Anchor_Alternation) != 0)-
1814 return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)-
1815 || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);-
1816#endif-
1817-
1818 if ((a & QRegExpEngine::Anchor_Caret) != 0) {-
1819 if (pos + i != caretPos)-
1820 return false;-
1821 }-
1822 if ((a & QRegExpEngine::Anchor_Dollar) != 0) {-
1823 if (pos + i != len)-
1824 return false;-
1825 }-
1826#ifndef QT_NO_REGEXP_ESCAPE-
1827 if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {-
1828 bool before = false;-
1829 bool after = false;-
1830 if (pos + i != 0)-
1831 before = isWord(in[pos + i - 1]);-
1832 if (pos + i != len)-
1833 after = isWord(in[pos + i]);-
1834 if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))-
1835 return false;-
1836 if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))-
1837 return false;-
1838 }-
1839#endif-
1840#ifndef QT_NO_REGEXP_LOOKAHEAD-
1841 if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {-
1842 const QVector<QRegExpLookahead *> &ahead = eng->ahead;-
1843 for (j = 0; j < ahead.size(); j++) {-
1844 if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {-
1845 QRegExpMatchState matchState;-
1846 matchState.prepareForMatch(ahead[j]->eng);-
1847 matchState.match(in + pos + i, len - pos - i, 0,-
1848 true, true, caretPos - pos - i);-
1849 if ((matchState.captured[0] == 0) == ahead[j]->neg)-
1850 return false;-
1851 }-
1852 }-
1853 }-
1854#endif-
1855#ifndef QT_NO_REGEXP_CAPTURE-
1856#ifndef QT_NO_REGEXP_BACKREF-
1857 for (j = 0; j < eng->nbrefs; j++) {-
1858 if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {-
1859 int i = eng->captureForOfficialCapture.at(j);-
1860 if (capBegin[i] != EmptyCapture)-
1861 return false;-
1862 }-
1863 }-
1864#endif-
1865#endif-
1866 return true;-
1867}-
1868-
1869#ifndef QT_NO_REGEXP_OPTIM-
1870/*-
1871 The three following functions are what Jeffrey Friedl would call-
1872 transmissions (or bump-alongs). Using one or the other should make-
1873 no difference except in performance.-
1874*/-
1875-
1876bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const-
1877{-
1878 int k = matchState.pos + goodEarlyStart;-
1879 QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);-
1880 while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) {-
1881 int from = k - goodLateStart;-
1882 int to = k - goodEarlyStart;-
1883 if (from > matchState.pos)-
1884 matchState.pos = from;-
1885-
1886 while (matchState.pos <= to) {-
1887 if (matchState.matchHere())-
1888 return true;-
1889 ++matchState.pos;-
1890 }-
1891 ++k;-
1892 }-
1893 return false;-
1894}-
1895-
1896bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const-
1897{-
1898 int slideHead = 0;-
1899 int slideNext = 0;-
1900 int i;-
1901 int lastPos = matchState.len - minl;-
1902 memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));-
1903-
1904 /*-
1905 Set up the slide table, used for the bad-character heuristic,-
1906 using the table of first occurrence of each character.-
1907 */-
1908 for (i = 0; i < minl; i++) {-
1909 int sk = occ1[BadChar(matchState.in[matchState.pos + i])];-
1910 if (sk == NoOccurrence)-
1911 sk = i + 1;-
1912 if (sk > 0) {-
1913 int k = i + 1 - sk;-
1914 if (k < 0) {-
1915 sk = i + 1;-
1916 k = 0;-
1917 }-
1918 if (sk > matchState.slideTab[k])-
1919 matchState.slideTab[k] = sk;-
1920 }-
1921 }-
1922-
1923 if (matchState.pos > lastPos)-
1924 return false;-
1925-
1926 for (;;) {-
1927 if (++slideNext >= matchState.slideTabSize)-
1928 slideNext = 0;-
1929 if (matchState.slideTab[slideHead] > 0) {-
1930 if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])-
1931 matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;-
1932 matchState.slideTab[slideHead] = 0;-
1933 } else {-
1934 if (matchState.matchHere())-
1935 return true;-
1936 }-
1937-
1938 if (matchState.pos == lastPos)-
1939 break;-
1940-
1941 /*-
1942 Update the slide table. This code has much in common with-
1943 the initialization code.-
1944 */-
1945 int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];-
1946 if (sk == NoOccurrence) {-
1947 matchState.slideTab[slideNext] = minl;-
1948 } else if (sk > 0) {-
1949 int k = slideNext + minl - sk;-
1950 if (k >= matchState.slideTabSize)-
1951 k -= matchState.slideTabSize;-
1952 if (sk > matchState.slideTab[k])-
1953 matchState.slideTab[k] = sk;-
1954 }-
1955 slideHead = slideNext;-
1956 ++matchState.pos;-
1957 }-
1958 return false;-
1959}-
1960#else-
1961bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const-
1962{-
1963 while (matchState.pos <= matchState.len) {-
1964 if (matchState.matchHere())-
1965 return true;-
1966 ++matchState.pos;-
1967 }-
1968 return false;-
1969}-
1970#endif-
1971-
1972/*-
1973 Here's the core of the engine. It tries to do a match here and now.-
1974*/-
/*
    Tries to match the automaton at input position pos. All live states
    are kept in curStack and advanced together, one input offset i per
    iteration of the outer loop; the states reached are collected in
    nextStack and the two stacks are swapped at the end of each iteration.

    On return, matchLen is the last offset at which the final state was
    reached (or -1 if it never was), and oneTestMatchedLen is i - 1 for
    the value of i at which the outer loop ended. Returns true if
    matchLen >= 0.
*/
1975bool QRegExpMatchState::matchHere()-
1976{-
1977 int ncur = 1, nnext = 0;-
1978 int i = 0, j, k, m;-
1979 bool stop = false;-
1980-
1981 matchLen = -1;-
1982 oneTestMatchedLen = -1;-
1983 curStack[0] = QRegExpEngine::InitialState;-
1984-
1985 int ncap = eng->ncap;-
1986#ifndef QT_NO_REGEXP_CAPTURE-
// All capture zones of the initial state start out empty.
1987 if (ncap > 0) {-
1988 for (j = 0; j < ncap; j++) {-
1989 curCapBegin[j] = EmptyCapture;-
1990 curCapEnd[j] = EmptyCapture;-
1991 }-
1992 }-
1993#endif-
1994-
// With back-references, the loop also keeps running while states are
// still waiting in the sleeping list.
1995#ifndef QT_NO_REGEXP_BACKREF-
1996 while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)-
1997#else-
1998 while (ncur > 0 && i <= len - pos && !stop)-
1999#endif-
2000 {-
// ch is 0 once offset i has reached the end of the input.
2001 int ch = (i < len - pos) ? in[pos + i].unicode() : 0;-
2002 for (j = 0; j < ncur; j++) {-
2003 int cur = curStack[j];-
2004 const QRegExpAutomatonState &scur = eng->s.at(cur);-
2005 const QVector<int> &outs = scur.outs;-
2006 for (k = 0; k < outs.size(); k++) {-
2007 int next = outs.at(k);-
2008 const QRegExpAutomatonState &snext = eng->s.at(next);-
2009 bool inside = true;-
2010#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)-
2011 int needSomeSleep = 0;-
2012#endif-
2013-
2014 /*-
2015 First, check if the anchors are anchored properly.-
2016 */-
2017 int a = scur.anchors.value(next);-
2018 if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))-
2019 inside = false;-
2020-
2021 /*-
2022 If indeed they are, check if the input character is-
2023 correct for this transition.-
2024 */-
2025 if (inside) {-
2026 m = snext.match;-
2027 if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {-
2028 if (eng->cs)-
2029 inside = (m == ch);-
2030 else-
2031 inside = (QChar(m).toLower() == QChar(ch).toLower());-
2032 } else if (next == QRegExpEngine::FinalState) {-
// Reaching the final state records a match of length i. In minimal
// mode, stop is set so the outer loop ends after this pass.
2033 matchLen = i;-
2034 stop = minimal;-
2035 inside = true;-
2036 } else if ((m & QRegExpEngine::CharClassBit) != 0) {-
2037#ifndef QT_NO_REGEXP_CCLASS-
2038 const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);-
2039 if (eng->cs)-
2040 inside = cc.in(ch);-
2041 else if (cc.negative())-
2042 inside = cc.in(QChar(ch).toLower()) &&-
2043 cc.in(QChar(ch).toUpper());-
2044 else-
2045 inside = cc.in(QChar(ch).toLower()) ||-
2046 cc.in(QChar(ch).toUpper());-
2047#endif-
2048#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)-
2049 } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */-
2050 int bref = m ^ QRegExpEngine::BackRefBit;-
2051 int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);-
2052-
2053 inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;-
2054 if (inside) {-
2055 if (eng->cs)-
2056 inside = (in[pos + curCapBegin[ell]] == QChar(ch));-
2057 else-
2058 inside = (in[pos + curCapBegin[ell]].toLower()-
2059 == QChar(ch).toLower());-
2060 }-
2061-
2062 if (inside) {-
// delta is the length of the captured text the back-reference must
// repeat; a capture that is still open is taken to end at offset i.
2063 int delta;-
2064 if (curCapEnd[ell] == EmptyCapture)-
2065 delta = i - curCapBegin[ell];-
2066 else-
2067 delta = curCapEnd[ell] - curCapBegin[ell];-
2068-
2069 inside = (delta <= len - (pos + i));-
2070 if (inside && delta > 1) {-
2071 int n = 1;-
2072 if (eng->cs) {-
2073 while (n < delta) {-
2074 if (in[pos + curCapBegin[ell] + n]-
2075 != in[pos + i + n])-
2076 break;-
2077 ++n;-
2078 }-
2079 } else {-
2080 while (n < delta) {-
2081 QChar a = in[pos + curCapBegin[ell] + n];-
2082 QChar b = in[pos + i + n];-
2083 if (a.toLower() != b.toLower())-
2084 break;-
2085 ++n;-
2086 }-
2087 }-
2088 inside = (n == delta);-
2089 if (inside)-
2090 needSomeSleep = delta - 1;-
2091 }-
2092 }-
2093#endif-
2094 }-
2095 }-
2096-
2097 /*-
2098 We must now update our data structures.-
2099 */-
2100 if (inside) {-
2101#ifndef QT_NO_REGEXP_CAPTURE-
2102 int *capBegin, *capEnd;-
2103#endif-
2104 /*-
2105 If the next state was not encountered yet, all-
2106 is fine.-
2107 */-
2108 if ((m = inNextStack[next]) == -1) {-
2109 m = nnext++;-
2110 nextStack[m] = next;-
2111 inNextStack[next] = m;-
2112#ifndef QT_NO_REGEXP_CAPTURE-
2113 capBegin = nextCapBegin + m * ncap;-
2114 capEnd = nextCapEnd + m * ncap;-
2115-
2116 /*-
2117 Otherwise, we'll first maintain captures in-
2118 temporary arrays, and decide at the end whether-
2119 it's best to keep the previous capture zones or-
2120 the new ones.-
2121 */-
2122 } else {-
2123 capBegin = tempCapBegin;-
2124 capEnd = tempCapEnd;-
2125#endif-
2126 }-
2127-
2128#ifndef QT_NO_REGEXP_CAPTURE-
2129 /*-
2130 Updating the capture zones is much of a task.-
2131 */-
2132 if (ncap > 0) {-
2133 memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));-
2134 memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));-
2135 int c = scur.atom, n = snext.atom;-
2136 int p = -1, q = -1;-
2137 int cap;-
2138-
2139 /*-
2140 Lemma 1. For any x in the range [0..nf), we-
2141 have f[x].parent < x.-
2142-
2143 Proof. By looking at startAtom(), it is-
2144 clear that cf < nf holds all the time, and-
2145 thus that f[nf].parent < nf.-
2146 */-
2147-
2148 /*-
2149 If we are reentering an atom, we empty all-
2150 capture zones inside it.-
2151 */-
2152 if ((q = scur.reenter.value(next)) != 0) {-
2153 QBitArray b(eng->nf, false);-
2154 b.setBit(q, true);-
2155 for (int ell = q + 1; ell < eng->nf; ell++) {-
2156 if (b.testBit(eng->f.at(ell).parent)) {-
2157 b.setBit(ell, true);-
2158 cap = eng->f.at(ell).capture;-
2159 if (cap >= 0) {-
2160 capBegin[cap] = EmptyCapture;-
2161 capEnd[cap] = EmptyCapture;-
2162 }-
2163 }-
2164 }-
2165 p = eng->f.at(q).parent;-
2166-
2167 /*-
2168 Otherwise, close the capture zones we are-
2169 leaving. We are leaving f[c].capture,-
2170 f[f[c].parent].capture,-
2171 f[f[f[c].parent].parent].capture, ...,-
2172 until f[x].capture, with x such that-
2173 f[x].parent is the youngest common ancestor-
2174 for c and n.-
2175-
2176 We go up along c's and n's ancestry until-
2177 we find x.-
2178 */-
2179 } else {-
2180 p = c;-
2181 q = n;-
2182 while (p != q) {-
2183 if (p > q) {-
2184 cap = eng->f.at(p).capture;-
2185 if (cap >= 0) {-
2186 if (capBegin[cap] == i) {-
2187 capBegin[cap] = EmptyCapture;-
2188 capEnd[cap] = EmptyCapture;-
2189 } else {-
2190 capEnd[cap] = i;-
2191 }-
2192 }-
2193 p = eng->f.at(p).parent;-
2194 } else {-
2195 q = eng->f.at(q).parent;-
2196 }-
2197 }-
2198 }-
2199-
2200 /*-
2201 In any case, we now open the capture zones-
2202 we are entering. We work upwards from n-
2203 until we reach p (the parent of the atom we-
2204 reenter or the youngest common ancestor).-
2205 */-
2206 while (n > p) {-
2207 cap = eng->f.at(n).capture;-
2208 if (cap >= 0) {-
2209 capBegin[cap] = i;-
2210 capEnd[cap] = EmptyCapture;-
2211 }-
2212 n = eng->f.at(n).parent;-
2213 }-
2214 /*-
2215 If the next state was already in-
2216 nextStack, we must choose carefully which-
2217 capture zones we want to keep.-
2218 */-
2219 if (capBegin == tempCapBegin &&-
2220 isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,-
2221 nextCapEnd + m * ncap)) {-
2222 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));-
2223 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));-
2224 }-
2225 }-
2226#ifndef QT_NO_REGEXP_BACKREF-
2227 /*-
2228 We are done with updating the capture zones.-
2229 It's now time to put the next state to sleep,-
2230 if it needs to, and to remove it from-
2231 nextStack.-
2232 */-
2233 if (needSomeSleep > 0) {-
// Sleeper layout: [0] = offset at which to wake up, [1] = state,
// followed by ncap capture starts and ncap capture ends.
2234 QVector<int> zzZ(2 + 2 * ncap);-
2235 zzZ[0] = i + needSomeSleep;-
2236 zzZ[1] = next;-
2237 if (ncap > 0) {-
2238 memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));-
2239 memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));-
2240 }-
2241 inNextStack[nextStack[--nnext]] = -1;-
2242 sleeping.append(zzZ);-
2243 }-
2244#endif-
2245#endif-
2246 }-
2247 }-
2248 }-
2249#ifndef QT_NO_REGEXP_CAPTURE-
2250 /*-
2251 If we reached the final state, hurray! Copy the captured-
2252 zone.-
2253 */-
// NOTE(review): the local capBegin/capEnd pointers declared inside the
// transition loop are out of scope here, so these names must resolve to
// something declared outside this function -- presumably members of
// QRegExpMatchState; confirm against the class declaration.
2254 if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {-
2255 memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));-
2256 memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));-
2257 }-
2258#ifndef QT_NO_REGEXP_BACKREF-
2259 /*-
2260 It's time to wake up the sleepers.-
2261 */-
2262 j = 0;-
2263 while (j < sleeping.count()) {-
2264 if (sleeping.at(j)[0] == i) {-
2265 const QVector<int> &zzZ = sleeping.at(j);-
2266 int next = zzZ[1];-
2267 const int *capBegin = zzZ.data() + 2;-
2268 const int *capEnd = zzZ.data() + 2 + ncap;-
2269 bool copyOver = true;-
2270-
2271 if ((m = inNextStack[next]) == -1) {-
2272 m = nnext++;-
2273 nextStack[m] = next;-
2274 inNextStack[next] = m;-
2275 } else {-
2276 copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,-
2277 capBegin, capEnd);-
2278 }-
2279 if (copyOver) {-
2280 memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));-
2281 memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));-
2282 }-
2283-
2284 sleeping.removeAt(j);-
2285 } else {-
2286 ++j;-
2287 }-
2288 }-
2289#endif-
2290#endif-
// Clear the membership markers of every state queued during this pass.
2291 for (j = 0; j < nnext; j++)-
2292 inNextStack[nextStack[j]] = -1;-
2293-
2294 // avoid needless iteration that confuses oneTestMatchedLen-
2295 if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState-
2296#ifndef QT_NO_REGEXP_BACKREF-
2297 && sleeping.isEmpty()-
2298#endif-
2299 )-
2300 stop = true;-
2301-
// The states (and capture zones) just collected become the current ones.
2302 qSwap(curStack, nextStack);-
2303#ifndef QT_NO_REGEXP_CAPTURE-
2304 qSwap(curCapBegin, nextCapBegin);-
2305 qSwap(curCapEnd, nextCapEnd);-
2306#endif-
2307 ncur = nnext;-
2308 nnext = 0;-
2309 ++i;-
2310 }-
2311-
2312#ifndef QT_NO_REGEXP_BACKREF-
2313 /*-
2314 If minimal matching is enabled, we might have some sleepers-
2315 left.-
2316 */-
2317 if (!sleeping.isEmpty())-
2318 sleeping.clear();-
2319#endif-
2320-
2321 oneTestMatchedLen = i - 1;-
2322 return (matchLen >= 0);-
2323}-
2324-
2325#ifndef QT_NO_REGEXP_CCLASS-
2326-
2327QRegExpCharClass::QRegExpCharClass()-
2328 : c(0), n(false)-
2329{-
2330#ifndef QT_NO_REGEXP_OPTIM-
2331 occ1.fill(NoOccurrence, NumBadChars);-
2332#endif-
2333}-
2334-
2335void QRegExpCharClass::clear()-
2336{-
2337 c = 0;-
2338 r.resize(0);clear();-
2339 n = false;-
2340}
executed 34392 times by 116 tests: end of block
Executed by:
  • tst_Collections
  • tst_Lancelot
  • tst_ModelTest
  • tst_NetworkSelfTest
  • tst_QAbstractFileEngine
  • tst_QAbstractItemModel
  • tst_QAbstractItemView
  • tst_QAbstractNetworkCache
  • tst_QAccessibility
  • tst_QApplication
  • tst_QCalendarWidget
  • tst_QComboBox
  • tst_QCompleter
  • tst_QDBusInterface
  • tst_QDataStream
  • tst_QDate
  • tst_QDateTime
  • tst_QDateTimeEdit
  • tst_QDir
  • tst_QDirIterator
  • tst_QDirModel
  • tst_QFactoryLoader
  • tst_QFile
  • tst_QFileDialog2
  • tst_QFileIconProvider
  • ...
34392
2341-
2342void QRegExpCharClass::setNegative(bool negative)-
2343{-
2344 n = negative;-
2345#ifndef QT_NO_REGEXP_OPTIM-
2346 occ1.fill(0, NumBadChars);-
2347#endif-
2348}-
2349-
2350void QRegExpCharClass::addCategories(uint cats)-
2351{-
2352 static const int all_cats = FLAG(QChar::Mark_NonSpacing) |-
2353 FLAG(QChar::Mark_SpacingCombining) |-
2354 FLAG(QChar::Mark_Enclosing) |-
2355 FLAG(QChar::Number_DecimalDigit) |-
2356 FLAG(QChar::Number_Letter) |-
2357 FLAG(QChar::Number_Other) |-
2358 FLAG(QChar::Separator_Space) |-
2359 FLAG(QChar::Separator_Line) |-
2360 FLAG(QChar::Separator_Paragraph) |-
2361 FLAG(QChar::Other_Control) |-
2362 FLAG(QChar::Other_Format) |-
2363 FLAG(QChar::Other_Surrogate) |-
2364 FLAG(QChar::Other_PrivateUse) |-
2365 FLAG(QChar::Other_NotAssigned) |-
2366 FLAG(QChar::Letter_Uppercase) |-
2367 FLAG(QChar::Letter_Lowercase) |-
2368 FLAG(QChar::Letter_Titlecase) |-
2369 FLAG(QChar::Letter_Modifier) |-
2370 FLAG(QChar::Letter_Other) |-
2371 FLAG(QChar::Punctuation_Connector) |-
2372 FLAG(QChar::Punctuation_Dash) |-
2373 FLAG(QChar::Punctuation_Open) |-
2374 FLAG(QChar::Punctuation_Close) |-
2375 FLAG(QChar::Punctuation_InitialQuote) |-
2376 FLAG(QChar::Punctuation_FinalQuote) |-
2377 FLAG(QChar::Punctuation_Other) |-
2378 FLAG(QChar::Symbol_Math) |-
2379 FLAG(QChar::Symbol_Currency) |-
2380 FLAG(QChar::Symbol_Modifier) |-
2381 FLAG(QChar::Symbol_Other);-
2382 c |= (all_cats & cats);-
2383#ifndef QT_NO_REGEXP_OPTIM-
2384 occ1.fill(0, NumBadChars);-
2385#endif-
2386}-
2387-
2388void QRegExpCharClass::addRange(ushort from, ushort to)-
2389{-
2390 if (from > to)-
2391 qSwap(from, to);-
2392 int m = r.size();-
2393 r.resize(m + 1);-
2394 r[m].from = from;-
2395 r[m].len = to - from + 1;-
2396-
2397#ifndef QT_NO_REGEXP_OPTIM-
2398 int i;-
2399-
2400 if (to - from < NumBadChars) {-
2401 if (from % NumBadChars <= to % NumBadChars) {-
2402 for (i = from % NumBadChars; i <= to % NumBadChars; i++)-
2403 occ1[i] = 0;-
2404 } else {-
2405 for (i = 0; i <= to % NumBadChars; i++)-
2406 occ1[i] = 0;-
2407 for (i = from % NumBadChars; i < NumBadChars; i++)-
2408 occ1[i] = 0;-
2409 }-
2410 } else {-
2411 occ1.fill(0, NumBadChars);-
2412 }-
2413#endif-
2414}-
2415-
2416bool QRegExpCharClass::in(QChar ch) const-
2417{-
2418#ifndef QT_NO_REGEXP_OPTIM-
2419 if (occ1.at(BadChar(ch)) == NoOccurrence)-
2420 return n;-
2421#endif-
2422-
2423 if (c != 0 && (c & FLAG(ch.category())) != 0)-
2424 return !n;-
2425-
2426 const int uc = ch.unicode();-
2427 int size = r.size();-
2428-
2429 for (int i = 0; i < size; ++i) {-
2430 const QRegExpCharClassRange &range = r.at(i);-
2431 if (uint(uc - range.from) < uint(r.at(i).len))-
2432 return !n;-
2433 }-
2434 return n;-
2435}-
2436-
2437#if defined(QT_DEBUG)-
2438void QRegExpCharClass::dump() const-
2439{-
2440 int i;-
2441 qDebug(" %stive character class", n ? "nega" : "posi");-
2442#ifndef QT_NO_REGEXP_CCLASS-
2443 if (c != 0)-
2444 qDebug(" categories 0x%.8x", c);-
2445#endif-
2446 for (i = 0; i < r.size(); i++)-
2447 qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);-
2448}-
2449#endif-
2450#endif-
2451-
2452QRegExpEngine::Box::Box(QRegExpEngine *engine)-
2453 : eng(engine), skipanchors(0)-
2454#ifndef QT_NO_REGEXP_OPTIM-
2455 , earlyStart(0), lateStart(0), maxl(0)-
2456#endif-
2457{-
2458#ifndef QT_NO_REGEXP_OPTIM-
2459 occ1.fill(NoOccurrence, NumBadChars);-
2460#endif-
2461 minl = 0;-
2462}-
2463-
2464QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)-
2465{-
2466 eng = b.eng;-
2467 ls = b.ls;-
2468 rs = b.rs;-
2469 lanchors = b.lanchors;-
2470 ranchors = b.ranchors;-
2471 skipanchors = b.skipanchors;-
2472#ifndef QT_NO_REGEXP_OPTIM-
2473 earlyStart = b.earlyStart;-
2474 lateStart = b.lateStart;-
2475 str = b.str;-
2476 leftStr = b.leftStr;-
2477 rightStr = b.rightStr;-
2478 maxl = b.maxl;-
2479 occ1 = b.occ1;-
2480#endif-
2481 minl = b.minl;-
2482 return *this;-
2483}-
2484-
2485void QRegExpEngine::Box::set(QChar ch)-
2486{-
2487 ls.resize(1);-
2488 ls[0] = eng->createState(ch);-
2489 rs = ls;-
2490#ifndef QT_NO_REGEXP_OPTIM-
2491 str = ch;-
2492 leftStr = ch;-
2493 rightStr = ch;-
2494 maxl = 1;-
2495 occ1[BadChar(ch)] = 0;-
2496#endif-
2497 minl = 1;-
2498}-
2499-
2500void QRegExpEngine::Box::set(const QRegExpCharClass &cc)-
2501{-
2502 ls.resize(1);-
2503 ls[0] = eng->createState(cc);-
2504 rs = ls;-
2505#ifndef QT_NO_REGEXP_OPTIM-
2506 maxl = 1;-
2507 occ1 = cc.firstOccurrence();-
2508#endif-
2509 minl = 1;-
2510}-
2511-
2512#ifndef QT_NO_REGEXP_BACKREF-
2513void QRegExpEngine::Box::set(int bref)-
2514{-
2515 ls.resize(1);-
2516 ls[0] = eng->createState(bref);-
2517 rs = ls;-
2518 if (bref >= 1 && bref <= MaxBackRefs)-
2519 skipanchors = Anchor_BackRef0Empty << bref;-
2520#ifndef QT_NO_REGEXP_OPTIM-
2521 maxl = InftyLen;-
2522#endif-
2523 minl = 0;-
2524}-
2525#endif-
2526-
2527void QRegExpEngine::Box::cat(const Box &b)-
2528{-
2529 eng->addCatTransitions(rs, b.ls);-
2530 addAnchorsToEngine(b);-
2531 if (minl == 0) {-
2532 lanchors.unite(b.lanchors);-
2533 if (skipanchors != 0) {-
2534 for (int i = 0; i < b.ls.size(); i++) {-
2535 int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);-
2536 lanchors.insert(b.ls.at(i), a);-
2537 }-
2538 }-
2539 mergeInto(&ls, b.ls);-
2540 }-
2541 if (b.minl == 0) {-
2542 ranchors.unite(b.ranchors);-
2543 if (b.skipanchors != 0) {-
2544 for (int i = 0; i < rs.size(); i++) {-
2545 int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);-
2546 ranchors.insert(rs.at(i), a);-
2547 }-
2548 }-
2549 mergeInto(&rs, b.rs);-
2550 } else {-
2551 ranchors = b.ranchors;-
2552 rs = b.rs;-
2553 }-
2554-
2555#ifndef QT_NO_REGEXP_OPTIM-
2556 if (maxl != InftyLen) {-
2557 if (rightStr.length() + b.leftStr.length() >-
2558 qMax(str.length(), b.str.length())) {-
2559 earlyStart = minl - rightStr.length();-
2560 lateStart = maxl - rightStr.length();-
2561 str = rightStr + b.leftStr;-
2562 } else if (b.str.length() > str.length()) {-
2563 earlyStart = minl + b.earlyStart;-
2564 lateStart = maxl + b.lateStart;-
2565 str = b.str;-
2566 }-
2567 }-
2568-
2569 if (leftStr.length() == maxl)-
2570 leftStr += b.leftStr;-
2571-
2572 if (b.rightStr.length() == b.maxl) {-
2573 rightStr += b.rightStr;-
2574 } else {-
2575 rightStr = b.rightStr;-
2576 }-
2577-
2578 if (maxl == InftyLen || b.maxl == InftyLen) {-
2579 maxl = InftyLen;-
2580 } else {-
2581 maxl += b.maxl;-
2582 }-
2583-
2584 for (int i = 0; i < NumBadChars; i++) {-
2585 if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))-
2586 occ1[i] = minl + b.occ1.at(i);-
2587 }-
2588#endif-
2589-
2590 minl += b.minl;-
2591 if (minl == 0)-
2592 skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);-
2593 else-
2594 skipanchors = 0;-
2595}-
2596-
2597void QRegExpEngine::Box::orx(const Box &b)-
2598{-
2599 mergeInto(&ls, b.ls);-
2600 lanchors.unite(b.lanchors);-
2601 mergeInto(&rs, b.rs);-
2602 ranchors.unite(b.ranchors);-
2603-
2604 if (b.minl == 0) {-
2605 if (minl == 0)-
2606 skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);-
2607 else-
2608 skipanchors = b.skipanchors;-
2609 }-
2610-
2611#ifndef QT_NO_REGEXP_OPTIM-
2612 for (int i = 0; i < NumBadChars; i++) {-
2613 if (occ1.at(i) > b.occ1.at(i))-
2614 occ1[i] = b.occ1.at(i);-
2615 }-
2616 earlyStart = 0;-
2617 lateStart = 0;-
2618 str = QString();-
2619 leftStr = QString();-
2620 rightStr = QString();-
2621 if (b.maxl > maxl)-
2622 maxl = b.maxl;-
2623#endif-
2624 if (b.minl < minl)-
2625 minl = b.minl;-
2626}-
2627-
2628void QRegExpEngine::Box::plus(int atom)-
2629{-
2630#ifndef QT_NO_REGEXP_CAPTURE-
2631 eng->addPlusTransitions(rs, ls, atom);-
2632#else-
2633 Q_UNUSED(atom);-
2634 eng->addCatTransitions(rs, ls);-
2635#endif-
2636 addAnchorsToEngine(*this);-
2637#ifndef QT_NO_REGEXP_OPTIM-
2638 maxl = InftyLen;-
2639#endif-
2640}-
2641-
2642void QRegExpEngine::Box::opt()-
2643{-
2644#ifndef QT_NO_REGEXP_OPTIM-
2645 earlyStart = 0;-
2646 lateStart = 0;-
2647 str = QString();-
2648 leftStr = QString();-
2649 rightStr = QString();-
2650#endif-
2651 skipanchors = 0;-
2652 minl = 0;-
2653}-
2654-
2655void QRegExpEngine::Box::catAnchor(int a)-
2656{-
2657 if (a != 0) {-
2658 for (int i = 0; i < rs.size(); i++) {-
2659 a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);-
2660 ranchors.insert(rs.at(i), a);-
2661 }-
2662 if (minl == 0)-
2663 skipanchors = eng->anchorConcatenation(skipanchors, a);-
2664 }-
2665}-
2666-
2667#ifndef QT_NO_REGEXP_OPTIM-
2668void QRegExpEngine::Box::setupHeuristics()-
2669{-
2670 eng->goodEarlyStart = earlyStart;-
2671 eng->goodLateStart = lateStart;-
2672 eng->goodStr = eng->cs ? str : str.toLower();-
2673-
2674 eng->minl = minl;-
2675 if (eng->cs) {-
2676 /*-
2677 A regular expression such as 112|1 has occ1['2'] = 2 and minl =-
2678 1 at this point. An entry of occ1 has to be at most minl or-
2679 infinity for the rest of the algorithm to go well.-
2680-
2681 We waited until here before normalizing these cases (instead of-
2682 doing it in Box::orx()) because sometimes things improve by-
2683 themselves. Consider for example (112|1)34.-
2684 */-
2685 for (int i = 0; i < NumBadChars; i++) {-
2686 if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)-
2687 occ1[i] = minl;-
2688 }-
2689 eng->occ1 = occ1;-
2690 } else {-
2691 eng->occ1.fill(0, NumBadChars);-
2692 }-
2693-
2694 eng->heuristicallyChooseHeuristic();-
2695}-
2696#endif-
2697-
2698#if defined(QT_DEBUG)-
2699void QRegExpEngine::Box::dump() const-
2700{-
2701 int i;-
2702 qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");-
2703 qDebug(" Left states:");-
2704 for (i = 0; i < ls.size(); i++) {-
2705 if (lanchors.value(ls[i], 0) == 0)-
2706 qDebug(" %d", ls[i]);-
2707 else-
2708 qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);-
2709 }-
2710 qDebug(" Right states:");-
2711 for (i = 0; i < rs.size(); i++) {-
2712 if (ranchors.value(rs[i], 0) == 0)-
2713 qDebug(" %d", rs[i]);-
2714 else-
2715 qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);-
2716 }-
2717 qDebug(" Skip anchors: 0x%.8x", skipanchors);-
2718}-
2719#endif-
2720-
2721void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const-
2722{-
2723 for (int i = 0; i < to.ls.size(); i++) {-
2724 for (int j = 0; j < rs.size(); j++) {-
2725 int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),-
2726 to.lanchors.value(to.ls.at(i), 0));-
2727 eng->addAnchors(rs[j], to.ls[i], a);-
2728 }-
2729 }-
2730}-
2731-
2732#ifndef QT_NO_REGEXP_CCLASS-
2733// fast lookup hash for xml schema extensions-
2734// sorted by name for b-search-
2735static const struct CategoriesRangeMapEntry {-
2736 const char name[40];-
2737 uint first, second;-
2738} categoriesRangeMap[] = {-
2739 { "AegeanNumbers", 0x10100, 0x1013F },-
2740 { "AlphabeticPresentationForms", 0xFB00, 0xFB4F },-
2741 { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F },-
2742 { "AncientGreekNumbers", 0x10140, 0x1018F },-
2743 { "Arabic", 0x0600, 0x06FF },-
2744 { "ArabicPresentationForms-A", 0xFB50, 0xFDFF },-
2745 { "ArabicPresentationForms-B", 0xFE70, 0xFEFF },-
2746 { "ArabicSupplement", 0x0750, 0x077F },-
2747 { "Armenian", 0x0530, 0x058F },-
2748 { "Arrows", 0x2190, 0x21FF },-
2749 { "BasicLatin", 0x0000, 0x007F },-
2750 { "Bengali", 0x0980, 0x09FF },-
2751 { "BlockElements", 0x2580, 0x259F },-
2752 { "Bopomofo", 0x3100, 0x312F },-
2753 { "BopomofoExtended", 0x31A0, 0x31BF },-
2754 { "BoxDrawing", 0x2500, 0x257F },-
2755 { "BraillePatterns", 0x2800, 0x28FF },-
2756 { "Buginese", 0x1A00, 0x1A1F },-
2757 { "Buhid", 0x1740, 0x175F },-
2758 { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF },-
2759 { "CJKCompatibility", 0x3300, 0x33FF },-
2760 { "CJKCompatibilityForms", 0xFE30, 0xFE4F },-
2761 { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF },-
2762 { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },-
2763 { "CJKRadicalsSupplement", 0x2E80, 0x2EFF },-
2764 { "CJKStrokes", 0x31C0, 0x31EF },-
2765 { "CJKSymbolsandPunctuation", 0x3000, 0x303F },-
2766 { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF },-
2767 { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 },-
2768 { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF },-
2769 { "Cherokee", 0x13A0, 0x13FF },-
2770 { "CombiningDiacriticalMarks", 0x0300, 0x036F },-
2771 { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF },-
2772 { "CombiningHalfMarks", 0xFE20, 0xFE2F },-
2773 { "CombiningMarksforSymbols", 0x20D0, 0x20FF },-
2774 { "ControlPictures", 0x2400, 0x243F },-
2775 { "Coptic", 0x2C80, 0x2CFF },-
2776 { "CurrencySymbols", 0x20A0, 0x20CF },-
2777 { "CypriotSyllabary", 0x10800, 0x1083F },-
2778 { "Cyrillic", 0x0400, 0x04FF },-
2779 { "CyrillicSupplement", 0x0500, 0x052F },-
2780 { "Deseret", 0x10400, 0x1044F },-
2781 { "Devanagari", 0x0900, 0x097F },-
2782 { "Dingbats", 0x2700, 0x27BF },-
2783 { "EnclosedAlphanumerics", 0x2460, 0x24FF },-
2784 { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF },-
2785 { "Ethiopic", 0x1200, 0x137F },-
2786 { "EthiopicExtended", 0x2D80, 0x2DDF },-
2787 { "EthiopicSupplement", 0x1380, 0x139F },-
2788 { "GeneralPunctuation", 0x2000, 0x206F },-
2789 { "GeometricShapes", 0x25A0, 0x25FF },-
2790 { "Georgian", 0x10A0, 0x10FF },-
2791 { "GeorgianSupplement", 0x2D00, 0x2D2F },-
2792 { "Glagolitic", 0x2C00, 0x2C5F },-
2793 { "Gothic", 0x10330, 0x1034F },-
2794 { "Greek", 0x0370, 0x03FF },-
2795 { "GreekExtended", 0x1F00, 0x1FFF },-
2796 { "Gujarati", 0x0A80, 0x0AFF },-
2797 { "Gurmukhi", 0x0A00, 0x0A7F },-
2798 { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF },-
2799 { "HangulCompatibilityJamo", 0x3130, 0x318F },-
2800 { "HangulJamo", 0x1100, 0x11FF },-
2801 { "HangulSyllables", 0xAC00, 0xD7A3 },-
2802 { "Hanunoo", 0x1720, 0x173F },-
2803 { "Hebrew", 0x0590, 0x05FF },-
2804 { "Hiragana", 0x3040, 0x309F },-
2805 { "IPAExtensions", 0x0250, 0x02AF },-
2806 { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF },-
2807 { "Kanbun", 0x3190, 0x319F },-
2808 { "KangxiRadicals", 0x2F00, 0x2FDF },-
2809 { "Kannada", 0x0C80, 0x0CFF },-
2810 { "Katakana", 0x30A0, 0x30FF },-
2811 { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF },-
2812 { "Kharoshthi", 0x10A00, 0x10A5F },-
2813 { "Khmer", 0x1780, 0x17FF },-
2814 { "KhmerSymbols", 0x19E0, 0x19FF },-
2815 { "Lao", 0x0E80, 0x0EFF },-
2816 { "Latin-1Supplement", 0x0080, 0x00FF },-
2817 { "LatinExtended-A", 0x0100, 0x017F },-
2818 { "LatinExtended-B", 0x0180, 0x024F },-
2819 { "LatinExtendedAdditional", 0x1E00, 0x1EFF },-
2820 { "LetterlikeSymbols", 0x2100, 0x214F },-
2821 { "Limbu", 0x1900, 0x194F },-
2822 { "LinearBIdeograms", 0x10080, 0x100FF },-
2823 { "LinearBSyllabary", 0x10000, 0x1007F },-
2824 { "Malayalam", 0x0D00, 0x0D7F },-
2825 { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF },-
2826 { "MathematicalOperators", 0x2200, 0x22FF },-
2827 { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF },-
2828 { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF },-
2829 { "MiscellaneousSymbols", 0x2600, 0x26FF },-
2830 { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF },-
2831 { "MiscellaneousTechnical", 0x2300, 0x23FF },-
2832 { "ModifierToneLetters", 0xA700, 0xA71F },-
2833 { "Mongolian", 0x1800, 0x18AF },-
2834 { "MusicalSymbols", 0x1D100, 0x1D1FF },-
2835 { "Myanmar", 0x1000, 0x109F },-
2836 { "NewTaiLue", 0x1980, 0x19DF },-
2837 { "NumberForms", 0x2150, 0x218F },-
2838 { "Ogham", 0x1680, 0x169F },-
2839 { "OldItalic", 0x10300, 0x1032F },-
2840 { "OldPersian", 0x103A0, 0x103DF },-
2841 { "OpticalCharacterRecognition", 0x2440, 0x245F },-
2842 { "Oriya", 0x0B00, 0x0B7F },-
2843 { "Osmanya", 0x10480, 0x104AF },-
2844 { "PhoneticExtensions", 0x1D00, 0x1D7F },-
2845 { "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF },-
2846 { "PrivateUse", 0xE000, 0xF8FF },-
2847 { "Runic", 0x16A0, 0x16FF },-
2848 { "Shavian", 0x10450, 0x1047F },-
2849 { "Sinhala", 0x0D80, 0x0DFF },-
2850 { "SmallFormVariants", 0xFE50, 0xFE6F },-
2851 { "SpacingModifierLetters", 0x02B0, 0x02FF },-
2852 { "Specials", 0xFFF0, 0xFFFF },-
2853 { "SuperscriptsandSubscripts", 0x2070, 0x209F },-
2854 { "SupplementalArrows-A", 0x27F0, 0x27FF },-
2855 { "SupplementalArrows-B", 0x2900, 0x297F },-
2856 { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF },-
2857 { "SupplementalPunctuation", 0x2E00, 0x2E7F },-
2858 { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF },-
2859 { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF },-
2860 { "SylotiNagri", 0xA800, 0xA82F },-
2861 { "Syriac", 0x0700, 0x074F },-
2862 { "Tagalog", 0x1700, 0x171F },-
2863 { "Tagbanwa", 0x1760, 0x177F },-
2864 { "Tags", 0xE0000, 0xE007F },-
2865 { "TaiLe", 0x1950, 0x197F },-
2866 { "TaiXuanJingSymbols", 0x1D300, 0x1D35F },-
2867 { "Tamil", 0x0B80, 0x0BFF },-
2868 { "Telugu", 0x0C00, 0x0C7F },-
2869 { "Thaana", 0x0780, 0x07BF },-
2870 { "Thai", 0x0E00, 0x0E7F },-
2871 { "Tibetan", 0x0F00, 0x0FFF },-
2872 { "Tifinagh", 0x2D30, 0x2D7F },-
2873 { "Ugaritic", 0x10380, 0x1039F },-
2874 { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F },-
2875 { "VariationSelectors", 0xFE00, 0xFE0F },-
2876 { "VariationSelectorsSupplement", 0xE0100, 0xE01EF },-
2877 { "VerticalForms", 0xFE10, 0xFE1F },-
2878 { "YiRadicals", 0xA490, 0xA4CF },-
2879 { "YiSyllables", 0xA000, 0xA48F },-
2880 { "YijingHexagramSymbols", 0x4DC0, 0x4DFF }-
2881};-
2882-
2883inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)-
2884{ return qstrcmp(entry1.name, entry2.name) < 0; }-
2885inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)-
2886{ return qstrcmp(name, entry.name) < 0; }-
2887inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)-
2888{ return qstrcmp(entry.name, name) < 0; }-
2889#endif // QT_NO_REGEXP_CCLASS-
2890-
2891int QRegExpEngine::getChar()-
2892{-
2893 return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();-
2894}-
2895-
2896int QRegExpEngine::getEscape()-
2897{-
2898#ifndef QT_NO_REGEXP_ESCAPE-
2899 const char tab[] = "afnrtv"; // no b, as \b means word boundary-
2900 const char backTab[] = "\a\f\n\r\t\v";-
2901 ushort low;-
2902 int i;-
2903#endif-
2904 ushort val;-
2905 int prevCh = yyCh;-
2906-
2907 if (prevCh == EOS) {-
2908 error(RXERR_END);-
2909 return Tok_Char | '\\';-
2910 }-
2911 yyCh = getChar();-
2912#ifndef QT_NO_REGEXP_ESCAPE-
2913 if ((prevCh & ~0xff) == 0) {-
2914 const char *p = strchr(tab, prevCh);-
2915 if (p != 0)-
2916 return Tok_Char | backTab[p - tab];-
2917 }-
2918#endif-
2919-
2920 switch (prevCh) {-
2921#ifndef QT_NO_REGEXP_ESCAPE-
2922 case '0':-
2923 val = 0;-
2924 for (i = 0; i < 3; i++) {-
2925 if (yyCh >= '0' && yyCh <= '7')-
2926 val = (val << 3) | (yyCh - '0');-
2927 else-
2928 break;-
2929 yyCh = getChar();-
2930 }-
2931 if ((val & ~0377) != 0)-
2932 error(RXERR_OCTAL);-
2933 return Tok_Char | val;-
2934#endif-
2935#ifndef QT_NO_REGEXP_ESCAPE-
2936 case 'B':-
2937 return Tok_NonWord;-
2938#endif-
2939#ifndef QT_NO_REGEXP_CCLASS-
2940 case 'D':-
2941 // see QChar::isDigit()-
2942 yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));-
2943 return Tok_CharClass;-
2944 case 'S':-
2945 // see QChar::isSpace()-
2946 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |-
2947 FLAG(QChar::Separator_Line) |-
2948 FLAG(QChar::Separator_Paragraph) |-
2949 FLAG(QChar::Other_Control)));-
2950 yyCharClass->addRange(0x0000, 0x0008);-
2951 yyCharClass->addRange(0x000e, 0x001f);-
2952 yyCharClass->addRange(0x007f, 0x0084);-
2953 yyCharClass->addRange(0x0086, 0x009f);-
2954 return Tok_CharClass;-
2955 case 'W':-
2956 // see QChar::isLetterOrNumber() and QChar::isMark()-
2957 yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |-
2958 FLAG(QChar::Mark_SpacingCombining) |-
2959 FLAG(QChar::Mark_Enclosing) |-
2960 FLAG(QChar::Number_DecimalDigit) |-
2961 FLAG(QChar::Number_Letter) |-
2962 FLAG(QChar::Number_Other) |-
2963 FLAG(QChar::Letter_Uppercase) |-
2964 FLAG(QChar::Letter_Lowercase) |-
2965 FLAG(QChar::Letter_Titlecase) |-
2966 FLAG(QChar::Letter_Modifier) |-
2967 FLAG(QChar::Letter_Other) |-
2968 FLAG(QChar::Punctuation_Connector)));-
2969 yyCharClass->addRange(0x203f, 0x2040);-
2970 yyCharClass->addSingleton(0x2040);-
2971 yyCharClass->addSingleton(0x2054);-
2972 yyCharClass->addSingleton(0x30fb);-
2973 yyCharClass->addRange(0xfe33, 0xfe34);-
2974 yyCharClass->addRange(0xfe4d, 0xfe4f);-
2975 yyCharClass->addSingleton(0xff3f);-
2976 yyCharClass->addSingleton(0xff65);-
2977 return Tok_CharClass;-
2978#endif-
2979#ifndef QT_NO_REGEXP_ESCAPE-
2980 case 'b':-
2981 return Tok_Word;-
2982#endif-
2983#ifndef QT_NO_REGEXP_CCLASS-
2984 case 'd':-
2985 // see QChar::isDigit()-
2986 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));-
2987 return Tok_CharClass;-
2988 case 's':-
2989 // see QChar::isSpace()-
2990 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |-
2991 FLAG(QChar::Separator_Line) |-
2992 FLAG(QChar::Separator_Paragraph));-
2993 yyCharClass->addRange(0x0009, 0x000d);-
2994 yyCharClass->addSingleton(0x0085);-
2995 return Tok_CharClass;-
2996 case 'w':-
2997 // see QChar::isLetterOrNumber() and QChar::isMark()-
2998 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |-
2999 FLAG(QChar::Mark_SpacingCombining) |-
3000 FLAG(QChar::Mark_Enclosing) |-
3001 FLAG(QChar::Number_DecimalDigit) |-
3002 FLAG(QChar::Number_Letter) |-
3003 FLAG(QChar::Number_Other) |-
3004 FLAG(QChar::Letter_Uppercase) |-
3005 FLAG(QChar::Letter_Lowercase) |-
3006 FLAG(QChar::Letter_Titlecase) |-
3007 FLAG(QChar::Letter_Modifier) |-
3008 FLAG(QChar::Letter_Other));-
3009 yyCharClass->addSingleton(0x005f); // '_'-
3010 return Tok_CharClass;-
3011 case 'I':-
3012 if (xmlSchemaExtensions) {-
3013 yyCharClass->setNegative(!yyCharClass->negative());-
3014 // fall through-
3015 } else {-
3016 break;-
3017 }-
3018 case 'i':-
3019 if (xmlSchemaExtensions) {-
3020 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |-
3021 FLAG(QChar::Mark_SpacingCombining) |-
3022 FLAG(QChar::Mark_Enclosing) |-
3023 FLAG(QChar::Number_DecimalDigit) |-
3024 FLAG(QChar::Number_Letter) |-
3025 FLAG(QChar::Number_Other) |-
3026 FLAG(QChar::Letter_Uppercase) |-
3027 FLAG(QChar::Letter_Lowercase) |-
3028 FLAG(QChar::Letter_Titlecase) |-
3029 FLAG(QChar::Letter_Modifier) |-
3030 FLAG(QChar::Letter_Other));-
3031 yyCharClass->addSingleton(0x003a); // ':'-
3032 yyCharClass->addSingleton(0x005f); // '_'-
3033 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]-
3034 yyCharClass->addRange(0x0061, 0x007a); // [a-z]-
3035 yyCharClass->addRange(0xc0, 0xd6);-
3036 yyCharClass->addRange(0xd8, 0xf6);-
3037 yyCharClass->addRange(0xf8, 0x2ff);-
3038 yyCharClass->addRange(0x370, 0x37d);-
3039 yyCharClass->addRange(0x37f, 0x1fff);-
3040 yyCharClass->addRange(0x200c, 0x200d);-
3041 yyCharClass->addRange(0x2070, 0x218f);-
3042 yyCharClass->addRange(0x2c00, 0x2fef);-
3043 yyCharClass->addRange(0x3001, 0xd7ff);-
3044 yyCharClass->addRange(0xf900, 0xfdcf);-
3045 yyCharClass->addRange(0xfdf0, 0xfffd);-
3046 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);-
3047 return Tok_CharClass;-
3048 } else {-
3049 break;-
3050 }-
3051 case 'C':-
3052 if (xmlSchemaExtensions) {-
3053 yyCharClass->setNegative(!yyCharClass->negative());-
3054 // fall through-
3055 } else {-
3056 break;-
3057 }-
3058 case 'c':-
3059 if (xmlSchemaExtensions) {-
3060 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |-
3061 FLAG(QChar::Mark_SpacingCombining) |-
3062 FLAG(QChar::Mark_Enclosing) |-
3063 FLAG(QChar::Number_DecimalDigit) |-
3064 FLAG(QChar::Number_Letter) |-
3065 FLAG(QChar::Number_Other) |-
3066 FLAG(QChar::Letter_Uppercase) |-
3067 FLAG(QChar::Letter_Lowercase) |-
3068 FLAG(QChar::Letter_Titlecase) |-
3069 FLAG(QChar::Letter_Modifier) |-
3070 FLAG(QChar::Letter_Other));-
3071 yyCharClass->addSingleton(0x002d); // '-'-
3072 yyCharClass->addSingleton(0x002e); // '.'-
3073 yyCharClass->addSingleton(0x003a); // ':'-
3074 yyCharClass->addSingleton(0x005f); // '_'-
3075 yyCharClass->addSingleton(0xb7);-
3076 yyCharClass->addRange(0x0030, 0x0039); // [0-9]-
3077 yyCharClass->addRange(0x0041, 0x005a); // [A-Z]-
3078 yyCharClass->addRange(0x0061, 0x007a); // [a-z]-
3079 yyCharClass->addRange(0xc0, 0xd6);-
3080 yyCharClass->addRange(0xd8, 0xf6);-
3081 yyCharClass->addRange(0xf8, 0x2ff);-
3082 yyCharClass->addRange(0x370, 0x37d);-
3083 yyCharClass->addRange(0x37f, 0x1fff);-
3084 yyCharClass->addRange(0x200c, 0x200d);-
3085 yyCharClass->addRange(0x2070, 0x218f);-
3086 yyCharClass->addRange(0x2c00, 0x2fef);-
3087 yyCharClass->addRange(0x3001, 0xd7ff);-
3088 yyCharClass->addRange(0xf900, 0xfdcf);-
3089 yyCharClass->addRange(0xfdf0, 0xfffd);-
3090 yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);-
3091 yyCharClass->addRange(0x0300, 0x036f);-
3092 yyCharClass->addRange(0x203f, 0x2040);-
3093 return Tok_CharClass;-
3094 } else {-
3095 break;-
3096 }-
3097 case 'P':-
3098 if (xmlSchemaExtensions) {-
3099 yyCharClass->setNegative(!yyCharClass->negative());-
3100 // fall through-
3101 } else {-
3102 break;-
3103 }-
3104 case 'p':-
3105 if (xmlSchemaExtensions) {-
3106 if (yyCh != '{') {-
3107 error(RXERR_CHARCLASS);-
3108 return Tok_CharClass;-
3109 }-
3110-
3111 QByteArray category;-
3112 yyCh = getChar();-
3113 while (yyCh != '}') {-
3114 if (yyCh == EOS) {-
3115 error(RXERR_END);-
3116 return Tok_CharClass;-
3117 }-
3118 category.append(yyCh);-
3119 yyCh = getChar();-
3120 }-
3121 yyCh = getChar(); // skip closing '}'-
3122-
3123 int catlen = category.length();-
3124 if (catlen == 1 || catlen == 2) {-
3125 switch (category.at(0)) {-
3126 case 'M':-
3127 if (catlen == 1) {-
3128 yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |-
3129 FLAG(QChar::Mark_SpacingCombining) |-
3130 FLAG(QChar::Mark_Enclosing));-
3131 } else {-
3132 switch (category.at(1)) {-
3133 case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn-
3134 case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc-
3135 case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me-
3136 default: error(RXERR_CATEGORY); break;-
3137 }-
3138 }-
3139 break;-
3140 case 'N':-
3141 if (catlen == 1) {-
3142 yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |-
3143 FLAG(QChar::Number_Letter) |-
3144 FLAG(QChar::Number_Other));-
3145 } else {-
3146 switch (category.at(1)) {-
3147 case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd-
3148 case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl-
3149 case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No-
3150 default: error(RXERR_CATEGORY); break;-
3151 }-
3152 }-
3153 break;-
3154 case 'Z':-
3155 if (catlen == 1) {-
3156 yyCharClass->addCategories(FLAG(QChar::Separator_Space) |-
3157 FLAG(QChar::Separator_Line) |-
3158 FLAG(QChar::Separator_Paragraph));-
3159 } else {-
3160 switch (category.at(1)) {-
3161 case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs-
3162 case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl-
3163 case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp-
3164 default: error(RXERR_CATEGORY); break;-
3165 }-
3166 }-
3167 break;-
3168 case 'C':-
3169 if (catlen == 1) {-
3170 yyCharClass->addCategories(FLAG(QChar::Other_Control) |-
3171 FLAG(QChar::Other_Format) |-
3172 FLAG(QChar::Other_Surrogate) |-
3173 FLAG(QChar::Other_PrivateUse) |-
3174 FLAG(QChar::Other_NotAssigned));-
3175 } else {-
3176 switch (category.at(1)) {-
3177 case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc-
3178 case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf-
3179 case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs-
3180 case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co-
3181 case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn-
3182 default: error(RXERR_CATEGORY); break;-
3183 }-
3184 }-
3185 break;-
3186 case 'L':-
3187 if (catlen == 1) {-
3188 yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |-
3189 FLAG(QChar::Letter_Lowercase) |-
3190 FLAG(QChar::Letter_Titlecase) |-
3191 FLAG(QChar::Letter_Modifier) |-
3192 FLAG(QChar::Letter_Other));-
3193 } else {-
3194 switch (category.at(1)) {-
3195 case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu-
3196 case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll-
3197 case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt-
3198 case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm-
3199 case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo-
3200 default: error(RXERR_CATEGORY); break;-
3201 }-
3202 }-
3203 break;-
3204 case 'P':-
3205 if (catlen == 1) {-
3206 yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |-
3207 FLAG(QChar::Punctuation_Dash) |-
3208 FLAG(QChar::Punctuation_Open) |-
3209 FLAG(QChar::Punctuation_Close) |-
3210 FLAG(QChar::Punctuation_InitialQuote) |-
3211 FLAG(QChar::Punctuation_FinalQuote) |-
3212 FLAG(QChar::Punctuation_Other));-
3213 } else {-
3214 switch (category.at(1)) {-
3215 case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc-
3216 case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd-
3217 case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps-
3218 case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe-
3219 case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi-
3220 case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf-
3221 case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po-
3222 default: error(RXERR_CATEGORY); break;-
3223 }-
3224 }-
3225 break;-
3226 case 'S':-
3227 if (catlen == 1) {-
3228 yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |-
3229 FLAG(QChar::Symbol_Currency) |-
3230 FLAG(QChar::Symbol_Modifier) |-
3231 FLAG(QChar::Symbol_Other));-
3232 } else {-
3233 switch (category.at(1)) {-
3234 case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm-
3235 case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc-
3236 case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk-
3237 case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So-
3238 default: error(RXERR_CATEGORY); break;-
3239 }-
3240 }-
3241 break;-
3242 default:-
3243 error(RXERR_CATEGORY);-
3244 break;-
3245 }-
3246 } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {-
3247 static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);-
3248 const char * const categoryFamily = category.constData() + 2;-
3249 const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily);-
3250 if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0)-
3251 yyCharClass->addRange(r->first, r->second);-
3252 else-
3253 error(RXERR_CATEGORY);-
3254 } else {-
3255 error(RXERR_CATEGORY);-
3256 }-
3257 return Tok_CharClass;-
3258 } else {-
3259 break;-
3260 }-
3261#endif-
3262#ifndef QT_NO_REGEXP_ESCAPE-
3263 case 'x':-
3264 val = 0;-
3265 for (i = 0; i < 4; i++) {-
3266 low = QChar(yyCh).toLower().unicode();-
3267 if (low >= '0' && low <= '9')-
3268 val = (val << 4) | (low - '0');-
3269 else if (low >= 'a' && low <= 'f')-
3270 val = (val << 4) | (low - 'a' + 10);-
3271 else-
3272 break;-
3273 yyCh = getChar();-
3274 }-
3275 return Tok_Char | val;-
3276#endif-
3277 default:-
3278 break;-
3279 }-
3280 if (prevCh >= '1' && prevCh <= '9') {-
3281#ifndef QT_NO_REGEXP_BACKREF-
3282 val = prevCh - '0';-
3283 while (yyCh >= '0' && yyCh <= '9') {-
3284 val = (val * 10) + (yyCh - '0');-
3285 yyCh = getChar();-
3286 }-
3287 return Tok_BackRef | val;-
3288#else-
3289 error(RXERR_DISABLED);-
3290#endif-
3291 }-
3292 return Tok_Char | prevCh;-
3293}-
3294-
3295#ifndef QT_NO_REGEXP_INTERVAL-
3296int QRegExpEngine::getRep(int def)-
3297{-
3298 if (yyCh >= '0' && yyCh <= '9') {-
3299 int rep = 0;-
3300 do {-
3301 rep = 10 * rep + yyCh - '0';-
3302 if (rep >= InftyRep) {-
3303 error(RXERR_REPETITION);-
3304 rep = def;-
3305 }-
3306 yyCh = getChar();-
3307 } while (yyCh >= '0' && yyCh <= '9');-
3308 return rep;-
3309 } else {-
3310 return def;-
3311 }-
3312}-
3313#endif-
3314-
3315#ifndef QT_NO_REGEXP_LOOKAHEAD-
3316void QRegExpEngine::skipChars(int n)-
3317{-
3318 if (n > 0) {-
3319 yyPos += n - 1;-
3320 yyCh = getChar();-
3321 }-
3322}-
3323#endif-
3324-
3325void QRegExpEngine::error(const char *msg)-
3326{-
3327 if (yyError.isEmpty())-
3328 yyError = QLatin1String(msg);-
3329}-
3330-
3331void QRegExpEngine::startTokenizer(const QChar *rx, int len)-
3332{-
3333 yyIn = rx;-
3334 yyPos0 = 0;-
3335 yyPos = 0;-
3336 yyLen = len;-
3337 yyCh = getChar();-
3338 yyCharClass.reset(new QRegExpCharClass);-
3339 yyMinRep = 0;-
3340 yyMaxRep = 0;-
3341 yyError = QString();-
3342}-
3343-
3344int QRegExpEngine::getToken()-
3345{-
3346#ifndef QT_NO_REGEXP_CCLASS-
3347 ushort pendingCh = 0;-
3348 bool charPending;-
3349 bool rangePending;-
3350 int tok;-
3351#endif-
3352 int prevCh = yyCh;-
3353-
3354 yyPos0 = yyPos - 1;-
3355#ifndef QT_NO_REGEXP_CCLASS-
3356 yyCharClass->clear();-
3357#endif-
3358 yyMinRep = 0;-
3359 yyMaxRep = 0;-
3360 yyCh = getChar();-
3361-
3362 switch (prevCh) {-
3363 case EOS:-
3364 yyPos0 = yyPos;-
3365 return Tok_Eos;-
3366 case '$':-
3367 return Tok_Dollar;-
3368 case '(':-
3369 if (yyCh == '?') {-
3370 prevCh = getChar();-
3371 yyCh = getChar();-
3372 switch (prevCh) {-
3373#ifndef QT_NO_REGEXP_LOOKAHEAD-
3374 case '!':-
3375 return Tok_NegLookahead;-
3376 case '=':-
3377 return Tok_PosLookahead;-
3378#endif-
3379 case ':':-
3380 return Tok_MagicLeftParen;-
3381 case '<':-
3382 error(RXERR_LOOKBEHIND);-
3383 return Tok_MagicLeftParen;-
3384 default:-
3385 error(RXERR_LOOKAHEAD);-
3386 return Tok_MagicLeftParen;-
3387 }-
3388 } else {-
3389 return Tok_LeftParen;-
3390 }-
3391 case ')':-
3392 return Tok_RightParen;-
3393 case '*':-
3394 yyMinRep = 0;-
3395 yyMaxRep = InftyRep;-
3396 return Tok_Quantifier;-
3397 case '+':-
3398 yyMinRep = 1;-
3399 yyMaxRep = InftyRep;-
3400 return Tok_Quantifier;-
3401 case '.':-
3402#ifndef QT_NO_REGEXP_CCLASS-
3403 yyCharClass->setNegative(true);-
3404#endif-
3405 return Tok_CharClass;-
3406 case '?':-
3407 yyMinRep = 0;-
3408 yyMaxRep = 1;-
3409 return Tok_Quantifier;-
3410 case '[':-
3411#ifndef QT_NO_REGEXP_CCLASS-
3412 if (yyCh == '^') {-
3413 yyCharClass->setNegative(true);-
3414 yyCh = getChar();-
3415 }-
3416 charPending = false;-
3417 rangePending = false;-
3418 do {-
3419 if (yyCh == '-' && charPending && !rangePending) {-
3420 rangePending = true;-
3421 yyCh = getChar();-
3422 } else {-
3423 if (charPending && !rangePending) {-
3424 yyCharClass->addSingleton(pendingCh);-
3425 charPending = false;-
3426 }-
3427 if (yyCh == '\\') {-
3428 yyCh = getChar();-
3429 tok = getEscape();-
3430 if (tok == Tok_Word)-
3431 tok = '\b';-
3432 } else {-
3433 tok = Tok_Char | yyCh;-
3434 yyCh = getChar();-
3435 }-
3436 if (tok == Tok_CharClass) {-
3437 if (rangePending) {-
3438 yyCharClass->addSingleton('-');-
3439 yyCharClass->addSingleton(pendingCh);-
3440 charPending = false;-
3441 rangePending = false;-
3442 }-
3443 } else if ((tok & Tok_Char) != 0) {-
3444 if (rangePending) {-
3445 yyCharClass->addRange(pendingCh, tok ^ Tok_Char);-
3446 charPending = false;-
3447 rangePending = false;-
3448 } else {-
3449 pendingCh = tok ^ Tok_Char;-
3450 charPending = true;-
3451 }-
3452 } else {-
3453 error(RXERR_CHARCLASS);-
3454 }-
3455 }-
3456 } while (yyCh != ']' && yyCh != EOS);-
3457 if (rangePending)-
3458 yyCharClass->addSingleton('-');-
3459 if (charPending)-
3460 yyCharClass->addSingleton(pendingCh);-
3461 if (yyCh == EOS)-
3462 error(RXERR_END);-
3463 else-
3464 yyCh = getChar();-
3465 return Tok_CharClass;-
3466#else-
3467 error(RXERR_END);-
3468 return Tok_Char | '[';-
3469#endif-
3470 case '\\':-
3471 return getEscape();-
3472 case ']':-
3473 error(RXERR_LEFTDELIM);-
3474 return Tok_Char | ']';-
3475 case '^':-
3476 return Tok_Caret;-
3477 case '{':-
3478#ifndef QT_NO_REGEXP_INTERVAL-
3479 yyMinRep = getRep(0);-
3480 yyMaxRep = yyMinRep;-
3481 if (yyCh == ',') {-
3482 yyCh = getChar();-
3483 yyMaxRep = getRep(InftyRep);-
3484 }-
3485 if (yyMaxRep < yyMinRep)-
3486 error(RXERR_INTERVAL);-
3487 if (yyCh != '}')-
3488 error(RXERR_REPETITION);-
3489 yyCh = getChar();-
3490 return Tok_Quantifier;-
3491#else-
3492 error(RXERR_DISABLED);-
3493 return Tok_Char | '{';-
3494#endif-
3495 case '|':-
3496 return Tok_Bar;-
3497 case '}':-
3498 error(RXERR_LEFTDELIM);-
3499 return Tok_Char | '}';-
3500 default:-
3501 return Tok_Char | prevCh;-
3502 }-
3503}-
3504-
// Parses the len code units at pattern and builds the automaton for them.
// Returns -1 when an error was recorded in yyError; otherwise returns
// yyPos0, the start position of the token at which parsing stopped
// (parseAtom() uses this value as the number of characters consumed by a
// lookahead sub-pattern).
3505int QRegExpEngine::parse(const QChar *pattern, int len)-
3506{-
3507 valid = true;-
3508 startTokenizer(pattern, len);-
3509 yyTok = getToken();-
3510#ifndef QT_NO_REGEXP_CAPTURE-
3511 yyMayCapture = true;-
3512#else-
3513 yyMayCapture = false;-
3514#endif-
3515-
3516#ifndef QT_NO_REGEXP_CAPTURE-
 // the whole pattern is wrapped in one atom, opened here and closed by the
 // finishAtom() call below
3517 int atom = startAtom(false);-
3518#endif-
3519 QRegExpCharClass anything;-
3520 Box box(this); // create InitialState-
3521 box.set(anything);-
3522 Box rightBox(this); // create FinalState-
3523 rightBox.set(anything);-
3524-
3525 Box middleBox(this);-
3526 parseExpression(&middleBox);-
3527#ifndef QT_NO_REGEXP_CAPTURE-
3528 finishAtom(atom, false);-
3529#endif-
3530#ifndef QT_NO_REGEXP_OPTIM-
3531 middleBox.setupHeuristics();-
3532#endif-
 // initial state, then the parsed expression, then the final state
3533 box.cat(middleBox);-
3534 box.cat(rightBox);-
 // the tokenizer's scratch character class is no longer needed
3535 yyCharClass.reset(0);-
3536-
3537#ifndef QT_NO_REGEXP_CAPTURE-
 // Replace the capture kind stored in each atom by its capture index.
 // Official captures are additionally recorded, in order, in
 // captureForOfficialCapture; unofficial ones only get an index when
 // greedyQuantifiers is set.
3538 for (int i = 0; i < nf; ++i) {-
3539 switch (f[i].capture) {-
3540 case QRegExpAtom::NoCapture:-
3541 break;-
3542 case QRegExpAtom::OfficialCapture:-
3543 f[i].capture = ncap;-
3544 captureForOfficialCapture.append(ncap);-
3545 ++ncap;-
3546 ++officialncap;-
3547 break;-
3548 case QRegExpAtom::UnofficialCapture:-
3549 f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;-
3550 }-
3551 }-
3552-
3553#ifndef QT_NO_REGEXP_BACKREF-
3554#ifndef QT_NO_REGEXP_OPTIM-
 // without official captures and back-references the atom table is
 // dropped altogether
3555 if (officialncap == 0 && nbrefs == 0) {-
3556 ncap = nf = 0;-
3557 f.clear();-
3558 }-
3559#endif-
3560 // handle the case where there's a \5 with no corresponding capture-
3561 // (captureForOfficialCapture.size() != officialncap)-
3562 for (int i = 0; i < nbrefs - officialncap; ++i) {-
3563 captureForOfficialCapture.append(ncap);-
3564 ++ncap;-
3565 }-
3566#endif-
3567#endif-
3568-
 // errors are only checked here, after the capture bookkeeping above
3569 if (!yyError.isEmpty())-
3570 return -1;-
3571-
3572#ifndef QT_NO_REGEXP_OPTIM-
 // caretAnchored ends up true only if the initial state has anchors and
 // every one of them contains Anchor_Caret and (when alternation anchors
 // are compiled in) no Anchor_Alternation
3573 const QRegExpAutomatonState &sinit = s.at(InitialState);-
3574 caretAnchored = !sinit.anchors.isEmpty();-
3575 if (caretAnchored) {-
3576 const QMap<int, int> &anchors = sinit.anchors;-
3577 QMap<int, int>::const_iterator a;-
3578 for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {-
3579 if (-
3580#ifndef QT_NO_REGEXP_ANCHOR_ALT-
3581 (*a & Anchor_Alternation) != 0 ||-
3582#endif-
3583 (*a & Anchor_Caret) == 0)-
3584 {-
3585 caretAnchored = false;-
3586 break;-
3587 }-
3588 }-
3589 }-
3590#endif-
3591-
3592 // cleanup anchors-
 // (entries whose anchor value is 0 are removed from every state)
3593 int numStates = s.count();-
3594 for (int i = 0; i < numStates; ++i) {-
3595 QRegExpAutomatonState &state = s[i];-
3596 if (!state.anchors.isEmpty()) {-
3597 QMap<int, int>::iterator a = state.anchors.begin();-
3598 while (a != state.anchors.end()) {-
3599 if (a.value() == 0)-
3600 a = state.anchors.erase(a);-
3601 else-
3602 ++a;-
3603 }-
3604 }-
3605 }-
3606-
3607 return yyPos0;-
3608}-
3609-
// Parses the atom that starts with the current token yyTok into box, then
// reads the next token into yyTok. An atom is a single character, an
// anchor, a lookahead, a parenthesized expression, a character class or a
// back-reference. A quantifier in atom position is an error.
3610void QRegExpEngine::parseAtom(Box *box)-
3611{-
3612#ifndef QT_NO_REGEXP_LOOKAHEAD-
3613 QRegExpEngine *eng = 0;-
3614 bool neg;-
3615 int len;-
3616#endif-
3617-
3618 if ((yyTok & Tok_Char) != 0) {-
 // plain character: strip the Tok_Char flag to get its code
3619 box->set(QChar(yyTok ^ Tok_Char));-
3620 } else {-
3621#ifndef QT_NO_REGEXP_OPTIM-
 // anything other than a plain character makes the pattern non-trivial
3622 trivial = false;-
3623#endif-
3624 switch (yyTok) {-
3625 case Tok_Dollar:-
3626 box->catAnchor(Anchor_Dollar);-
3627 break;-
3628 case Tok_Caret:-
3629 box->catAnchor(Anchor_Caret);-
3630 break;-
3631#ifndef QT_NO_REGEXP_LOOKAHEAD-
3632 case Tok_PosLookahead:-
3633 case Tok_NegLookahead:-
 // The lookahead body is parsed by a separate engine, starting at the
 // lookahead character yyCh (index yyPos - 1). Its parse() result is
 // the number of characters to skip in this engine's input.
 // NOTE(review): presumably addLookahead() takes ownership of eng - confirm.
3634 neg = (yyTok == Tok_NegLookahead);-
3635 eng = new QRegExpEngine(cs, greedyQuantifiers);-
3636 len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);-
3637 if (len >= 0)-
3638 skipChars(len);-
3639 else-
3640 error(RXERR_LOOKAHEAD);-
3641 box->catAnchor(addLookahead(eng, neg));-
 // the token after the lookahead body must be the closing parenthesis
3642 yyTok = getToken();-
3643 if (yyTok != Tok_RightParen)-
3644 error(RXERR_LOOKAHEAD);-
3645 break;-
3646#endif-
3647#ifndef QT_NO_REGEXP_ESCAPE-
3648 case Tok_Word:-
3649 box->catAnchor(Anchor_Word);-
3650 break;-
3651 case Tok_NonWord:-
3652 box->catAnchor(Anchor_NonWord);-
3653 break;-
3654#endif-
3655 case Tok_LeftParen:-
3656 case Tok_MagicLeftParen:-
 // group: parse the enclosed expression, which must end at ')'
3657 yyTok = getToken();-
3658 parseExpression(box);-
3659 if (yyTok != Tok_RightParen)-
3660 error(RXERR_END);-
3661 break;-
3662 case Tok_CharClass:-
3663 box->set(*yyCharClass);-
3664 break;-
3665 case Tok_Quantifier:-
3666 error(RXERR_REPETITION);-
3667 break;-
3668 default:-
3669#ifndef QT_NO_REGEXP_BACKREF-
 // back-reference: strip the Tok_BackRef flag to get its number
3670 if ((yyTok & Tok_BackRef) != 0)-
3671 box->set(yyTok ^ Tok_BackRef);-
3672 else-
3673#endif-
3674 error(RXERR_DISABLED);-
3675 }-
3676 }-
3677 yyTok = getToken();-
3678}-
3679-
// Parses one factor - an atom optionally followed by a quantifier - into
// box. Bounded repetitions are built by rewinding the tokenizer and
// parsing the atom again for every additional copy.
3680void QRegExpEngine::parseFactor(Box *box)-
3681{-
3682#ifndef QT_NO_REGEXP_CAPTURE-
 // the outer atom exists only with greedy quantifiers; the inner atom is
 // a capture candidate only for a plain '(' while capturing is allowed
3683 int outerAtom = greedyQuantifiers ? startAtom(false) : -1;-
3684 int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);-
3685 bool magicLeftParen = (yyTok == Tok_MagicLeftParen);-
3686#else-
3687 const int innerAtom = -1;-
3688#endif-
3689-
3690#ifndef QT_NO_REGEXP_INTERVAL-
 // YYREDO() restores the tokenizer state saved in the locals below, so
 // that the next parseAtom() call parses the same atom again.
3691#define YYREDO() \-
3692 yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \-
3693 *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok-
3694-
3695 const QChar *in = yyIn;-
3696 int pos0 = yyPos0;-
3697 int pos = yyPos;-
3698 int len = yyLen;-
3699 int ch = yyCh;-
3700 QRegExpCharClass charClass;-
3701 if (yyTok == Tok_CharClass)-
3702 charClass = *yyCharClass;-
3703 int tok = yyTok;-
3704 bool mayCapture = yyMayCapture;-
3705#endif-
3706-
3707 parseAtom(box);-
3708#ifndef QT_NO_REGEXP_CAPTURE-
3709 finishAtom(innerAtom, magicLeftParen);-
3710#endif-
3711-
3712 bool hasQuantifier = (yyTok == Tok_Quantifier);-
3713 if (hasQuantifier) {-
3714#ifndef QT_NO_REGEXP_OPTIM-
3715 trivial = false;-
3716#endif-
 // unbounded maximum: loop on the atom; maximum of 0: drop the atom
3717 if (yyMaxRep == InftyRep) {-
3718 box->plus(innerAtom);-
3719#ifndef QT_NO_REGEXP_INTERVAL-
3720 } else if (yyMaxRep == 0) {-
3721 box->clear();-
3722#endif-
3723 }-
3724 if (yyMinRep == 0)-
3725 box->opt();-
3726-
3727#ifndef QT_NO_REGEXP_INTERVAL-
 // The copy already in box counts as one occurrence. alpha is the number
 // of additional mandatory copies, beta the number of additional
 // optional copies. Capturing is switched off while the copies are
 // parsed and restored afterwards.
3728 yyMayCapture = false;-
3729 int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;-
3730 int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);-
3731-
3732 Box rightBox(this);-
3733 int i;-
3734-
 // optional copies nest: each new copy is followed by the chain built so
 // far, and the result is made optional as a whole
3735 for (i = 0; i < beta; i++) {-
3736 YYREDO();-
3737 Box leftBox(this);-
3738 parseAtom(&leftBox);-
3739 leftBox.cat(rightBox);-
3740 leftBox.opt();-
3741 rightBox = leftBox;-
3742 }-
 // mandatory copies are put in front of the optional chain
3743 for (i = 0; i < alpha; i++) {-
3744 YYREDO();-
3745 Box leftBox(this);-
3746 parseAtom(&leftBox);-
3747 leftBox.cat(rightBox);-
3748 rightBox = leftBox;-
3749 }-
 // the originally parsed box comes last
3750 rightBox.cat(*box);-
3751 *box = rightBox;-
3752#endif-
 // step over the quantifier token
3753 yyTok = getToken();-
3754#ifndef QT_NO_REGEXP_INTERVAL-
3755 yyMayCapture = mayCapture;-
3756#endif-
3757 }-
3758#undef YYREDO-
3759#ifndef QT_NO_REGEXP_CAPTURE-
3760 if (greedyQuantifiers)-
3761 finishAtom(outerAtom, hasQuantifier);-
3762#endif-
3763}-
3764-
3765void QRegExpEngine::parseTerm(Box *box)-
3766{-
3767#ifndef QT_NO_REGEXP_OPTIM-
3768 if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar)-
3769 parseFactor(box);-
3770#endif-
3771 while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) {-
3772 Box rightBox(this);-
3773 parseFactor(&rightBox);-
3774 box->cat(rightBox);-
3775 }-
3776}-
3777-
3778void QRegExpEngine::parseExpression(Box *box)-
3779{-
3780 parseTerm(box);-
3781 while (yyTok == Tok_Bar) {-
3782#ifndef QT_NO_REGEXP_OPTIM-
3783 trivial = false;-
3784#endif-
3785 Box rightBox(this);-
3786 yyTok = getToken();-
3787 parseTerm(&rightBox);-
3788 box->orx(rightBox);-
3789 }-
3790}-
3791-
3792/*-
3793 The struct QRegExpPrivate contains the private data of a regular-
3794 expression other than the automaton. It makes it possible for many-
3795 QRegExp objects to use the same QRegExpEngine object with different-
3796 QRegExpPrivate objects.-
3797*/-
// Per-QRegExp private data; both constructors start without an engine.
3798struct QRegExpPrivate-
3799{-
 // engine in use; 0 until prepareEngine_helper() obtains one, and reset to
 // 0 by invalidateEngine()
3800 QRegExpEngine *eng;-
 // key used to take the engine from the cache or to construct a new one
3801 QRegExpEngineKey engineKey;-
 // NOTE(review): presumably the minimal (non-greedy) matching flag -
 // its use is not visible in this part of the file; starts out false
3802 bool minimal;-
3803#ifndef QT_NO_REGEXP_CAPTURE-
3804 QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()-
3805 QStringList capturedCache; // what QRegExp::capturedTexts() returned last-
3806#endif-
 // match state; prepared against eng before matching and drained when the
 // engine is released
3807 QRegExpMatchState matchState;-
3808-
 // default: empty pattern, RegExp syntax, case sensitive
3809 inline QRegExpPrivate()-
3810 : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }-
3811 inline QRegExpPrivate(const QRegExpEngineKey &key)-
3812 : eng(0), engineKey(key), minimal(false) {}-
3813};-
3814-
// Cache of engines that are no longer referenced, keyed by engine key.
// derefEngine() inserts into it and prepareEngine_helper() takes from it;
// both hold globalEngineCacheMutex while doing so.
3815#if !defined(QT_NO_REGEXP_OPTIM)-
3816typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;-
3817Q_GLOBAL_STATIC(EngineCache, globalEngineCache)-
3818static QBasicMutex globalEngineCacheMutex;-
3819#endif // QT_NO_REGEXP_OPTIM-
3820-
3821static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)-
3822{-
3823 if (!eng->ref.deref()) {-
3824#if !defined(QT_NO_REGEXP_OPTIM)-
3825 if (globalEngineCache()) {-
3826 QMutexLocker locker(&globalEngineCacheMutex);-
3827 QT_TRY {-
3828 globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);-
3829 } QT_CATCH(const std::bad_alloc &) {-
3830 // in case of an exception (e.g. oom), just delete the engine-
3831 delete eng;-
3832 }-
3833 } else {-
3834 delete eng;-
3835 }-
3836#else-
3837 Q_UNUSED(key);-
3838 delete eng;-
3839#endif-
3840 }-
3841}-
3842-
3843static void prepareEngine_helper(QRegExpPrivate *priv)-
3844{-
3845 bool initMatchState = !priv->eng;-
3846#if !defined(QT_NO_REGEXP_OPTIM)-
3847 if (!priv->eng && globalEngineCache()) {-
3848 QMutexLocker locker(&globalEngineCacheMutex);-
3849 priv->eng = globalEngineCache()->take(priv->engineKey);-
3850 if (priv->eng != 0)-
3851 priv->eng->ref.ref();-
3852 }-
3853#endif // QT_NO_REGEXP_OPTIM-
3854-
3855 if (!priv->eng)-
3856 priv->eng = new QRegExpEngine(priv->engineKey);-
3857-
3858 if (initMatchState)-
3859 priv->matchState.prepareForMatch(priv->eng);-
3860}-
3861-
3862inline static void prepareEngine(QRegExpPrivate *priv)-
3863{-
3864 if (priv->eng)-
3865 return;-
3866 prepareEngine_helper(priv);-
3867}-
3868-
3869static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)-
3870{-
3871 prepareEngine(priv);-
3872 priv->matchState.prepareForMatch(priv->eng);-
3873#ifndef QT_NO_REGEXP_CAPTURE-
3874 priv->t = str;-
3875 priv->capturedCache.clear();-
3876#else-
3877 Q_UNUSED(str);-
3878#endif-
3879}-
3880-
3881static void invalidateEngine(QRegExpPrivate *priv)-
3882{-
3883 if (priv->eng != 0) {-
3884 derefEngine(priv->eng, priv->engineKey);-
3885 priv->eng = 0;-
3886 priv->matchState.drain();-
3887 }-
3888}-
3889-
3890/*!-
3891 \enum QRegExp::CaretMode-
3892-
3893 The CaretMode enum defines the different meanings of the caret-
3894 (\b{^}) in a regular expression. The possible values are:-
3895-
3896 \value CaretAtZero-
3897 The caret corresponds to index 0 in the searched string.-
3898-
3899 \value CaretAtOffset-
3900 The caret corresponds to the start offset of the search.-
3901-
3902 \value CaretWontMatch-
3903 The caret never matches.-
3904*/-
3905-
3906/*!-
3907 \enum QRegExp::PatternSyntax-
3908-
3909 The syntax used to interpret the meaning of the pattern.-
3910-
3911 \value RegExp A rich Perl-like pattern matching syntax. This is-
3912 the default.-
3913-
3914 \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.-
3915 (Introduced in Qt 4.2.)-
3916-
3917 \value Wildcard This provides a simple pattern matching syntax-
3918 similar to that used by shells (command interpreters) for "file-
3919 globbing". See \l{QRegExp wildcard matching}.-
3920-
3921 \value WildcardUnix This is similar to Wildcard but with the-
3922 behavior of a Unix shell. The wildcard characters can be escaped-
3923 with the character "\\".-
3924-
3925 \value FixedString The pattern is a fixed string. This is-
3926 equivalent to using the RegExp pattern on a string in-
3927 which all metacharacters are escaped using escape().-
3928-
3929 \value W3CXmlSchema11 The pattern is a regular expression as-
3930 defined by the W3C XML Schema 1.1 specification.-
3931-
3932 \sa setPatternSyntax()-
3933*/-
3934-
3935/*!-
3936 Constructs an empty regexp.-
3937-
3938 \sa isValid(), errorString()-
3939*/-
3940QRegExp::QRegExp()-
3941{-
3942 priv = new QRegExpPrivate;-
3943 prepareEngine(priv);-
3944}-
3945-
3946/*!-
3947 Constructs a regular expression object for the given \a pattern-
3948 string. The pattern must be given using wildcard notation if \a-
3949 syntax is \l Wildcard; the default is \l RegExp. The pattern is-
3950 case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is-
3951 greedy (maximal), but can be changed by calling-
3952 setMinimal().-
3953-
3954 \sa setPattern(), setCaseSensitivity(), setPatternSyntax()-
3955*/-
3956QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)-
3957{-
3958 priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs));-
3959 prepareEngine(priv);-
3960}-
3961-
3962/*!-
3963 Constructs a regular expression as a copy of \a rx.-
3964-
3965 \sa operator=()-
3966*/-
3967QRegExp::QRegExp(const QRegExp &rx)-
3968{-
3969 priv = new QRegExpPrivate;-
3970 operator=(rx);-
3971}-
3972-
3973/*!-
3974 Destroys the regular expression and cleans up its internal data.-
3975*/-
3976QRegExp::~QRegExp()-
3977{-
3978 invalidateEngine(priv);-
3979 delete priv;-
3980}-
3981-
3982/*!-
3983 Copies the regular expression \a rx and returns a reference to the-
3984 copy. The case sensitivity, wildcard, and minimal matching options-
3985 are also copied.-
3986*/-
3987QRegExp &QRegExp::operator=(const QRegExp &rx)-
3988{-
3989 prepareEngine(rx.priv); // to allow sharing-
3990 QRegExpEngine *otherEng = rx.priv->eng;-
3991 if (otherEng)-
3992 otherEng->ref.ref();-
3993 invalidateEngine(priv);-
3994 priv->eng = otherEng;-
3995 priv->engineKey = rx.priv->engineKey;-
3996 priv->minimal = rx.priv->minimal;-
3997#ifndef QT_NO_REGEXP_CAPTURE-
3998 priv->t = rx.priv->t;-
3999 priv->capturedCache = rx.priv->capturedCache;-
4000#endif-
4001 if (priv->eng)-
4002 priv->matchState.prepareForMatch(priv->eng);-
4003 priv->matchState.captured = rx.priv->matchState.captured;-
4004 return *this;-
4005}-
4006-
4007/*!-
4008 \fn QRegExp &QRegExp::operator=(QRegExp &&other)-
4009-
4010 Move-assigns \a other to this QRegExp instance.-
4011-
4012 \since 5.2-
4013*/-
4014-
4015/*!-
4016 \fn void QRegExp::swap(QRegExp &other)-
4017 \since 4.8-
4018-
4019 Swaps regular expression \a other with this regular-
4020 expression. This operation is very fast and never fails.-
4021*/-
4022-
4023/*!-
4024 Returns \c true if this regular expression is equal to \a rx;-
4025 otherwise returns \c false.-
4026-
4027 Two QRegExp objects are equal if they have the same pattern-
4028 strings and the same settings for case sensitivity, wildcard and-
4029 minimal matching.-
4030*/-
4031bool QRegExp::operator==(const QRegExp &rx) const-
4032{-
4033 return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;-
4034}-
4035-
4036/*!-
4037 \since 5.6-
4038 \relates QRegExp-
4039-
4040 Returns the hash value for \a key, using-
4041 \a seed to seed the calculation.-
4042*/-
4043uint qHash(const QRegExp &key, uint seed) Q_DECL_NOTHROW-
4044{-
4045 QtPrivate::QHashCombine hash;-
4046 seed = hash(seed, key.priv->engineKey);-
4047 seed = hash(seed, key.priv->minimal);-
4048 return seed;
executed 2048 times by 1 test: return seed;
Executed by:
  • tst_QRegExp
2048
4049}-
4050-
4051/*!-
4052 \fn bool QRegExp::operator!=(const QRegExp &rx) const-
4053-
4054 Returns \c true if this regular expression is not equal to \a rx;-
4055 otherwise returns \c false.-
4056-
4057 \sa operator==()-
4058*/-
4059-
4060/*!-
4061 Returns \c true if the pattern string is empty; otherwise returns-
4062 false.-
4063-
4064 If you call exactMatch() with an empty pattern on an empty string-
4065 it will return true; otherwise it returns \c false since it operates-
4066 over the whole string. If you call indexIn() with an empty pattern-
4067 on \e any string it will return the start offset (0 by default)-
4068 because the empty pattern matches the 'emptiness' at the start of-
4069 the string. In this case the length of the match returned by-
4070 matchedLength() will be 0.-
4071-
4072 See QString::isEmpty().-
4073*/-
4074-
4075bool QRegExp::isEmpty() const-
4076{-
4077 return priv->engineKey.pattern.isEmpty();-
4078}-
4079-
4080/*!-
4081 Returns \c true if the regular expression is valid; otherwise returns-
4082 false. An invalid regular expression never matches.-
4083-
4084 The pattern \b{[a-z} is an example of an invalid pattern, since-
4085 it lacks a closing square bracket.-
4086-
4087 Note that the validity of a regexp may also depend on the setting-
4088 of the wildcard flag, for example \b{*.html} is a valid-
4089 wildcard regexp but an invalid full regexp.-
4090-
4091 \sa errorString()-
4092*/-
4093bool QRegExp::isValid() const-
4094{-
4095 if (priv->engineKey.pattern.isEmpty()) {-
4096 return true;-
4097 } else {-
4098 prepareEngine(priv);-
4099 return priv->eng->isValid();-
4100 }-
4101}-
4102-
4103/*!-
4104 Returns the pattern string of the regular expression. The pattern-
4105 has either regular expression syntax or wildcard syntax, depending-
4106 on patternSyntax().-
4107-
4108 \sa patternSyntax(), caseSensitivity()-
4109*/-
4110QString QRegExp::pattern() const-
4111{-
4112 return priv->engineKey.pattern;-
4113}-
4114-
4115/*!-
4116 Sets the pattern string to \a pattern. The case sensitivity,-
4117 wildcard, and minimal matching options are not changed.-
4118-
4119 \sa setPatternSyntax(), setCaseSensitivity()-
4120*/-
4121void QRegExp::setPattern(const QString &pattern)-
4122{-
4123 if (priv->engineKey.pattern != pattern) {-
4124 invalidateEngine(priv);-
4125 priv->engineKey.pattern = pattern;-
4126 }-
4127}-
4128-
4129/*!-
4130 Returns Qt::CaseSensitive if the regexp is matched case-
4131 sensitively; otherwise returns Qt::CaseInsensitive.-
4132-
4133 \sa patternSyntax(), pattern(), isMinimal()-
4134*/-
4135Qt::CaseSensitivity QRegExp::caseSensitivity() const-
4136{-
4137 return priv->engineKey.cs;-
4138}-
4139-
4140/*!-
4141 Sets case sensitive matching to \a cs.-
4142-
4143 If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches-
4144 \c{readme.txt} but not \c{README.TXT}.-
4145-
4146 \sa setPatternSyntax(), setPattern(), setMinimal()-
4147*/-
4148void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)-
4149{-
4150 if ((bool)cs != (bool)priv->engineKey.cs) {-
4151 invalidateEngine(priv);-
4152 priv->engineKey.cs = cs;-
4153 }-
4154}-
4155-
4156/*!-
4157 Returns the syntax used by the regular expression. The default is-
4158 QRegExp::RegExp.-
4159-
4160 \sa pattern(), caseSensitivity()-
4161*/-
4162QRegExp::PatternSyntax QRegExp::patternSyntax() const-
4163{-
4164 return priv->engineKey.patternSyntax;-
4165}-
4166-
4167/*!-
4168 Sets the syntax mode for the regular expression. The default is-
4169 QRegExp::RegExp.-
4170-
4171 Setting \a syntax to QRegExp::Wildcard enables simple shell-like-
4172 \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the-
4173 string \c{readme.txt} in wildcard mode, but does not match-
4174 \c{readme}.-
4175-
4176 Setting \a syntax to QRegExp::FixedString means that the pattern-
4177 is interpreted as a plain string. Special characters (e.g.,-
4178 backslash) don't need to be escaped then.-
4179-
4180 \sa setPattern(), setCaseSensitivity(), escape()-
4181*/-
4182void QRegExp::setPatternSyntax(PatternSyntax syntax)-
4183{-
4184 if (syntax != priv->engineKey.patternSyntax) {-
4185 invalidateEngine(priv);-
4186 priv->engineKey.patternSyntax = syntax;-
4187 }-
4188}-
4189-
4190/*!-
4191 Returns \c true if minimal (non-greedy) matching is enabled;-
4192 otherwise returns \c false.-
4193-
4194 \sa caseSensitivity(), setMinimal()-
4195*/-
4196bool QRegExp::isMinimal() const-
4197{-
4198 return priv->minimal;-
4199}-
4200-
4201/*!-
4202 Enables or disables minimal matching. If \a minimal is false,-
4203 matching is greedy (maximal) which is the default.-
4204-
4205 For example, suppose we have the input string "We must be-
4206 <b>bold</b>, very <b>bold</b>!" and the pattern-
4207 \b{<b>.*</b>}. With the default greedy (maximal) matching,-
4208 the match is "We must be \underline{<b>bold</b>, very-
4209 <b>bold</b>}!". But with minimal (non-greedy) matching, the-
4210 first match is: "We must be \underline{<b>bold</b>}, very-
4211 <b>bold</b>!" and the second match is "We must be <b>bold</b>,-
4212 very \underline{<b>bold</b>}!". In practice we might use the pattern-
4213 \b{<b>[^<]*\</b>} instead, although this will still fail for-
4214 nested tags.-
4215-
4216 \sa setCaseSensitivity()-
4217*/-
4218void QRegExp::setMinimal(bool minimal)-
4219{-
4220 priv->minimal = minimal;-
4221}-
4222-
4223// ### Qt 5: make non-const-
4224/*!-
4225 Returns \c true if \a str is matched exactly by this regular-
4226 expression; otherwise returns \c false. You can determine how much of-
4227 the string was matched by calling matchedLength().-
4228-
4229 For a given regexp string R, exactMatch("R") is the equivalent of-
4230 indexIn("^R$") since exactMatch() effectively encloses the regexp-
4231 in the start of string and end of string anchors, except that it-
4232 sets matchedLength() differently.-
4233-
4234 For example, if the regular expression is \b{blue}, then-
4235 exactMatch() returns \c true only for input \c blue. For inputs \c-
4236 bluebell, \c blutak and \c lightblue, exactMatch() returns \c false-
4237 and matchedLength() will return 4, 3 and 0 respectively.-
4238-
4239 Although const, this function sets matchedLength(),-
4240 capturedTexts(), and pos().-
4241-
4242 \sa indexIn(), lastIndexIn()-
4243*/-
4244bool QRegExp::exactMatch(const QString &str) const-
4245{-
4246 prepareEngineForMatch(priv, str);-
4247 priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);-
4248 if (priv->matchState.captured[1] == str.length()) {-
4249 return true;-
4250 } else {-
4251 priv->matchState.captured[0] = 0;-
4252 priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;-
4253 return false;-
4254 }-
4255}-
4256-
4257// ### Qt 5: make non-const-
4258/*!-
4259 Attempts to find a match in \a str from position \a offset (0 by-
4260 default). If \a offset is -1, the search starts at the last-
4261 character; if -2, at the next to last character; etc.-
4262-
4263 Returns the position of the first match, or -1 if there was no-
4264 match.-
4265-
4266 The \a caretMode parameter can be used to instruct whether \b{^}-
4267 should match at index 0 or at \a offset.-
4268-
4269 You might prefer to use QString::indexOf(), QString::contains(),-
4270 or even QStringList::filter(). To replace matches use-
4271 QString::replace().-
4272-
4273 Example:-
4274 \snippet code/src_corelib_tools_qregexp.cpp 13-
4275-
4276 Although const, this function sets matchedLength(),-
4277 capturedTexts() and pos().-
4278-
    If the QRegExp is a wildcard expression (see setPatternSyntax())
    and you want to test a string against the whole wildcard expression,
    use exactMatch() instead of this function.
4282-
4283 \sa lastIndexIn(), exactMatch()-
4284*/-
4285-
4286int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const-
4287{-
4288 prepareEngineForMatch(priv, str);-
4289 if (offset < 0)-
4290 offset += str.length();-
4291 priv->matchState.match(str.unicode(), str.length(), offset,-
4292 priv->minimal, false, caretIndex(offset, caretMode));-
4293 return priv->matchState.captured[0];-
4294}-
4295-
4296// ### Qt 5: make non-const-
4297/*!-
4298 Attempts to find a match backwards in \a str from position \a-
4299 offset. If \a offset is -1 (the default), the search starts at the-
4300 last character; if -2, at the next to last character; etc.-
4301-
4302 Returns the position of the first match, or -1 if there was no-
4303 match.-
4304-
4305 The \a caretMode parameter can be used to instruct whether \b{^}-
4306 should match at index 0 or at \a offset.-
4307-
4308 Although const, this function sets matchedLength(),-
4309 capturedTexts() and pos().-
4310-
4311 \warning Searching backwards is much slower than searching-
4312 forwards.-
4313-
4314 \sa indexIn(), exactMatch()-
4315*/-
4316-
4317int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const-
4318{-
4319 prepareEngineForMatch(priv, str);-
4320 if (offset < 0)-
4321 offset += str.length();-
4322 if (offset < 0 || offset > str.length()) {-
4323 memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));-
4324 return -1;-
4325 }-
4326-
4327 while (offset >= 0) {-
4328 priv->matchState.match(str.unicode(), str.length(), offset,-
4329 priv->minimal, true, caretIndex(offset, caretMode));-
4330 if (priv->matchState.captured[0] == offset)-
4331 return offset;-
4332 --offset;-
4333 }-
4334 return -1;-
4335}-
4336-
4337/*!-
4338 Returns the length of the last matched string, or -1 if there was-
4339 no match.-
4340-
4341 \sa exactMatch(), indexIn(), lastIndexIn()-
4342*/-
4343int QRegExp::matchedLength() const-
4344{-
4345 return priv->matchState.captured[1];-
4346}-
4347-
4348#ifndef QT_NO_REGEXP_CAPTURE-
4349-
4350/*!-
4351 \since 4.6-
4352 Returns the number of captures contained in the regular expression.-
4353 */-
4354int QRegExp::captureCount() const-
4355{-
4356 prepareEngine(priv);-
4357 return priv->eng->captureCount();-
4358}-
4359-
4360/*!-
4361 Returns a list of the captured text strings.-
4362-
4363 The first string in the list is the entire matched string. Each-
4364 subsequent list element contains a string that matched a-
4365 (capturing) subexpression of the regexp.-
4366-
4367 For example:-
4368 \snippet code/src_corelib_tools_qregexp.cpp 14-
4369-
4370 The above example also captures elements that may be present but-
4371 which we have no interest in. This problem can be solved by using-
4372 non-capturing parentheses:-
4373-
4374 \snippet code/src_corelib_tools_qregexp.cpp 15-
4375-
4376 Note that if you want to iterate over the list, you should iterate-
4377 over a copy, e.g.-
4378 \snippet code/src_corelib_tools_qregexp.cpp 16-
4379-
4380 Some regexps can match an indeterminate number of times. For-
4381 example if the input string is "Offsets: 12 14 99 231 7" and the-
4382 regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of-
4383 all the numbers matched. However, after calling-
4384 \c{rx.indexIn(str)}, capturedTexts() will return the list ("12",-
4385 "12"), i.e. the entire match was "12" and the first subexpression-
4386 matched was "12". The correct approach is to use cap() in a-
4387 \l{QRegExp#cap_in_a_loop}{loop}.-
4388-
4389 The order of elements in the string list is as follows. The first-
4390 element is the entire matching string. Each subsequent element-
4391 corresponds to the next capturing open left parentheses. Thus-
4392 capturedTexts()[1] is the text of the first capturing parentheses,-
4393 capturedTexts()[2] is the text of the second and so on-
4394 (corresponding to $1, $2, etc., in some other regexp languages).-
4395-
4396 \sa cap(), pos()-
4397*/-
4398QStringList QRegExp::capturedTexts() const-
4399{-
4400 if (priv->capturedCache.isEmpty()) {-
4401 prepareEngine(priv);-
4402 const int *captured = priv->matchState.captured;-
4403 int n = priv->matchState.capturedSize;-
4404-
4405 for (int i = 0; i < n; i += 2) {-
4406 QString m;-
4407 if (captured[i + 1] == 0)-
4408 m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty-
4409 else if (captured[i] >= 0)-
4410 m = priv->t.mid(captured[i], captured[i + 1]);-
4411 priv->capturedCache.append(m);-
4412 }-
4413 priv->t.clear();-
4414 }-
4415 return priv->capturedCache;-
4416}-
4417-
4418/*!-
4419 \internal-
4420*/-
4421QStringList QRegExp::capturedTexts()-
4422{-
4423 return const_cast<const QRegExp *>(this)->capturedTexts();-
4424}-
4425-
4426/*!-
4427 Returns the text captured by the \a nth subexpression. The entire-
4428 match has index 0 and the parenthesized subexpressions have-
4429 indexes starting from 1 (excluding non-capturing parentheses).-
4430-
4431 \snippet code/src_corelib_tools_qregexp.cpp 17-
4432-
4433 The order of elements matched by cap() is as follows. The first-
4434 element, cap(0), is the entire matching string. Each subsequent-
4435 element corresponds to the next capturing open left parentheses.-
4436 Thus cap(1) is the text of the first capturing parentheses, cap(2)-
4437 is the text of the second, and so on.-
4438-
4439 \sa capturedTexts(), pos()-
4440*/-
4441QString QRegExp::cap(int nth) const-
4442{-
4443 return capturedTexts().value(nth);-
4444}-
4445-
4446/*!-
4447 \internal-
4448*/-
4449QString QRegExp::cap(int nth)-
4450{-
4451 return const_cast<const QRegExp *>(this)->cap(nth);-
4452}-
4453-
4454/*!-
4455 Returns the position of the \a nth captured text in the searched-
4456 string. If \a nth is 0 (the default), pos() returns the position-
4457 of the whole match.-
4458-
4459 Example:-
4460 \snippet code/src_corelib_tools_qregexp.cpp 18-
4461-
4462 For zero-length matches, pos() always returns -1. (For example, if-
4463 cap(4) would return an empty string, pos(4) returns -1.) This is-
4464 a feature of the implementation.-
4465-
4466 \sa cap(), capturedTexts()-
4467*/-
4468int QRegExp::pos(int nth) const-
4469{-
4470 if (nth < 0 || nth >= priv->matchState.capturedSize / 2)-
4471 return -1;-
4472 else-
4473 return priv->matchState.captured[2 * nth];-
4474}-
4475-
4476/*!-
4477 \internal-
4478*/-
4479int QRegExp::pos(int nth)-
4480{-
4481 return const_cast<const QRegExp *>(this)->pos(nth);-
4482}-
4483-
4484/*!-
    Returns a text string that explains why a regexp pattern is
    invalid, if that is the case; otherwise returns "no error occurred".
4487-
4488 \sa isValid()-
4489*/-
4490QString QRegExp::errorString() const-
4491{-
4492 if (isValid()) {-
4493 return QString::fromLatin1(RXERR_OK);-
4494 } else {-
4495 return priv->eng->errorString();-
4496 }-
4497}-
4498-
4499/*!-
4500 \internal-
4501*/-
4502QString QRegExp::errorString()-
4503{-
4504 return const_cast<const QRegExp *>(this)->errorString();-
4505}-
4506#endif-
4507-
4508/*!-
    Returns the string \a str with every regexp special character
    escaped with a backslash. The special characters are $, (, ), *, +,
    ., ?, [, \\, ], ^, {, | and }.
4512-
4513 Example:-
4514-
4515 \snippet code/src_corelib_tools_qregexp.cpp 19-
4516-
4517 This function is useful to construct regexp patterns dynamically:-
4518-
4519 \snippet code/src_corelib_tools_qregexp.cpp 20-
4520-
4521 \sa setPatternSyntax()-
4522*/-
4523QString QRegExp::escape(const QString &str)-
4524{-
4525 QString quoted;-
4526 const int count = str.count();-
4527 quoted.reserve(count * 2);-
4528 const QLatin1Char backslash('\\');-
4529 for (int i = 0; i < count; i++) {-
4530 switch (str.at(i).toLatin1()) {-
4531 case '$':-
4532 case '(':-
4533 case ')':-
4534 case '*':-
4535 case '+':-
4536 case '.':-
4537 case '?':-
4538 case '[':-
4539 case '\\':-
4540 case ']':-
4541 case '^':-
4542 case '{':-
4543 case '|':-
4544 case '}':-
4545 quoted.append(backslash);-
4546 }-
4547 quoted.append(str.at(i));-
4548 }-
4549 return quoted;-
4550}-
4551-
4552-
4553#ifndef QT_NO_DATASTREAM-
4554/*!-
4555 \relates QRegExp-
4556-
4557 Writes the regular expression \a regExp to stream \a out.-
4558-
4559 \sa {Serializing Qt Data Types}-
4560*/-
4561QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)-
4562{-
4563 return out << regExp.pattern() << (quint8)regExp.caseSensitivity()-
4564 << (quint8)regExp.patternSyntax()-
4565 << (quint8)!!regExp.isMinimal();-
4566}-
4567-
4568/*!-
4569 \relates QRegExp-
4570-
4571 Reads a regular expression from stream \a in into \a regExp.-
4572-
4573 \sa {Serializing Qt Data Types}-
4574*/-
4575QDataStream &operator>>(QDataStream &in, QRegExp &regExp)-
4576{-
4577 QString pattern;-
4578 quint8 cs;-
4579 quint8 patternSyntax;-
4580 quint8 isMinimal;-
4581-
4582 in >> pattern >> cs >> patternSyntax >> isMinimal;-
4583-
4584 QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),-
4585 QRegExp::PatternSyntax(patternSyntax));-
4586-
4587 newRegExp.setMinimal(isMinimal);-
4588 regExp = newRegExp;-
4589 return in;-
4590}-
4591#endif // QT_NO_DATASTREAM-
4592-
4593#ifndef QT_NO_DEBUG_STREAM-
4594QDebug operator<<(QDebug dbg, const QRegExp &r)-
4595{-
4596 QDebugStateSaver saver(dbg);-
4597 dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax()-
4598 << ", pattern='"<< r.pattern() << "')";-
4599 return dbg;-
4600}-
4601#endif-
4602-
4603QT_END_NAMESPACE-
Source codeSwitch to Preprocessed file

Generated by Squish Coco Non-Commercial 4.3.0-BETA-master-30-08-2018-4cb69e9