Absolute File Name: | /home/qt/qt5_coco/qt5/qtbase/src/corelib/tools/qregexp.cpp |
Source code | Switch to Preprocessed file |
Line | Source | Count |
---|---|---|
1 | /**************************************************************************** | - |
2 | ** | - |
3 | ** Copyright (C) 2016 The Qt Company Ltd. | - |
4 | ** Contact: https://www.qt.io/licensing/ | - |
5 | ** | - |
6 | ** This file is part of the QtCore module of the Qt Toolkit. | - |
7 | ** | - |
8 | ** $QT_BEGIN_LICENSE:LGPL$ | - |
9 | ** Commercial License Usage | - |
10 | ** Licensees holding valid commercial Qt licenses may use this file in | - |
11 | ** accordance with the commercial license agreement provided with the | - |
12 | ** Software or, alternatively, in accordance with the terms contained in | - |
13 | ** a written agreement between you and The Qt Company. For licensing terms | - |
14 | ** and conditions see https://www.qt.io/terms-conditions. For further | - |
15 | ** information use the contact form at https://www.qt.io/contact-us. | - |
16 | ** | - |
17 | ** GNU Lesser General Public License Usage | - |
18 | ** Alternatively, this file may be used under the terms of the GNU Lesser | - |
19 | ** General Public License version 3 as published by the Free Software | - |
20 | ** Foundation and appearing in the file LICENSE.LGPL3 included in the | - |
21 | ** packaging of this file. Please review the following information to | - |
22 | ** ensure the GNU Lesser General Public License version 3 requirements | - |
23 | ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. | - |
24 | ** | - |
25 | ** GNU General Public License Usage | - |
26 | ** Alternatively, this file may be used under the terms of the GNU | - |
27 | ** General Public License version 2.0 or (at your option) the GNU General | - |
28 | ** Public license version 3 or any later version approved by the KDE Free | - |
29 | ** Qt Foundation. The licenses are as published by the Free Software | - |
30 | ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 | - |
31 | ** included in the packaging of this file. Please review the following | - |
32 | ** information to ensure the GNU General Public License requirements will | - |
33 | ** be met: https://www.gnu.org/licenses/gpl-2.0.html and | - |
34 | ** https://www.gnu.org/licenses/gpl-3.0.html. | - |
35 | ** | - |
36 | ** $QT_END_LICENSE$ | - |
37 | ** | - |
38 | ****************************************************************************/ | - |
39 | - | |
40 | #include "qregexp.h" | - |
41 | - | |
42 | #include "qalgorithms.h" | - |
43 | #include "qbitarray.h" | - |
44 | #include "qcache.h" | - |
45 | #include "qdatastream.h" | - |
46 | #include "qdebug.h" | - |
47 | #include "qhashfunctions.h" | - |
48 | #include "qlist.h" | - |
49 | #include "qmap.h" | - |
50 | #include "qmutex.h" | - |
51 | #include "qstring.h" | - |
52 | #include "qstringlist.h" | - |
53 | #include "qstringmatcher.h" | - |
54 | #include "qvector.h" | - |
55 | - | |
56 | #include <limits.h> | - |
57 | #include <algorithm> | - |
58 | - | |
59 | QT_BEGIN_NAMESPACE | - |
60 | - | |
61 | int qFindString(const QChar *haystack, int haystackLen, int from, | - |
62 | const QChar *needle, int needleLen, Qt::CaseSensitivity cs); /* forward declaration only; the definition is not in the visible part of this file (presumably qstring.cpp -- TODO confirm) */ | - |
63 | - | |
64 | // Error strings for the regexp parser. Each literal is wrapped in QT_TRANSLATE_NOOP with the "QRegExp" context, which marks it for translation without translating it at this point. | - |
65 | #define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred") | - |
66 | #define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used") | - |
67 | #define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax") | - |
68 | #define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax") | - |
69 | #define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371") | - |
70 | #define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax") | - |
71 | #define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value") | - |
72 | #define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim") | - |
73 | #define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end") | - |
74 | #define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit") | - |
75 | #define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval") | - |
76 | #define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category") | - |
77 | - | |
78 | /*! | - |
79 | \class QRegExp | - |
80 | \inmodule QtCore | - |
81 | \reentrant | - |
82 | \brief The QRegExp class provides pattern matching using regular expressions. | - |
83 | - | |
84 | \ingroup tools | - |
85 | \ingroup shared | - |
86 | - | |
87 | \keyword regular expression | - |
88 | - | |
89 | A regular expression, or "regexp", is a pattern for matching | - |
90 | substrings in a text. This is useful in many contexts, e.g., | - |
91 | - | |
92 | \table | - |
93 | \row \li Validation | - |
94 | \li A regexp can test whether a substring meets some criteria, | - |
95 | e.g. is an integer or contains no whitespace. | - |
96 | \row \li Searching | - |
97 | \li A regexp provides more powerful pattern matching than | - |
98 | simple substring matching, e.g., match one of the words | - |
99 | \e{mail}, \e{letter} or \e{correspondence}, but none of the | - |
100 | words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc. | - |
101 | \row \li Search and Replace | - |
102 | \li A regexp can replace all occurrences of a substring with a | - |
103 | different substring, e.g., replace all occurrences of \e{&} | - |
104 | with \e{\&amp;} except where the \e{&} is already followed by | - |
105 | an \e{amp;}. | - |
106 | \row \li String Splitting | - |
107 | \li A regexp can be used to identify where a string should be | - |
108 | split apart, e.g. splitting tab-delimited strings. | - |
109 | \endtable | - |
110 | - | |
111 | This documentation presents a brief introduction to regexps, a | - |
112 | description of Qt's regexp language, some examples, and the function | - |
113 | documentation itself. QRegExp is modeled on Perl's regexp | - |
114 | language. It fully supports Unicode. QRegExp can also be used in a | - |
115 | simpler, \e{wildcard mode} that is similar to the functionality | - |
116 | found in command shells. The syntax rules used by QRegExp can be | - |
117 | changed with setPatternSyntax(). In particular, the pattern syntax | - |
118 | can be set to QRegExp::FixedString, which means the pattern to be | - |
119 | matched is interpreted as a plain string, i.e., special characters | - |
120 | (e.g., backslash) are not escaped. | - |
121 | - | |
122 | A good text on regexps is \e {Mastering Regular Expressions} | - |
123 | (Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4. | - |
124 | - | |
125 | \note In Qt 5, the new QRegularExpression class provides a Perl | - |
126 | compatible implementation of regular expressions and is recommended | - |
127 | in place of QRegExp. | - |
128 | - | |
129 | \tableofcontents | - |
130 | - | |
131 | \section1 Introduction | - |
132 | - | |
133 | Regexps are built up from expressions, quantifiers, and | - |
134 | assertions. The simplest expression is a character, e.g. \b{x} | - |
135 | or \b{5}. An expression can also be a set of characters | - |
136 | enclosed in square brackets. \b{[ABCD]} will match an \b{A} | - |
137 | or a \b{B} or a \b{C} or a \b{D}. We can write this same | - |
138 | expression as \b{[A-D]}, and an expression to match any | - |
139 | capital letter in the English alphabet is written as | - |
140 | \b{[A-Z]}. | - |
141 | - | |
142 | A quantifier specifies the number of occurrences of an expression | - |
143 | that must be matched. \b{x{1,1}} means match one and only one | - |
144 | \b{x}. \b{x{1,5}} means match a sequence of \b{x} | - |
145 | characters that contains at least one \b{x} but no more than | - |
146 | five. | - |
147 | - | |
148 | Note that in general regexps cannot be used to check for balanced | - |
149 | brackets or tags. For example, a regexp can be written to match an | - |
150 | opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags | - |
151 | are not nested, but if the \c{<b>} tags are nested, that same | - |
152 | regexp will match an opening \c{<b>} tag with the wrong closing | - |
153 | \c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the | - |
154 | first \c{<b>} would be matched with the first \c{</b>}, which is | - |
155 | not correct. However, it is possible to write a regexp that will | - |
156 | match nested brackets or tags correctly, but only if the number of | - |
157 | nesting levels is fixed and known. If the number of nesting levels | - |
158 | is not fixed and known, it is impossible to write a regexp that | - |
159 | will not fail. | - |
160 | - | |
161 | Suppose we want a regexp to match integers in the range 0 to 99. | - |
162 | At least one digit is required, so we start with the expression | - |
163 | \b{[0-9]{1,1}}, which matches a single digit exactly once. This | - |
164 | regexp matches integers in the range 0 to 9. To match integers up | - |
165 | to 99, increase the maximum number of occurrences to 2, so the | - |
166 | regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the | - |
167 | original requirement to match integers from 0 to 99, but it will | - |
168 | also match integers that occur in the middle of strings. If we | - |
169 | want the matched integer to be the whole string, we must use the | - |
170 | anchor assertions, \b{^} (caret) and \b{$} (dollar). When | - |
171 | \b{^} is the first character in a regexp, it means the regexp | - |
172 | must match from the beginning of the string. When \b{$} is the | - |
173 | last character of the regexp, it means the regexp must match to | - |
174 | the end of the string. The regexp becomes \b{^[0-9]{1,2}$}. | - |
175 | Note that assertions, e.g. \b{^} and \b{$}, do not match | - |
176 | characters but locations in the string. | - |
177 | - | |
178 | If you have seen regexps described elsewhere, they may have looked | - |
179 | different from the ones shown here. This is because some sets of | - |
180 | characters and some quantifiers are so common that they have been | - |
181 | given special symbols to represent them. \b{[0-9]} can be | - |
182 | replaced with the symbol \b{\\d}. The quantifier to match | - |
183 | exactly one occurrence, \b{{1,1}}, can be replaced with the | - |
184 | expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So | - |
185 | our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can | - |
186 | also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of | - |
187 | the string, match a digit, followed immediately by 0 or 1 digits}. | - |
188 | In practice, it would be written as \b{^\\d\\d?$}. The \b{?} | - |
189 | is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1 | - |
190 | occurrences. \b{?} makes an expression optional. The regexp | - |
191 | \b{^\\d\\d?$} means \e{From the beginning of the string, match | - |
192 | one digit, followed immediately by 0 or 1 more digit, followed | - |
193 | immediately by end of string}. | - |
194 | - | |
195 | To write a regexp that matches one of the words 'mail' \e or | - |
196 | 'letter' \e or 'correspondence' but does not match words that | - |
197 | contain these words, e.g., 'email', 'mailman', 'mailer', and | - |
198 | 'letterbox', start with a regexp that matches 'mail'. Expressed | - |
199 | fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because | - |
200 | a character expression is automatically quantified by | - |
201 | \b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an | - |
202 | 'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now | - |
203 | we can use the vertical bar \b{|}, which means \b{or}, to | - |
204 | include the other two words, so our regexp for matching any of the | - |
205 | three words becomes \b{mail|letter|correspondence}. Match | - |
206 | 'mail' \b{or} 'letter' \b{or} 'correspondence'. While this | - |
207 | regexp will match one of the three words we want to match, it will | - |
208 | also match words we don't want to match, e.g., 'email'. To | - |
209 | prevent the regexp from matching unwanted words, we must tell it | - |
210 | to begin and end the match at word boundaries. First we enclose | - |
211 | our regexp in parentheses, \b{(mail|letter|correspondence)}. | - |
212 | Parentheses group expressions together, and they identify a part | - |
213 | of the regexp that we wish to \l{capturing text}{capture}. | - |
214 | Enclosing the expression in parentheses allows us to use it as a | - |
215 | component in more complex regexps. It also allows us to examine | - |
216 | which of the three words was actually matched. To force the match | - |
217 | to begin and end on word boundaries, we enclose the regexp in | - |
218 | \b{\\b} \e{word boundary} assertions: | - |
219 | \b{\\b(mail|letter|correspondence)\\b}. Now the regexp means: | - |
220 | \e{Match a word boundary, followed by the regexp in parentheses, | - |
221 | followed by a word boundary}. The \b{\\b} assertion matches a | - |
222 | \e position in the string, not a \e character. A word boundary is | - |
223 | the position between a word character and something that is not | - |
224 | one, e.g., a space, a newline, or the beginning or end of the string. | - |
225 | - | |
226 | If we want to replace ampersand characters with the HTML entity | - |
227 | \b{\&amp;}, the regexp to match is simply \b{\&}. But this | - |
228 | regexp will also match ampersands that have already been converted | - |
229 | to HTML entities. We want to replace only ampersands that are not | - |
230 | already followed by \b{amp;}. For this, we need the negative | - |
231 | lookahead assertion, \b{(?!}__\b{)}. The regexp can then be | - |
232 | written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is} | - |
233 | \b{not} \e{followed by} \b{amp;}. | - |
234 | - | |
235 | If we want to count all the occurrences of 'Eric' and 'Eirik' in a | - |
236 | string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and | - |
237 | \b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is | - |
238 | required to avoid matching words that contain either name, | - |
239 | e.g. 'Ericsson'. Note that the second regexp matches more | - |
240 | spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'. | - |
241 | - | |
242 | Some of the examples discussed above are implemented in the | - |
243 | \l{#code-examples}{code examples} section. | - |
244 | - | |
245 | \target characters-and-abbreviations-for-sets-of-characters | - |
246 | \section1 Characters and Abbreviations for Sets of Characters | - |
247 | - | |
248 | \table | - |
249 | \header \li Element \li Meaning | - |
250 | \row \li \b{c} | - |
251 | \li A character represents itself unless it has a special | - |
252 | regexp meaning. e.g. \b{c} matches the character \e c. | - |
253 | \row \li \b{\\c} | - |
254 | \li A character that follows a backslash matches the character | - |
255 | itself, except as specified below. e.g., To match a literal | - |
256 | caret at the beginning of a string, write \b{\\^}. | - |
257 | \row \li \b{\\a} | - |
258 | \li Matches the ASCII bell (BEL, 0x07). | - |
259 | \row \li \b{\\f} | - |
260 | \li Matches the ASCII form feed (FF, 0x0C). | - |
261 | \row \li \b{\\n} | - |
262 | \li Matches the ASCII line feed (LF, 0x0A, Unix newline). | - |
263 | \row \li \b{\\r} | - |
264 | \li Matches the ASCII carriage return (CR, 0x0D). | - |
265 | \row \li \b{\\t} | - |
266 | \li Matches the ASCII horizontal tab (HT, 0x09). | - |
267 | \row \li \b{\\v} | - |
268 | \li Matches the ASCII vertical tab (VT, 0x0B). | - |
269 | \row \li \b{\\x\e{hhhh}} | - |
270 | \li Matches the Unicode character corresponding to the | - |
271 | hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF). | - |
272 | \row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo}) | - |
273 | \li matches the ASCII/Latin1 character for the octal number | - |
274 | \e{ooo} (between 0 and 0377). | - |
275 | \row \li \b{. (dot)} | - |
276 | \li Matches any character (including newline). | - |
277 | \row \li \b{\\d} | - |
278 | \li Matches a digit (QChar::isDigit()). | - |
279 | \row \li \b{\\D} | - |
280 | \li Matches a non-digit. | - |
281 | \row \li \b{\\s} | - |
282 | \li Matches a whitespace character (QChar::isSpace()). | - |
283 | \row \li \b{\\S} | - |
284 | \li Matches a non-whitespace character. | - |
285 | \row \li \b{\\w} | - |
286 | \li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_'). | - |
287 | \row \li \b{\\W} | - |
288 | \li Matches a non-word character. | - |
289 | \row \li \b{\\\e{n}} | - |
290 | \li The \e{n}-th backreference, e.g. \\1, \\2, etc. | - |
291 | \endtable | - |
292 | - | |
293 | \b{Note:} The C++ compiler transforms backslashes in strings. | - |
294 | To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}. | - |
295 | To match the backslash character itself, enter it four times, i.e. | - |
296 | \c{\\\\}. | - |
297 | - | |
298 | \target sets-of-characters | - |
299 | \section1 Sets of Characters | - |
300 | - | |
301 | Square brackets mean match any character contained in the square | - |
302 | brackets. The character set abbreviations described above can | - |
303 | appear in a character set in square brackets. Except for the | - |
304 | character set abbreviations and the following two exceptions, | - |
305 | characters do not have special meanings in square brackets. | - |
306 | - | |
307 | \table | - |
308 | \row \li \b{^} | - |
309 | - | |
310 | \li The caret negates the character set if it occurs as the | - |
311 | first character (i.e. immediately after the opening square | - |
312 | bracket). \b{[abc]} matches 'a' or 'b' or 'c', but | - |
313 | \b{[^abc]} matches anything \e but 'a' or 'b' or 'c'. | - |
314 | - | |
315 | \row \li \b{-} | - |
316 | - | |
317 | \li The dash indicates a range of characters. \b{[W-Z]} | - |
318 | matches 'W' or 'X' or 'Y' or 'Z'. | - |
319 | - | |
320 | \endtable | - |
321 | - | |
322 | Using the predefined character set abbreviations is more portable | - |
323 | than using character ranges across platforms and languages. For | - |
324 | example, \b{[0-9]} matches a digit in Western alphabets but | - |
325 | \b{\\d} matches a digit in \e any alphabet. | - |
326 | - | |
327 | Note: In other regexp documentation, sets of characters are often | - |
328 | called "character classes". | - |
329 | - | |
330 | \target quantifiers | - |
331 | \section1 Quantifiers | - |
332 | - | |
333 | By default, an expression is automatically quantified by | - |
334 | \b{{1,1}}, i.e. it should occur exactly once. In the following | - |
335 | list, \b{\e {E}} stands for expression. An expression is a | - |
336 | character, or an abbreviation for a set of characters, or a set of | - |
337 | characters in square brackets, or an expression in parentheses. | - |
338 | - | |
339 | \table | - |
340 | \row \li \b{\e {E}?} | - |
341 | - | |
342 | \li Matches zero or one occurrence of \e E. This quantifier | - |
343 | means \e{The previous expression is optional}, because it | - |
344 | will match whether or not the expression is found. \b{\e | - |
345 | {E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?} | - |
346 | matches 'dent' or 'dents'. | - |
347 | - | |
348 | \row \li \b{\e {E}+} | - |
349 | - | |
350 | \li Matches one or more occurrences of \e E. \b{\e {E}+} is | - |
351 | the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0', | - |
352 | '00', '000', etc. | - |
353 | - | |
354 | \row \li \b{\e {E}*} | - |
355 | - | |
356 | \li Matches zero or more occurrences of \e E. It is the same | - |
357 | as \b{\e {E}{0,}}. The \b{*} quantifier is often used | - |
358 | in error where \b{+} should be used. For example, if | - |
359 | \b{\\s*$} is used in an expression to match strings that | - |
360 | end in whitespace, it will match every string because | - |
361 | \b{\\s*$} means \e{Match zero or more whitespaces followed | - |
362 | by end of string}. The correct regexp to match strings that | - |
363 | have at least one trailing whitespace character is | - |
364 | \b{\\s+$}. | - |
365 | - | |
366 | \row \li \b{\e {E}{n}} | - |
367 | - | |
368 | \li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}} | - |
369 | is the same as repeating \e E \e n times. For example, | - |
370 | \b{x{5}} is the same as \b{xxxxx}. It is also the same | - |
371 | as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}. | - |
372 | - | |
373 | \row \li \b{\e {E}{n,}} | - |
374 | \li Matches at least \e n occurrences of \e E. | - |
375 | - | |
376 | \row \li \b{\e {E}{,m}} | - |
377 | \li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}} | - |
378 | is the same as \b{\e {E}{0,m}}. | - |
379 | - | |
380 | \row \li \b{\e {E}{n,m}} | - |
381 | \li Matches at least \e n and at most \e m occurrences of \e E. | - |
382 | \endtable | - |
383 | - | |
384 | To apply a quantifier to more than just the preceding character, | - |
385 | use parentheses to group characters together in an expression. For | - |
386 | example, \b{tag+} matches a 't' followed by an 'a' followed by | - |
387 | at least one 'g', whereas \b{(tag)+} matches at least one | - |
388 | occurrence of 'tag'. | - |
389 | - | |
390 | Note: Quantifiers are normally "greedy". They always match as much | - |
391 | text as they can. For example, \b{0+} matches the first zero it | - |
392 | finds and all the consecutive zeros after the first zero. Applied | - |
393 | to '20005', it matches '2\underline{000}5'. Quantifiers can be made | - |
394 | non-greedy, see setMinimal(). | - |
395 | - | |
396 | \target capturing parentheses | - |
397 | \target backreferences | - |
398 | \section1 Capturing Text | - |
399 | - | |
400 | Parentheses allow us to group elements together so that we can | - |
401 | quantify and capture them. For example if we have the expression | - |
402 | \b{mail|letter|correspondence} that matches a string we know | - |
403 | that \e one of the words matched but not which one. Using | - |
404 | parentheses allows us to "capture" whatever is matched within | - |
405 | their bounds, so if we used \b{(mail|letter|correspondence)} | - |
406 | and matched this regexp against the string "I sent you some email" | - |
407 | we can use the cap() or capturedTexts() functions to extract the | - |
408 | matched characters, in this case 'mail'. | - |
409 | - | |
410 | We can use captured text within the regexp itself. To refer to the | - |
411 | captured text we use \e backreferences which are indexed from 1, | - |
412 | the same as for cap(). For example we could search for duplicate | - |
413 | words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a | - |
414 | word boundary followed by one or more word characters followed by | - |
415 | one or more non-word characters followed by the same text as the | - |
416 | first parenthesized expression followed by a word boundary. | - |
417 | - | |
418 | If we want to use parentheses purely for grouping and not for | - |
419 | capturing we can use the non-capturing syntax, e.g. | - |
420 | \b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and | - |
421 | end ')'. In this example we match either 'green' or 'blue' but we | - |
422 | do not capture the match so we only know whether or not we matched | - |
423 | but not which color we actually found. Using non-capturing | - |
424 | parentheses is more efficient than using capturing parentheses | - |
425 | since the regexp engine has to do less book-keeping. | - |
426 | - | |
427 | Both capturing and non-capturing parentheses may be nested. | - |
428 | - | |
429 | \target greedy quantifiers | - |
430 | - | |
431 | For historical reasons, quantifiers (e.g. \b{*}) that apply to | - |
432 | capturing parentheses are more "greedy" than other quantifiers. | - |
433 | For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa". | - |
434 | This behavior is different from what other regexp engines do | - |
435 | (notably, Perl). To obtain a more intuitive capturing behavior, | - |
436 | specify QRegExp::RegExp2 to the QRegExp constructor or call | - |
437 | setPatternSyntax(QRegExp::RegExp2). | - |
438 | - | |
439 | \target cap_in_a_loop | - |
440 | - | |
441 | When the number of matches cannot be determined in advance, a | - |
442 | common idiom is to use cap() in a loop. For example: | - |
443 | - | |
444 | \snippet code/src_corelib_tools_qregexp.cpp 0 | - |
445 | - | |
446 | \target assertions | - |
447 | \section1 Assertions | - |
448 | - | |
449 | Assertions make some statement about the text at the point where | - |
450 | they occur in the regexp but they do not match any characters. In | - |
451 | the following list \b{\e {E}} stands for any expression. | - |
452 | - | |
453 | \table | - |
454 | \row \li \b{^} | - |
455 | \li The caret signifies the beginning of the string. If you | - |
456 | wish to match a literal \c{^} you must escape it by | - |
457 | writing \c{\\^}. For example, \b{^#include} will only | - |
458 | match strings which \e begin with the characters '#include'. | - |
459 | (When the caret is the first character of a character set it | - |
460 | has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.) | - |
461 | - | |
462 | \row \li \b{$} | - |
463 | \li The dollar signifies the end of the string. For example | - |
464 | \b{\\d\\s*$} will match strings which end with a digit | - |
465 | optionally followed by whitespace. If you wish to match a | - |
466 | literal \c{$} you must escape it by writing | - |
467 | \c{\\$}. | - |
468 | - | |
469 | \row \li \b{\\b} | - |
470 | \li A word boundary. For example the regexp | - |
471 | \b{\\bOK\\b} means match immediately after a word | - |
472 | boundary (e.g. start of string or whitespace) the letter 'O' | - |
473 | then the letter 'K' immediately before another word boundary | - |
474 | (e.g. end of string or whitespace). But note that the | - |
475 | assertion does not actually match any whitespace so if we | - |
476 | write \b{(\\bOK\\b)} and we have a match it will only | - |
477 | contain 'OK' even if the string is "It's \underline{OK} now". | - |
478 | - | |
479 | \row \li \b{\\B} | - |
480 | \li A non-word boundary. This assertion is true wherever | - |
481 | \b{\\b} is false. For example if we searched for | - |
482 | \b{\\Bon\\B} in "Left on" the match would fail (space | - |
483 | and end of string aren't non-word boundaries), but it would | - |
484 | match in "t\underline{on}ne". | - |
485 | - | |
486 | \row \li \b{(?=\e E)} | - |
487 | \li Positive lookahead. This assertion is true if the | - |
488 | expression matches at this point in the regexp. For example, | - |
489 | \b{const(?=\\s+char)} matches 'const' whenever it is | - |
490 | followed by 'char', as in 'static \underline{const} char *'. | - |
491 | (Compare with \b{const\\s+char}, which matches 'static | - |
492 | \underline{const char} *'.) | - |
493 | - | |
494 | \row \li \b{(?!\e E)} | - |
495 | \li Negative lookahead. This assertion is true if the | - |
496 | expression does not match at this point in the regexp. For | - |
497 | example, \b{const(?!\\s+char)} matches 'const' \e except | - |
498 | when it is followed by 'char'. | - |
499 | \endtable | - |
500 | - | |
501 | \target QRegExp wildcard matching | - |
502 | \section1 Wildcard Matching | - |
503 | - | |
504 | Most command shells such as \e bash or \e cmd.exe support "file | - |
505 | globbing", the ability to identify a group of files by using | - |
506 | wildcards. The setPatternSyntax() function is used to switch | - |
507 | between regexp and wildcard mode. Wildcard matching is much | - |
508 | simpler than full regexps and has only four features: | - |
509 | - | |
510 | \table | - |
511 | \row \li \b{c} | - |
512 | \li Any character represents itself apart from those mentioned | - |
513 | below. Thus \b{c} matches the character \e c. | - |
514 | \row \li \b{?} | - |
515 | \li Matches any single character. It is the same as | - |
516 | \b{.} in full regexps. | - |
517 | \row \li \b{*} | - |
518 | \li Matches zero or more of any characters. It is the | - |
519 | same as \b{.*} in full regexps. | - |
520 | \row \li \b{[...]} | - |
521 | \li Sets of characters can be represented in square brackets, | - |
522 | similar to full regexps. Within the character class, like | - |
523 | outside, backslash has no special meaning. | - |
524 | \endtable | - |
525 | - | |
526 | In the mode Wildcard, the wildcard characters cannot be | - |
527 | escaped. In the mode WildcardUnix, the character '\\' escapes the | - |
528 | wildcard. | - |
529 | - | |
530 | For example if we are in wildcard mode and have strings which | - |
531 | contain filenames we could identify HTML files with \b{*.html}. | - |
532 | This will match zero or more characters followed by a dot followed | - |
533 | by 'h', 't', 'm' and 'l'. | - |
534 | - | |
535 | To test a string against a wildcard expression, use exactMatch(). | - |
536 | For example: | - |
537 | - | |
538 | \snippet code/src_corelib_tools_qregexp.cpp 1 | - |
539 | - | |
540 | \target perl-users | - |
541 | \section1 Notes for Perl Users | - |
542 | - | |
543 | Most of the character class abbreviations supported by Perl are | - |
544 | supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters} | - |
545 | {characters and abbreviations for sets of characters}. | - |
546 | - | |
547 | In QRegExp, apart from within character classes, \c{^} always | - |
548 | signifies the start of the string, so carets must always be | - |
549 | escaped unless used for that purpose. In Perl the meaning of caret | - |
550 | varies automagically depending on where it occurs so escaping it | - |
551 | is rarely necessary. The same applies to \c{$} which in | - |
552 | QRegExp always signifies the end of the string. | - |
553 | - | |
554 | QRegExp's quantifiers are the same as Perl's greedy quantifiers | - |
555 | (but see the \l{greedy quantifiers}{note above}). Non-greedy | - |
556 | matching cannot be applied to individual quantifiers, but can be | - |
557 | applied to all the quantifiers in the pattern. For example, to | - |
558 | match the Perl regexp \b{ro+?m} requires: | - |
559 | - | |
560 | \snippet code/src_corelib_tools_qregexp.cpp 2 | - |
561 | - | |
562 | The equivalent of Perl's \c{/i} option is | - |
563 | setCaseSensitivity(Qt::CaseInsensitive). | - |
564 | - | |
565 | Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}. | - |
566 | - | |
567 | In QRegExp \b{.} matches any character, therefore all QRegExp | - |
568 | regexps have the equivalent of Perl's \c{/s} option. QRegExp | - |
569 | does not have an equivalent to Perl's \c{/m} option, but this | - |
570 | can be emulated in various ways for example by splitting the input | - |
571 | into lines or by looping with a regexp that searches for newlines. | - |
572 | - | |
573 | Because QRegExp is string oriented, there are no \\A, \\Z, or \\z | - |
574 | assertions. The \\G assertion is not supported but can be emulated | - |
575 | in a loop. | - |
576 | - | |
577 | Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp | - |
578 | equivalents for $`, $' or $+. Perl's capturing variables, $1, $2, | - |
579 | ... correspond to cap(1) or capturedTexts()[1], cap(2) or | - |
580 | capturedTexts()[2], etc. | - |
581 | - | |
582 | To substitute a pattern use QString::replace(). | - |
583 | - | |
584 | Perl's extended \c{/x} syntax is not supported, nor are | - |
585 | directives, e.g. (?i), or regexp comments, e.g. (?#comment). On | - |
586 | the other hand, C++'s rules for literal strings can be used to | - |
587 | achieve the same: | - |
588 | - | |
589 | \snippet code/src_corelib_tools_qregexp.cpp 3 | - |
590 | - | |
591 | Both zero-width positive and zero-width negative lookahead | - |
592 | assertions (?=pattern) and (?!pattern) are supported with the same | - |
593 | syntax as Perl. Perl's lookbehind assertions, "independent" | - |
594 | subexpressions and conditional expressions are not supported. | - |
595 | - | |
596 | Non-capturing parentheses are also supported, with the same | - |
597 | (?:pattern) syntax. | - |
598 | - | |
599 | See QString::split() and QStringList::join() for equivalents | - |
600 | to Perl's split and join functions. | - |
601 | - | |
602 | Note: because C++ transforms \\'s they must be written \e twice in | - |
603 | code, e.g. \b{\\b} must be written \b{\\\\b}. | - |
604 | - | |
605 | \target code-examples | - |
606 | \section1 Code Examples | - |
607 | - | |
608 | \snippet code/src_corelib_tools_qregexp.cpp 4 | - |
609 | - | |
610 | The third string matches '\underline{6}'. This is a simple validation | - |
611 | regexp for integers in the range 0 to 99. | - |
612 | - | |
613 | \snippet code/src_corelib_tools_qregexp.cpp 5 | - |
614 | - | |
615 | The second string matches '\underline{This_is-OK}'. We've used the | - |
616 | character set abbreviation '\\S' (non-whitespace) and the anchors | - |
617 | to match strings which contain no whitespace. | - |
618 | - | |
619 | In the following example we match strings containing 'mail' or | - |
620 | 'letter' or 'correspondence', but only match whole words, i.e. not | - |
621 | 'email': | - |
622 | - | |
623 | \snippet code/src_corelib_tools_qregexp.cpp 6 | - |
624 | - | |
625 | The second string matches "Please write the \underline{letter}". The | - |
626 | word 'letter' is also captured (because of the parentheses). We | - |
627 | can see what text we've captured like this: | - |
628 | - | |
629 | \snippet code/src_corelib_tools_qregexp.cpp 7 | - |
630 | - | |
631 | This will capture the text from the first set of capturing | - |
632 | parentheses (counting capturing left parentheses from left to | - |
633 | right). The parentheses are counted from 1 since cap(0) is the | - |
634 | whole matched regexp (equivalent to '&' in most regexp engines). | - |
635 | - | |
636 | \snippet code/src_corelib_tools_qregexp.cpp 8 | - |
637 | - | |
638 | Here we've passed the QRegExp to QString's replace() function to | - |
639 | replace the matched text with new text. | - |
640 | - | |
641 | \snippet code/src_corelib_tools_qregexp.cpp 9 | - |
642 | - | |
643 | We've used the indexIn() function to repeatedly match the regexp in | - |
644 | the string. Note that instead of moving forward by one character | - |
645 | at a time \c pos++ we could have written \c {pos += | - |
646 | rx.matchedLength()} to skip over the already matched string. The | - |
647 | count will equal 3, matching 'One \underline{Eric} another | - |
648 | \underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it | - |
649 | doesn't match 'Ericsson' or 'Eiriks' because in those words the | - |
650 | name is not followed by a word boundary. | - |
651 | - | |
652 | One common use of regexps is to split lines of delimited data into | - |
653 | their component fields. | - |
654 | - | |
655 | \snippet code/src_corelib_tools_qregexp.cpp 10 | - |
656 | - | |
657 | In this example our input lines have the format company name, web | - |
658 | address and country. Unfortunately the regexp is rather long and | - |
659 | not very versatile -- the code will break if we add any more | - |
660 | fields. A simpler and better solution is to look for the | - |
661 | separator, '\\t' in this case, and take the surrounding text. The | - |
662 | QString::split() function can take a separator string or regexp | - |
663 | as an argument and split a string accordingly. | - |
664 | - | |
665 | \snippet code/src_corelib_tools_qregexp.cpp 11 | - |
666 | - | |
667 | Here field[0] is the company, field[1] the web address and so on. | - |
668 | - | |
669 | To imitate the matching of a shell we can use wildcard mode. | - |
670 | - | |
671 | \snippet code/src_corelib_tools_qregexp.cpp 12 | - |
672 | - | |
673 | Wildcard matching can be convenient because of its simplicity, but | - |
674 | any wildcard regexp can be defined using full regexps, e.g. | - |
675 | \b{.*\\.html$}. Notice that we can't match both \c .html and \c | - |
676 | .htm files with a wildcard unless we use \b{*.htm*} which will | - |
677 | also match 'test.html.bak'. A full regexp gives us the precision | - |
678 | we need, \b{.*\\.html?$}. | - |
679 | - | |
680 | QRegExp can match case insensitively using setCaseSensitivity(), | - |
681 | and can use non-greedy matching, see setMinimal(). By | - |
682 | default QRegExp uses full regexps but this can be changed with | - |
683 | setPatternSyntax(). Searching can be done forward with indexIn() or backward | - |
684 | with lastIndexIn(). Captured text can be accessed using | - |
685 | capturedTexts() which returns a string list of all captured | - |
686 | strings, or using cap() which returns the captured string for the | - |
687 | given index. The pos() function takes a match index and returns | - |
688 | the position in the string where the match was made (or -1 if | - |
689 | there was no match). | - |
690 | - | |
691 | \sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel, | - |
692 | {tools/regexp}{Regular Expression Example} | - |
693 | */ | - |
694 | - | |
// On VxWorks an EOS macro may already be defined (presumably by a system
// header); drop it so it cannot clash with the EOS constant declared below.
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

// Size of the "bad character" tables (see the occ1 first-occurrence arrays).
// BadChar() folds a QChar's UTF-16 code unit into the range [0, NumBadChars).
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values. NOTE(review): meanings below are inferred from the names
// and from the comments on the members that use them -- confirm at use sites.
const int NoOccurrence = INT_MAX; // presumably: character cannot occur at all
const int EmptyCapture = INT_MAX; // presumably: capture slot not yet filled
const int InftyLen = INT_MAX;     // "infinite" length (see Box::maxl)
const int InftyRep = 1025;        // presumably: repetition count treated as unbounded
const int EOS = -1;               // presumably: end-of-pattern marker of the tokenizer
707 | - | |
708 | static bool isWord(QChar ch) | - |
709 | { | - |
710 | return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_'); | - |
711 | } | - |
712 | - | |
713 | /* | - |
714 | Merges two vectors of ints and puts the result into the first | - |
715 | one. | - |
716 | */ | - |
717 | static void mergeInto(QVector<int> *a, const QVector<int> &b) | - |
718 | { | - |
719 | int asize = a->size(); | - |
720 | int bsize = b.size(); | - |
721 | if (asize == 0) { | - |
722 | *a = b; | - |
723 | #ifndef QT_NO_REGEXP_OPTIM | - |
724 | } else if (bsize == 1 && a->at(asize - 1) < b.at(0)) { | - |
725 | a->resize(asize + 1); | - |
726 | (*a)[asize] = b.at(0); | - |
727 | #endif | - |
728 | } else if (bsize >= 1) { | - |
729 | int csize = asize + bsize; | - |
730 | QVector<int> c(csize); | - |
731 | int i = 0, j = 0, k = 0; | - |
732 | while (i < asize) { | - |
733 | if (j < bsize) { | - |
734 | if (a->at(i) == b.at(j)) { | - |
735 | ++i; | - |
736 | --csize; | - |
737 | } else if (a->at(i) < b.at(j)) { | - |
738 | c[k++] = a->at(i++); | - |
739 | } else { | - |
740 | c[k++] = b.at(j++); | - |
741 | } | - |
742 | } else { | - |
743 | memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int)); | - |
744 | break; | - |
745 | } | - |
746 | } | - |
747 | c.resize(csize); | - |
748 | if (j < bsize) | - |
749 | memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int)); | - |
750 | *a = c; | - |
751 | } | - |
752 | } | - |
753 | - | |
754 | #ifndef QT_NO_REGEXP_WILDCARD | - |
755 | /* | - |
756 | Translates a wildcard pattern to an equivalent regular expression | - |
757 | pattern (e.g., *.cpp to .*\.cpp). | - |
758 | - | |
759 | If enableEscaping is true, it is possible to escape the wildcard | - |
760 | characters with \ | - |
761 | */ | - |
762 | static QString wc2rx(const QString &wc_str, const bool enableEscaping) | - |
763 | { | - |
764 | const int wclen = wc_str.length(); | - |
765 | QString rx; | - |
766 | int i = 0; | - |
767 | bool isEscaping = false; // the previous character is '\' | - |
768 | const QChar *wc = wc_str.unicode(); | - |
769 | - | |
770 | while (i < wclen) { | - |
771 | const QChar c = wc[i++]; | - |
772 | switch (c.unicode()) { | - |
773 | case '\\': | - |
774 | if (enableEscaping) { | - |
775 | if (isEscaping) { | - |
776 | rx += QLatin1String("\\\\"); | - |
777 | } // we insert the \\ later if necessary | - |
778 | if (i == wclen) { // the end | - |
779 | rx += QLatin1String("\\\\"); | - |
780 | } | - |
781 | } else { | - |
782 | rx += QLatin1String("\\\\"); | - |
783 | } | - |
784 | isEscaping = true; | - |
785 | break; | - |
786 | case '*': | - |
787 | if (isEscaping) { | - |
788 | rx += QLatin1String("\\*"); | - |
789 | isEscaping = false; | - |
790 | } else { | - |
791 | rx += QLatin1String(".*"); | - |
792 | } | - |
793 | break; | - |
794 | case '?': | - |
795 | if (isEscaping) { | - |
796 | rx += QLatin1String("\\?"); | - |
797 | isEscaping = false; | - |
798 | } else { | - |
799 | rx += QLatin1Char('.'); | - |
800 | } | - |
801 | - | |
802 | break; | - |
803 | case '$': | - |
804 | case '(': | - |
805 | case ')': | - |
806 | case '+': | - |
807 | case '.': | - |
808 | case '^': | - |
809 | case '{': | - |
810 | case '|': | - |
811 | case '}': | - |
812 | if (isEscaping) { | - |
813 | isEscaping = false; | - |
814 | rx += QLatin1String("\\\\"); | - |
815 | } | - |
816 | rx += QLatin1Char('\\'); | - |
817 | rx += c; | - |
818 | break; | - |
819 | case '[': | - |
820 | if (isEscaping) { | - |
821 | isEscaping = false; | - |
822 | rx += QLatin1String("\\["); | - |
823 | } else { | - |
824 | rx += c; | - |
825 | if (wc[i] == QLatin1Char('^')) | - |
826 | rx += wc[i++]; | - |
827 | if (i < wclen) { | - |
828 | if (rx[i] == QLatin1Char(']')) | - |
829 | rx += wc[i++]; | - |
830 | while (i < wclen && wc[i] != QLatin1Char(']')) { | - |
831 | if (wc[i] == QLatin1Char('\\')) | - |
832 | rx += QLatin1Char('\\'); | - |
833 | rx += wc[i++]; | - |
834 | } | - |
835 | } | - |
836 | } | - |
837 | break; | - |
838 | - | |
839 | case ']': | - |
840 | if(isEscaping){ | - |
841 | isEscaping = false; | - |
842 | rx += QLatin1String("\\"); | - |
843 | } | - |
844 | rx += c; | - |
845 | break; | - |
846 | - | |
847 | default: | - |
848 | if(isEscaping){ | - |
849 | isEscaping = false; | - |
850 | rx += QLatin1String("\\\\"); | - |
851 | } | - |
852 | rx += c; | - |
853 | } | - |
854 | } | - |
855 | return rx; | - |
856 | } | - |
857 | #endif | - |
858 | - | |
859 | static int caretIndex(int offset, QRegExp::CaretMode caretMode) | - |
860 | { | - |
861 | if (caretMode == QRegExp::CaretAtZero) { | - |
862 | return 0; | - |
863 | } else if (caretMode == QRegExp::CaretAtOffset) { | - |
864 | return offset; | - |
865 | } else { // QRegExp::CaretWontMatch | - |
866 | return -1; | - |
867 | } | - |
868 | } | - |
869 | - | |
870 | /* | - |
871 | The QRegExpEngineKey struct uniquely identifies an engine. | - |
872 | */ | - |
873 | struct QRegExpEngineKey | - |
874 | { | - |
875 | QString pattern; | - |
876 | QRegExp::PatternSyntax patternSyntax; | - |
877 | Qt::CaseSensitivity cs; | - |
878 | - | |
879 | inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax, | - |
880 | Qt::CaseSensitivity cs) | - |
881 | : pattern(pattern), patternSyntax(patternSyntax), cs(cs) {} | - |
882 | - | |
883 | inline void clear() { | - |
884 | pattern.clear(); | - |
885 | patternSyntax = QRegExp::RegExp; | - |
886 | cs = Qt::CaseSensitive; | - |
887 | } | - |
888 | }; | - |
889 | - | |
890 | static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2) | - |
891 | { | - |
892 | return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax | - |
893 | && key1.cs == key2.cs; | - |
894 | } | - |
895 | - | |
896 | static uint qHash(const QRegExpEngineKey &key, uint seed = 0) Q_DECL_NOTHROW | - |
897 | { | - |
898 | QtPrivate::QHashCombine hash; | - |
899 | seed = hash(seed, key.pattern); | - |
900 | seed = hash(seed, key.patternSyntax); | - |
901 | seed = hash(seed, key.cs); | - |
902 | return seed; executed 630297 times by 167 tests: return seed; Executed by:
| 630297 |
903 | } | - |
904 | - | |
905 | class QRegExpEngine; | - |
906 | - | |
907 | //Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE); | - |
908 | - | |
/*
    This is the engine state during matching.

    All int pointers below (except bigArray itself) point into the single
    bigArray buffer; prepareForMatch() (re)allocates that buffer and sets
    the pointers up. The constructor only clears bigArray and captured;
    every other member stays uninitialized until prepareForMatch() and
    match() assign it.

    NOTE(review): the struct frees bigArray in its destructor but is still
    implicitly copyable; copying an instance would free the buffer twice.
*/
struct QRegExpMatchState
{
    const QChar *in; // a pointer to the input string data
    int pos; // the current position in the string
    int caretPos; // the caret index handed to match()
    int len; // the length of the input string
    bool minimal; // minimal matching?
    int *bigArray; // big array holding the data for the next pointers
    int *inNextStack; // is state in nextStack?
    int *curStack; // stack of current states
    int *nextStack; // stack of next states
    int *curCapBegin; // start of current states' captures
    int *nextCapBegin; // start of next states' captures
    int *curCapEnd; // end of current states' captures
    int *nextCapEnd; // end of next states' captures
    int *tempCapBegin; // start of temporary captures
    int *tempCapEnd; // end of temporary captures
    int *capBegin; // start of captures for a next state
    int *capEnd; // end of captures for a next state
    int *slideTab; // bump-along slide table for bad-character heuristic
    int *captured; // what match() returned last
    int slideTabSize; // size of slide table
    int capturedSize; // number of ints in captured
#ifndef QT_NO_REGEXP_BACKREF
    QList<QVector<int> > sleeping; // list of back-reference sleepers
#endif
    int matchLen; // length of match
    int oneTestMatchedLen; // length of partial match

    const QRegExpEngine *eng; // the engine set by prepareForMatch()

    inline QRegExpMatchState() : bigArray(0), captured(0) {}
    inline ~QRegExpMatchState() { free(bigArray); }

    // Releases the buffer; captured is cleared because it pointed into it.
    void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory
    void prepareForMatch(QRegExpEngine *eng);
    void match(const QChar *str, int len, int pos, bool minimal,
        bool oneTest, int caretIndex);
    bool matchHere();
    bool testAnchor(int i, int a, const int *capBegin);
};
953 | - | |
/*
    The struct QRegExpAutomatonState represents one state in a modified NFA. The
    input characters matched are stored in the state instead of on
    the transitions, something possible for an automaton
    constructed from a regular expression.

    The default constructor leaves atom and match uninitialized.
*/
struct QRegExpAutomatonState
{
#ifndef QT_NO_REGEXP_CAPTURE
    int atom; // which atom does this state belong to?
#endif
    int match; // what does it match? (see CharClassBit and BackRefBit)
    QVector<int> outs; // out-transitions
    QMap<int, int> reenter; // atoms reentered when transiting out
    QMap<int, int> anchors; // anchors met when transiting out

    inline QRegExpAutomatonState() { }
#ifndef QT_NO_REGEXP_CAPTURE
    // a: the atom the state belongs to; m: what the state matches
    inline QRegExpAutomatonState(int a, int m)
        : atom(a), match(m) { }
#else
    // m: what the state matches
    inline QRegExpAutomatonState(int m)
        : match(m) { }
#endif
};

Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);
981 | - | |
/*
    The struct QRegExpCharClassRange represents a range of characters (e.g.,
    [0-9] denotes range 48 to 57).

    The range is stored as its first code unit plus the number of code
    units it spans; the trailing comments show the values for [0-9].
*/
struct QRegExpCharClassRange
{
    ushort from; // 48
    ushort len; // 10
};

Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);
993 | - | |
#ifndef QT_NO_REGEXP_CAPTURE
/*
    The struct QRegExpAtom represents one node in the hierarchy of regular
    expression atoms.
*/
struct QRegExpAtom
{
    // Negative marker values; NOTE(review): presumably stored in capture
    // in place of a real capture index -- confirm against startAtom() and
    // finishAtom().
    enum { NoCapture = -1, OfficialCapture = -2, UnofficialCapture = -3 };

    int parent; // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};

Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);
#endif
1009 | - | |
1010 | struct QRegExpLookahead; | - |
1011 | - | |
#ifndef QT_NO_REGEXP_ANCHOR_ALT
/*
    The struct QRegExpAnchorAlternation represents a pair of anchors with
    OR semantics.

    NOTE(review): a and b are presumably anchor bit masks in the encoding
    of QRegExpEngine's Anchor_* values -- confirm against
    anchorAlternation().
*/
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};

Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);
#endif
1025 | - | |
#ifndef QT_NO_REGEXP_CCLASS

// Bit mask with only bit number x set.
#define FLAG(x) (1 << (x))
/*
    The class QRegExpCharClass represents a set of characters, such as can
    be found in regular expressions (e.g., [a-z] denotes the set
    {a, b, ..., z}).
*/
class QRegExpCharClass
{
public:
    QRegExpCharClass();

    void clear();
    bool negative() const { return n; }
    void setNegative(bool negative);
    // NOTE(review): cats is presumably a bit mask of character categories
    // built with FLAG() -- confirm against the definition.
    void addCategories(uint cats);
    void addRange(ushort from, ushort to);
    // A single character is added as the range [ch, ch].
    void addSingleton(ushort ch) { addRange(ch, ch); }

    bool in(QChar ch) const;
#ifndef QT_NO_REGEXP_OPTIM
    const QVector<int> &firstOccurrence() const { return occ1; }
#endif

#if defined(QT_DEBUG)
    void dump() const;
#endif

private:
    QVector<QRegExpCharClassRange> r; // character ranges
#ifndef QT_NO_REGEXP_OPTIM
    QVector<int> occ1; // first-occurrence array
#endif
    uint c; // character classes
    bool n; // negative?
};
#else
/*
    Placeholder used when character classes are compiled out. With the
    optimizer enabled it still provides a first-occurrence array, filled
    with NumBadChars zeros.
*/
struct QRegExpCharClass
{
    int dummy;

#ifndef QT_NO_REGEXP_OPTIM
    QRegExpCharClass() { occ1.fill(0, NumBadChars); }

    const QVector<int> &firstOccurrence() const { return occ1; }
    QVector<int> occ1;
#endif
};
#endif

Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);
1078 | - | |
1079 | /* | - |
1080 | The QRegExpEngine class encapsulates a modified nondeterministic | - |
1081 | finite automaton (NFA). | - |
1082 | */ | - |
1083 | class QRegExpEngine | - |
1084 | { | - |
1085 | public: | - |
1086 | QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers) | - |
1087 | : cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); } | - |
1088 | - | |
1089 | QRegExpEngine(const QRegExpEngineKey &key); | - |
1090 | ~QRegExpEngine(); | - |
1091 | - | |
1092 | bool isValid() const { return valid; } | - |
1093 | const QString &errorString() const { return yyError; } | - |
1094 | int captureCount() const { return officialncap; } | - |
1095 | - | |
1096 | int createState(QChar ch); | - |
1097 | int createState(const QRegExpCharClass &cc); | - |
1098 | #ifndef QT_NO_REGEXP_BACKREF | - |
1099 | int createState(int bref); | - |
1100 | #endif | - |
1101 | - | |
1102 | void addCatTransitions(const QVector<int> &from, const QVector<int> &to); | - |
1103 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1104 | void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom); | - |
1105 | #endif | - |
1106 | - | |
1107 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
1108 | int anchorAlternation(int a, int b); | - |
1109 | int anchorConcatenation(int a, int b); | - |
1110 | #else | - |
1111 | int anchorAlternation(int a, int b) { return a & b; } | - |
1112 | int anchorConcatenation(int a, int b) { return a | b; } | - |
1113 | #endif | - |
1114 | void addAnchors(int from, int to, int a); | - |
1115 | - | |
1116 | #ifndef QT_NO_REGEXP_OPTIM | - |
1117 | void heuristicallyChooseHeuristic(); | - |
1118 | #endif | - |
1119 | - | |
1120 | #if defined(QT_DEBUG) | - |
1121 | void dump() const; | - |
1122 | #endif | - |
1123 | - | |
1124 | QAtomicInt ref; | - |
1125 | - | |
1126 | private: | - |
1127 | enum { CharClassBit = 0x10000, BackRefBit = 0x20000 }; | - |
1128 | enum { InitialState = 0, FinalState = 1 }; | - |
1129 | - | |
1130 | void setup(); | - |
1131 | int setupState(int match); | - |
1132 | - | |
1133 | /* | - |
1134 | Let's hope that 13 lookaheads and 14 back-references are | - |
1135 | enough. | - |
1136 | */ | - |
1137 | enum { MaxLookaheads = 13, MaxBackRefs = 14 }; | - |
1138 | enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004, | - |
1139 | Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010, | - |
1140 | Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads, | - |
1141 | Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1, | - |
1142 | Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs, | - |
1143 | - | |
1144 | Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^ | - |
1145 | ((Anchor_FirstLookahead << MaxLookaheads) - 1) }; | - |
1146 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1147 | int startAtom(bool officialCapture); | - |
1148 | void finishAtom(int atom, bool needCapture); | - |
1149 | #endif | - |
1150 | - | |
1151 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1152 | int addLookahead(QRegExpEngine *eng, bool negative); | - |
1153 | #endif | - |
1154 | - | |
1155 | #ifndef QT_NO_REGEXP_OPTIM | - |
1156 | bool goodStringMatch(QRegExpMatchState &matchState) const; | - |
1157 | bool badCharMatch(QRegExpMatchState &matchState) const; | - |
1158 | #else | - |
1159 | bool bruteMatch(QRegExpMatchState &matchState) const; | - |
1160 | #endif | - |
1161 | - | |
1162 | QVector<QRegExpAutomatonState> s; // array of states | - |
1163 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1164 | QVector<QRegExpAtom> f; // atom hierarchy | - |
1165 | int nf; // number of atoms | - |
1166 | int cf; // current atom | - |
1167 | QVector<int> captureForOfficialCapture; | - |
1168 | #endif | - |
1169 | int officialncap; // number of captures, seen from the outside | - |
1170 | int ncap; // number of captures, seen from the inside | - |
1171 | #ifndef QT_NO_REGEXP_CCLASS | - |
1172 | QVector<QRegExpCharClass> cl; // array of character classes | - |
1173 | #endif | - |
1174 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1175 | QVector<QRegExpLookahead *> ahead; // array of lookaheads | - |
1176 | #endif | - |
1177 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
1178 | QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors | - |
1179 | #endif | - |
1180 | #ifndef QT_NO_REGEXP_OPTIM | - |
1181 | bool caretAnchored; // does the regexp start with ^? | - |
1182 | bool trivial; // is the good-string all that needs to match? | - |
1183 | #endif | - |
1184 | bool valid; // is the regular expression valid? | - |
1185 | Qt::CaseSensitivity cs; // case sensitive? | - |
1186 | bool greedyQuantifiers; // RegExp2? | - |
1187 | bool xmlSchemaExtensions; | - |
1188 | #ifndef QT_NO_REGEXP_BACKREF | - |
1189 | int nbrefs; // number of back-references | - |
1190 | #endif | - |
1191 | - | |
1192 | #ifndef QT_NO_REGEXP_OPTIM | - |
1193 | bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch | - |
1194 | - | |
1195 | int goodEarlyStart; // the index where goodStr can first occur in a match | - |
1196 | int goodLateStart; // the index where goodStr can last occur in a match | - |
1197 | QString goodStr; // the string that any match has to contain | - |
1198 | - | |
1199 | int minl; // the minimum length of a match | - |
1200 | QVector<int> occ1; // first-occurrence array | - |
1201 | #endif | - |
1202 | - | |
1203 | /* | - |
1204 | The class Box is an abstraction for a regular expression | - |
1205 | fragment. It can also be seen as one node in the syntax tree of | - |
1206 | a regular expression with synthetized attributes. | - |
1207 | - | |
1208 | Its interface is ugly for performance reasons. | - |
1209 | */ | - |
1210 | class Box | - |
1211 | { | - |
1212 | public: | - |
1213 | Box(QRegExpEngine *engine); | - |
1214 | Box(const Box &b) { operator=(b); } | - |
1215 | - | |
1216 | Box &operator=(const Box &b); | - |
1217 | - | |
1218 | void clear() { operator=(Box(eng)); } | - |
1219 | void set(QChar ch); | - |
1220 | void set(const QRegExpCharClass &cc); | - |
1221 | #ifndef QT_NO_REGEXP_BACKREF | - |
1222 | void set(int bref); | - |
1223 | #endif | - |
1224 | - | |
1225 | void cat(const Box &b); | - |
1226 | void orx(const Box &b); | - |
1227 | void plus(int atom); | - |
1228 | void opt(); | - |
1229 | void catAnchor(int a); | - |
1230 | #ifndef QT_NO_REGEXP_OPTIM | - |
1231 | void setupHeuristics(); | - |
1232 | #endif | - |
1233 | - | |
1234 | #if defined(QT_DEBUG) | - |
1235 | void dump() const; | - |
1236 | #endif | - |
1237 | - | |
1238 | private: | - |
1239 | void addAnchorsToEngine(const Box &to) const; | - |
1240 | - | |
1241 | QRegExpEngine *eng; // the automaton under construction | - |
1242 | QVector<int> ls; // the left states (firstpos) | - |
1243 | QVector<int> rs; // the right states (lastpos) | - |
1244 | QMap<int, int> lanchors; // the left anchors | - |
1245 | QMap<int, int> ranchors; // the right anchors | - |
1246 | int skipanchors; // the anchors to match if the box is skipped | - |
1247 | - | |
1248 | #ifndef QT_NO_REGEXP_OPTIM | - |
1249 | int earlyStart; // the index where str can first occur | - |
1250 | int lateStart; // the index where str can last occur | - |
1251 | QString str; // a string that has to occur in any match | - |
1252 | QString leftStr; // a string occurring at the left of this box | - |
1253 | QString rightStr; // a string occurring at the right of this box | - |
1254 | int maxl; // the maximum length of this box (possibly InftyLen) | - |
1255 | #endif | - |
1256 | - | |
1257 | int minl; // the minimum length of this box | - |
1258 | #ifndef QT_NO_REGEXP_OPTIM | - |
1259 | QVector<int> occ1; // first-occurrence array | - |
1260 | #endif | - |
1261 | }; | - |
1262 | - | |
1263 | friend class Box; | - |
1264 | - | |
1265 | /* | - |
1266 | This is the lexical analyzer for regular expressions. | - |
1267 | */ | - |
1268 | enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead, | - |
1269 | Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar, | - |
1270 | Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 }; | - |
1271 | int getChar(); | - |
1272 | int getEscape(); | - |
1273 | #ifndef QT_NO_REGEXP_INTERVAL | - |
1274 | int getRep(int def); | - |
1275 | #endif | - |
1276 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1277 | void skipChars(int n); | - |
1278 | #endif | - |
1279 | void error(const char *msg); | - |
1280 | void startTokenizer(const QChar *rx, int len); | - |
1281 | int getToken(); | - |
1282 | - | |
1283 | const QChar *yyIn; // a pointer to the input regular expression pattern | - |
1284 | int yyPos0; // the position of yyTok in the input pattern | - |
1285 | int yyPos; // the position of the next character to read | - |
1286 | int yyLen; // the length of yyIn | - |
1287 | int yyCh; // the last character read | - |
1288 | QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens | - |
1289 | int yyMinRep; // attribute for Tok_Quantifier | - |
1290 | int yyMaxRep; // ditto | - |
1291 | QString yyError; // syntax error or overflow during parsing? | - |
1292 | - | |
1293 | /* | - |
1294 | This is the syntactic analyzer for regular expressions. | - |
1295 | */ | - |
1296 | int parse(const QChar *rx, int len); | - |
1297 | void parseAtom(Box *box); | - |
1298 | void parseFactor(Box *box); | - |
1299 | void parseTerm(Box *box); | - |
1300 | void parseExpression(Box *box); | - |
1301 | - | |
1302 | int yyTok; // the last token read | - |
1303 | bool yyMayCapture; // set this to false to disable capturing | - |
1304 | - | |
1305 | friend struct QRegExpMatchState; | - |
1306 | }; | - |
1307 | - | |
#ifndef QT_NO_REGEXP_LOOKAHEAD
/*
    The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,
    (?=foo) and (?!bar)).

    The lookahead owns its engine: the destructor deletes eng.

    NOTE(review): the struct is implicitly copyable, so a copy would
    delete the same engine twice.
*/
struct QRegExpLookahead
{
    QRegExpEngine *eng; // NFA representing the embedded regular expression
    bool neg; // negative lookahead?

    // Takes ownership of eng0.
    inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)
        : eng(eng0), neg(neg0) { }
    inline ~QRegExpLookahead() { delete eng; }
};
#endif
1323 | - | |
1324 | /*! | - |
1325 | \internal | - |
1326 | convert the pattern string to the RegExp syntax. | - |
1327 | - | |
1328 | This is also used by QScriptEngine::newRegExp to convert to a pattern that JavaScriptCore can understan | - |
1329 | */ | - |
1330 | Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax) | - |
1331 | { | - |
1332 | switch (patternSyntax) { | - |
1333 | #ifndef QT_NO_REGEXP_WILDCARD | - |
1334 | case QRegExp::Wildcard: | - |
1335 | return wc2rx(pattern, false); | - |
1336 | case QRegExp::WildcardUnix: | - |
1337 | return wc2rx(pattern, true); | - |
1338 | #endif | - |
1339 | case QRegExp::FixedString: | - |
1340 | return QRegExp::escape(pattern); | - |
1341 | case QRegExp::W3CXmlSchema11: | - |
1342 | default: | - |
1343 | return pattern; | - |
1344 | } | - |
1345 | } | - |
1346 | - | |
1347 | QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key) | - |
1348 | : cs(key.cs), greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2), | - |
1349 | xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11) | - |
1350 | { | - |
1351 | setup(); | - |
1352 | - | |
1353 | QString rx = qt_regexp_toCanonical(key.pattern, key.patternSyntax); | - |
1354 | - | |
1355 | valid = (parse(rx.unicode(), rx.length()) == rx.length()); | - |
1356 | if (!valid) { | - |
1357 | #ifndef QT_NO_REGEXP_OPTIM | - |
1358 | trivial = false; | - |
1359 | #endif | - |
1360 | error(RXERR_LEFTDELIM); | - |
1361 | } | - |
1362 | } | - |
1363 | - | |
1364 | QRegExpEngine::~QRegExpEngine() | - |
1365 | { | - |
1366 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1367 | qDeleteAll(ahead); | - |
1368 | #endif | - |
1369 | } | - |
1370 | - | |
1371 | void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng) | - |
1372 | { | - |
1373 | /* | - |
1374 | We use one QVector<int> for all the big data used a lot in | - |
1375 | matchHere() and friends. | - |
1376 | */ | - |
1377 | int ns = eng->s.size(); // number of states | - |
1378 | int ncap = eng->ncap; | - |
1379 | #ifndef QT_NO_REGEXP_OPTIM | - |
1380 | int newSlideTabSize = qMax(eng->minl + 1, 16); | - |
1381 | #else | - |
1382 | int newSlideTabSize = 0; | - |
1383 | #endif | - |
1384 | int numCaptures = eng->captureCount(); | - |
1385 | int newCapturedSize = 2 + 2 * numCaptures; | - |
1386 | bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * ns + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int))); | - |
1387 | - | |
1388 | // set all internal variables only _after_ bigArray is realloc'ed | - |
1389 | // to prevent a broken regexp in oom case | - |
1390 | - | |
1391 | slideTabSize = newSlideTabSize; | - |
1392 | capturedSize = newCapturedSize; | - |
1393 | inNextStack = bigArray; | - |
1394 | memset(inNextStack, -1, ns * sizeof(int)); | - |
1395 | curStack = inNextStack + ns; | - |
1396 | nextStack = inNextStack + 2 * ns; | - |
1397 | - | |
1398 | curCapBegin = inNextStack + 3 * ns; | - |
1399 | nextCapBegin = curCapBegin + ncap * ns; | - |
1400 | curCapEnd = curCapBegin + 2 * ncap * ns; | - |
1401 | nextCapEnd = curCapBegin + 3 * ncap * ns; | - |
1402 | - | |
1403 | tempCapBegin = curCapBegin + 4 * ncap * ns; | - |
1404 | tempCapEnd = tempCapBegin + ncap; | - |
1405 | capBegin = tempCapBegin + 2 * ncap; | - |
1406 | capEnd = tempCapBegin + 3 * ncap; | - |
1407 | - | |
1408 | slideTab = tempCapBegin + 4 * ncap; | - |
1409 | captured = slideTab + slideTabSize; | - |
1410 | memset(captured, -1, capturedSize*sizeof(int)); | - |
1411 | this->eng = eng; | - |
1412 | } | - |
1413 | - | |
1414 | /* | - |
1415 | Tries to match in str and returns an array of (begin, length) pairs | - |
1416 | for captured text. If there is no match, all pairs are (-1, -1). | - |
1417 | */ | - |
1418 | void QRegExpMatchState::match(const QChar *str0, int len0, int pos0, | - |
1419 | bool minimal0, bool oneTest, int caretIndex) | - |
1420 | { | - |
1421 | bool matched = false; | - |
1422 | QChar char_null; | - |
1423 | - | |
1424 | #ifndef QT_NO_REGEXP_OPTIM | - |
1425 | if (eng->trivial && !oneTest) { | - |
1426 | pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs); | - |
1427 | matchLen = eng->goodStr.length(); | - |
1428 | matched = (pos != -1); | - |
1429 | } else | - |
1430 | #endif | - |
1431 | { | - |
1432 | in = str0; | - |
1433 | if (in == 0) | - |
1434 | in = &char_null; | - |
1435 | pos = pos0; | - |
1436 | caretPos = caretIndex; | - |
1437 | len = len0; | - |
1438 | minimal = minimal0; | - |
1439 | matchLen = 0; | - |
1440 | oneTestMatchedLen = 0; | - |
1441 | - | |
1442 | if (eng->valid && pos >= 0 && pos <= len) { | - |
1443 | #ifndef QT_NO_REGEXP_OPTIM | - |
1444 | if (oneTest) { | - |
1445 | matched = matchHere(); | - |
1446 | } else { | - |
1447 | if (pos <= len - eng->minl) { | - |
1448 | if (eng->caretAnchored) { | - |
1449 | matched = matchHere(); | - |
1450 | } else if (eng->useGoodStringHeuristic) { | - |
1451 | matched = eng->goodStringMatch(*this); | - |
1452 | } else { | - |
1453 | matched = eng->badCharMatch(*this); | - |
1454 | } | - |
1455 | } | - |
1456 | } | - |
1457 | #else | - |
1458 | matched = oneTest ? matchHere() : eng->bruteMatch(*this); | - |
1459 | #endif | - |
1460 | } | - |
1461 | } | - |
1462 | - | |
1463 | if (matched) { | - |
1464 | int *c = captured; | - |
1465 | *c++ = pos; | - |
1466 | *c++ = matchLen; | - |
1467 | - | |
1468 | int numCaptures = (capturedSize - 2) >> 1; | - |
1469 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1470 | for (int i = 0; i < numCaptures; ++i) { | - |
1471 | int j = eng->captureForOfficialCapture.at(i); | - |
1472 | if (capBegin[j] != EmptyCapture) { | - |
1473 | int len = capEnd[j] - capBegin[j]; | - |
1474 | *c++ = (len > 0) ? pos + capBegin[j] : 0; | - |
1475 | *c++ = len; | - |
1476 | } else { | - |
1477 | *c++ = -1; | - |
1478 | *c++ = -1; | - |
1479 | } | - |
1480 | } | - |
1481 | #endif | - |
1482 | } else { | - |
1483 | // we rely on 2's complement here | - |
1484 | memset(captured, -1, capturedSize * sizeof(int)); | - |
1485 | } | - |
1486 | } | - |
1487 | - | |
1488 | /* | - |
1489 | The three following functions add one state to the automaton and | - |
1490 | return the number of the state. | - |
1491 | */ | - |
1492 | - | |
1493 | int QRegExpEngine::createState(QChar ch) | - |
1494 | { | - |
1495 | return setupState(ch.unicode()); | - |
1496 | } | - |
1497 | - | |
1498 | int QRegExpEngine::createState(const QRegExpCharClass &cc) | - |
1499 | { | - |
1500 | #ifndef QT_NO_REGEXP_CCLASS | - |
1501 | int n = cl.size(); | - |
1502 | cl += QRegExpCharClass(cc); | - |
1503 | return setupState(CharClassBit | n); | - |
1504 | #else | - |
1505 | Q_UNUSED(cc); | - |
1506 | return setupState(CharClassBit); | - |
1507 | #endif | - |
1508 | } | - |
1509 | - | |
1510 | #ifndef QT_NO_REGEXP_BACKREF | - |
1511 | int QRegExpEngine::createState(int bref) | - |
1512 | { | - |
1513 | if (bref > nbrefs) { | - |
1514 | nbrefs = bref; | - |
1515 | if (nbrefs > MaxBackRefs) { | - |
1516 | error(RXERR_LIMIT); | - |
1517 | return 0; | - |
1518 | } | - |
1519 | } | - |
1520 | return setupState(BackRefBit | bref); | - |
1521 | } | - |
1522 | #endif | - |
1523 | - | |
1524 | /* | - |
1525 | The two following functions add a transition between all pairs of | - |
1526 | states (i, j) where i is found in from, and j is found in to. | - |
1527 | - | |
1528 | Cat-transitions are distinguished from plus-transitions for | - |
1529 | capturing. | - |
1530 | */ | - |
1531 | - | |
1532 | void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to) | - |
1533 | { | - |
1534 | for (int i = 0; i < from.size(); i++) | - |
1535 | mergeInto(&s[from.at(i)].outs, to); | - |
1536 | } | - |
1537 | - | |
1538 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1539 | void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom) | - |
1540 | { | - |
1541 | for (int i = 0; i < from.size(); i++) { | - |
1542 | QRegExpAutomatonState &st = s[from.at(i)]; | - |
1543 | const QVector<int> oldOuts = st.outs; | - |
1544 | mergeInto(&st.outs, to); | - |
1545 | if (f.at(atom).capture != QRegExpAtom::NoCapture) { | - |
1546 | for (int j = 0; j < to.size(); j++) { | - |
1547 | // ### st.reenter.contains(to.at(j)) check looks suspicious | - |
1548 | if (!st.reenter.contains(to.at(j)) && | - |
1549 | !std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j))) | - |
1550 | st.reenter.insert(to.at(j), atom); | - |
1551 | } | - |
1552 | } | - |
1553 | } | - |
1554 | } | - |
1555 | #endif | - |
1556 | - | |
1557 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
1558 | /* | - |
1559 | Returns an anchor that means a OR b. | - |
1560 | */ | - |
1561 | int QRegExpEngine::anchorAlternation(int a, int b) | - |
1562 | { | - |
1563 | if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0) | - |
1564 | return a & b; | - |
1565 | - | |
1566 | int n = aa.size(); | - |
1567 | #ifndef QT_NO_REGEXP_OPTIM | - |
1568 | if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b) | - |
1569 | return Anchor_Alternation | (n - 1); | - |
1570 | #endif | - |
1571 | - | |
1572 | QRegExpAnchorAlternation element = {a, b}; | - |
1573 | aa.append(element); | - |
1574 | return Anchor_Alternation | n; | - |
1575 | } | - |
1576 | - | |
1577 | /* | - |
1578 | Returns an anchor that means a AND b. | - |
1579 | */ | - |
1580 | int QRegExpEngine::anchorConcatenation(int a, int b) | - |
1581 | { | - |
1582 | if (((a | b) & Anchor_Alternation) == 0) | - |
1583 | return a | b; | - |
1584 | if ((b & Anchor_Alternation) != 0) | - |
1585 | qSwap(a, b); | - |
1586 | - | |
1587 | int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b); | - |
1588 | int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b); | - |
1589 | return anchorAlternation(aprime, bprime); | - |
1590 | } | - |
1591 | #endif | - |
1592 | - | |
1593 | /* | - |
1594 | Adds anchor a on a transition caracterised by its from state and | - |
1595 | its to state. | - |
1596 | */ | - |
1597 | void QRegExpEngine::addAnchors(int from, int to, int a) | - |
1598 | { | - |
1599 | QRegExpAutomatonState &st = s[from]; | - |
1600 | if (st.anchors.contains(to)) | - |
1601 | a = anchorAlternation(st.anchors.value(to), a); | - |
1602 | st.anchors.insert(to, a); | - |
1603 | } | - |
1604 | - | |
1605 | #ifndef QT_NO_REGEXP_OPTIM | - |
1606 | /* | - |
1607 | This function chooses between the good-string and the bad-character | - |
1608 | heuristics. It computes two scores and chooses the heuristic with | - |
1609 | the highest score. | - |
1610 | - | |
1611 | Here are some common-sense constraints on the scores that should be | - |
1612 | respected if the formulas are ever modified: (1) If goodStr is | - |
1613 | empty, the good-string heuristic scores 0. (2) If the regular | - |
1614 | expression is trivial, the good-string heuristic should be used. | - |
1615 | (3) If the search is case insensitive, the good-string heuristic | - |
1616 | should be used, unless it scores 0. (Case insensitivity turns all | - |
1617 | entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is | - |
1618 | big, the good-string heuristic should score less. | - |
1619 | */ | - |
1620 | void QRegExpEngine::heuristicallyChooseHeuristic() | - |
1621 | { | - |
1622 | if (minl == 0) { | - |
1623 | useGoodStringHeuristic = false; | - |
1624 | } else if (trivial) { | - |
1625 | useGoodStringHeuristic = true; | - |
1626 | } else { | - |
1627 | /* | - |
1628 | Magic formula: The good string has to constitute a good | - |
1629 | proportion of the minimum-length string, and appear at a | - |
1630 | more-or-less known index. | - |
1631 | */ | - |
1632 | int goodStringScore = (64 * goodStr.length() / minl) - | - |
1633 | (goodLateStart - goodEarlyStart); | - |
1634 | /* | - |
1635 | Less magic formula: We pick some characters at random, and | - |
1636 | check whether they are good or bad. | - |
1637 | */ | - |
1638 | int badCharScore = 0; | - |
1639 | int step = qMax(1, NumBadChars / 32); | - |
1640 | for (int i = 1; i < NumBadChars; i += step) { | - |
1641 | if (occ1.at(i) == NoOccurrence) | - |
1642 | badCharScore += minl; | - |
1643 | else | - |
1644 | badCharScore += occ1.at(i); | - |
1645 | } | - |
1646 | badCharScore /= minl; | - |
1647 | useGoodStringHeuristic = (goodStringScore > badCharScore); | - |
1648 | } | - |
1649 | } | - |
1650 | #endif | - |
1651 | - | |
1652 | #if defined(QT_DEBUG) | - |
1653 | void QRegExpEngine::dump() const | - |
1654 | { | - |
1655 | int i, j; | - |
1656 | qDebug("Case %ssensitive engine", cs ? "" : "in"); | - |
1657 | qDebug(" States"); | - |
1658 | for (i = 0; i < s.size(); i++) { | - |
1659 | qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : ""); | - |
1660 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1661 | if (nf > 0) | - |
1662 | qDebug(" in atom %d", s[i].atom); | - |
1663 | #endif | - |
1664 | int m = s[i].match; | - |
1665 | if ((m & CharClassBit) != 0) { | - |
1666 | qDebug(" match character class %d", m ^ CharClassBit); | - |
1667 | #ifndef QT_NO_REGEXP_CCLASS | - |
1668 | cl[m ^ CharClassBit].dump(); | - |
1669 | #else | - |
1670 | qDebug(" negative character class"); | - |
1671 | #endif | - |
1672 | } else if ((m & BackRefBit) != 0) { | - |
1673 | qDebug(" match back-reference %d", m ^ BackRefBit); | - |
1674 | } else if (m >= 0x20 && m <= 0x7e) { | - |
1675 | qDebug(" match 0x%.4x (%c)", m, m); | - |
1676 | } else { | - |
1677 | qDebug(" match 0x%.4x", m); | - |
1678 | } | - |
1679 | for (j = 0; j < s[i].outs.size(); j++) { | - |
1680 | int next = s[i].outs[j]; | - |
1681 | qDebug(" -> %d", next); | - |
1682 | if (s[i].reenter.contains(next)) | - |
1683 | qDebug(" [reenter %d]", s[i].reenter[next]); | - |
1684 | if (s[i].anchors.value(next) != 0) | - |
1685 | qDebug(" [anchors 0x%.8x]", s[i].anchors[next]); | - |
1686 | } | - |
1687 | } | - |
1688 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1689 | if (nf > 0) { | - |
1690 | qDebug(" Atom Parent Capture"); | - |
1691 | for (i = 0; i < nf; i++) { | - |
1692 | if (f[i].capture == QRegExpAtom::NoCapture) { | - |
1693 | qDebug(" %6d %6d nil", i, f[i].parent); | - |
1694 | } else { | - |
1695 | int cap = f[i].capture; | - |
1696 | bool official = captureForOfficialCapture.contains(cap); | - |
1697 | qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture, | - |
1698 | official ? "official" : ""); | - |
1699 | } | - |
1700 | } | - |
1701 | } | - |
1702 | #endif | - |
1703 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
1704 | for (i = 0; i < aa.size(); i++) | - |
1705 | qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b); | - |
1706 | #endif | - |
1707 | } | - |
1708 | #endif | - |
1709 | - | |
1710 | void QRegExpEngine::setup() | - |
1711 | { | - |
1712 | ref.store(1); | - |
1713 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1714 | f.resize(32); | - |
1715 | nf = 0; | - |
1716 | cf = -1; | - |
1717 | #endif | - |
1718 | officialncap = 0; | - |
1719 | ncap = 0; | - |
1720 | #ifndef QT_NO_REGEXP_OPTIM | - |
1721 | caretAnchored = true; | - |
1722 | trivial = true; | - |
1723 | #endif | - |
1724 | valid = false; | - |
1725 | #ifndef QT_NO_REGEXP_BACKREF | - |
1726 | nbrefs = 0; | - |
1727 | #endif | - |
1728 | #ifndef QT_NO_REGEXP_OPTIM | - |
1729 | useGoodStringHeuristic = true; | - |
1730 | minl = 0; | - |
1731 | occ1.fill(0, NumBadChars); | - |
1732 | #endif | - |
1733 | } | - |
1734 | - | |
1735 | int QRegExpEngine::setupState(int match) | - |
1736 | { | - |
1737 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1738 | s += QRegExpAutomatonState(cf, match); | - |
1739 | #else | - |
1740 | s += QRegExpAutomatonState(match); | - |
1741 | #endif | - |
1742 | return s.size() - 1; | - |
1743 | } | - |
1744 | - | |
1745 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1746 | /* | - |
1747 | Functions startAtom() and finishAtom() should be called to delimit | - |
1748 | atoms. When a state is created, it is assigned to the current atom. | - |
1749 | The information is later used for capturing. | - |
1750 | */ | - |
1751 | int QRegExpEngine::startAtom(bool officialCapture) | - |
1752 | { | - |
1753 | if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size()) | - |
1754 | f.resize((nf + 1) << 1); | - |
1755 | f[nf].parent = cf; | - |
1756 | cf = nf++; | - |
1757 | f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture; | - |
1758 | return cf; | - |
1759 | } | - |
1760 | - | |
1761 | void QRegExpEngine::finishAtom(int atom, bool needCapture) | - |
1762 | { | - |
1763 | if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture) | - |
1764 | f[atom].capture = QRegExpAtom::UnofficialCapture; | - |
1765 | cf = f.at(atom).parent; | - |
1766 | } | - |
1767 | #endif | - |
1768 | - | |
1769 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1770 | /* | - |
1771 | Creates a lookahead anchor. | - |
1772 | */ | - |
1773 | int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative) | - |
1774 | { | - |
1775 | int n = ahead.size(); | - |
1776 | if (n == MaxLookaheads) { | - |
1777 | error(RXERR_LIMIT); | - |
1778 | return 0; | - |
1779 | } | - |
1780 | ahead += new QRegExpLookahead(eng, negative); | - |
1781 | return Anchor_FirstLookahead << n; | - |
1782 | } | - |
1783 | #endif | - |
1784 | - | |
1785 | #ifndef QT_NO_REGEXP_CAPTURE | - |
/*
  We want the longest leftmost captures.

  Returns true if capture set 1 is better than capture set 2: the
  first capture slot in which they differ decides, an earlier begin
  winning over a later one and, for equal begins, a later end winning
  over an earlier one. Identical sets are not "better".
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int slot = 0; slot < ncap; ++slot) {
        // it has to start early...
        if (begin1[slot] != begin2[slot])
            return begin1[slot] < begin2[slot];
        // ...and end late
        if (end1[slot] != end2[slot])
            return end1[slot] > end2[slot];
    }
    return false;
}
1802 | #endif | - |
1803 | - | |
1804 | /* | - |
1805 | Returns \c true if anchor a matches at position pos + i in the input | - |
1806 | string, otherwise false. | - |
1807 | */ | - |
1808 | bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin) | - |
1809 | { | - |
1810 | int j; | - |
1811 | - | |
1812 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
1813 | if ((a & QRegExpEngine::Anchor_Alternation) != 0) | - |
1814 | return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin) | - |
1815 | || testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin); | - |
1816 | #endif | - |
1817 | - | |
1818 | if ((a & QRegExpEngine::Anchor_Caret) != 0) { | - |
1819 | if (pos + i != caretPos) | - |
1820 | return false; | - |
1821 | } | - |
1822 | if ((a & QRegExpEngine::Anchor_Dollar) != 0) { | - |
1823 | if (pos + i != len) | - |
1824 | return false; | - |
1825 | } | - |
1826 | #ifndef QT_NO_REGEXP_ESCAPE | - |
1827 | if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) { | - |
1828 | bool before = false; | - |
1829 | bool after = false; | - |
1830 | if (pos + i != 0) | - |
1831 | before = isWord(in[pos + i - 1]); | - |
1832 | if (pos + i != len) | - |
1833 | after = isWord(in[pos + i]); | - |
1834 | if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after)) | - |
1835 | return false; | - |
1836 | if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after)) | - |
1837 | return false; | - |
1838 | } | - |
1839 | #endif | - |
1840 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
1841 | if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) { | - |
1842 | const QVector<QRegExpLookahead *> &ahead = eng->ahead; | - |
1843 | for (j = 0; j < ahead.size(); j++) { | - |
1844 | if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) { | - |
1845 | QRegExpMatchState matchState; | - |
1846 | matchState.prepareForMatch(ahead[j]->eng); | - |
1847 | matchState.match(in + pos + i, len - pos - i, 0, | - |
1848 | true, true, caretPos - pos - i); | - |
1849 | if ((matchState.captured[0] == 0) == ahead[j]->neg) | - |
1850 | return false; | - |
1851 | } | - |
1852 | } | - |
1853 | } | - |
1854 | #endif | - |
1855 | #ifndef QT_NO_REGEXP_CAPTURE | - |
1856 | #ifndef QT_NO_REGEXP_BACKREF | - |
1857 | for (j = 0; j < eng->nbrefs; j++) { | - |
1858 | if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) { | - |
1859 | int i = eng->captureForOfficialCapture.at(j); | - |
1860 | if (capBegin[i] != EmptyCapture) | - |
1861 | return false; | - |
1862 | } | - |
1863 | } | - |
1864 | #endif | - |
1865 | #endif | - |
1866 | return true; | - |
1867 | } | - |
1868 | - | |
1869 | #ifndef QT_NO_REGEXP_OPTIM | - |
1870 | /* | - |
1871 | The three following functions are what Jeffrey Friedl would call | - |
1872 | transmissions (or bump-alongs). Using one or the other should make | - |
1873 | no difference except in performance. | - |
1874 | */ | - |
1875 | - | |
1876 | bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const | - |
1877 | { | - |
1878 | int k = matchState.pos + goodEarlyStart; | - |
1879 | QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs); | - |
1880 | while ((k = matcher.indexIn(matchState.in, matchState.len, k)) != -1) { | - |
1881 | int from = k - goodLateStart; | - |
1882 | int to = k - goodEarlyStart; | - |
1883 | if (from > matchState.pos) | - |
1884 | matchState.pos = from; | - |
1885 | - | |
1886 | while (matchState.pos <= to) { | - |
1887 | if (matchState.matchHere()) | - |
1888 | return true; | - |
1889 | ++matchState.pos; | - |
1890 | } | - |
1891 | ++k; | - |
1892 | } | - |
1893 | return false; | - |
1894 | } | - |
1895 | - | |
/*
  Bump-along using the bad-character heuristic. Walks matchState.pos
  from its current value up to len - minl and calls matchHere() only
  at positions whose slide table entry is 0; returns true at the first
  successful matchHere(), false if none succeeds.

  matchState.slideTab is used as a circular buffer of slideTabSize
  entries: slideHead is the slot for the current position, slideNext
  the slot for the following one.
*/
bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
{
    int slideHead = 0;
    int slideNext = 0;
    int i;
    int lastPos = matchState.len - minl; // last position that can still hold minl characters
    memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));

    /*
      Set up the slide table, used for the bad-character heuristic,
      using the table of first occurrence of each character.
    */
    for (i = 0; i < minl; i++) {
        int sk = occ1[BadChar(matchState.in[matchState.pos + i])];
        if (sk == NoOccurrence)
            sk = i + 1;
        if (sk > 0) {
            int k = i + 1 - sk;
            if (k < 0) {
                // clamp to the first slot
                sk = i + 1;
                k = 0;
            }
            // keep the largest slide recorded for a slot
            if (sk > matchState.slideTab[k])
                matchState.slideTab[k] = sk;
        }
    }

    if (matchState.pos > lastPos)
        return false;

    for (;;) {
        // advance slideNext with wrap-around
        if (++slideNext >= matchState.slideTabSize)
            slideNext = 0;
        if (matchState.slideTab[slideHead] > 0) {
            // this position is skipped; carry the remaining slide
            // (one less) over to the next slot and clear this one
            if (matchState.slideTab[slideHead] - 1 > matchState.slideTab[slideNext])
                matchState.slideTab[slideNext] = matchState.slideTab[slideHead] - 1;
            matchState.slideTab[slideHead] = 0;
        } else {
            if (matchState.matchHere())
                return true;
        }

        if (matchState.pos == lastPos)
            break;

        /*
          Update the slide table. This code has much in common with
          the initialization code.
        */
        int sk = occ1[BadChar(matchState.in[matchState.pos + minl])];
        if (sk == NoOccurrence) {
            matchState.slideTab[slideNext] = minl;
        } else if (sk > 0) {
            int k = slideNext + minl - sk;
            // wrap the slot index around the circular table
            if (k >= matchState.slideTabSize)
                k -= matchState.slideTabSize;
            if (sk > matchState.slideTab[k])
                matchState.slideTab[k] = sk;
        }
        slideHead = slideNext;
        ++matchState.pos;
    }
    return false;
}
1960 | #else | - |
1961 | bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const | - |
1962 | { | - |
1963 | while (matchState.pos <= matchState.len) { | - |
1964 | if (matchState.matchHere()) | - |
1965 | return true; | - |
1966 | ++matchState.pos; | - |
1967 | } | - |
1968 | return false; | - |
1969 | } | - |
1970 | #endif | - |
1971 | - | |
/*
  Here's the core of the engine. It tries to do a match here and now.

  The automaton is simulated starting at input position pos. For each
  offset i, curStack holds the ncur states that are currently active
  and nextStack collects the nnext states reached by following a
  transition on the character at pos + i; the two are swapped at the
  end of every step. inNextStack[state] is the index of state within
  nextStack, or -1 if it is not there.

  On return matchLen is the length recorded the last time the final
  state was reached (-1 if it never was) and oneTestMatchedLen is
  i - 1 for the final value of i. Returns true if matchLen >= 0.
*/
bool QRegExpMatchState::matchHere()
{
    int ncur = 1, nnext = 0;
    int i = 0, j, k, m;
    bool stop = false;

    matchLen = -1;
    oneTestMatchedLen = -1;
    curStack[0] = QRegExpEngine::InitialState;

    int ncap = eng->ncap;
#ifndef QT_NO_REGEXP_CAPTURE
    // the initial state starts with all capture zones empty
    if (ncap > 0) {
        for (j = 0; j < ncap; j++) {
            curCapBegin[j] = EmptyCapture;
            curCapEnd[j] = EmptyCapture;
        }
    }
#endif

#ifndef QT_NO_REGEXP_BACKREF
    while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)
#else
    while (ncur > 0 && i <= len - pos && !stop)
#endif
    {
        // 0 stands in for the character once the end of the input is reached
        int ch = (i < len - pos) ? in[pos + i].unicode() : 0;
        for (j = 0; j < ncur; j++) {
            int cur = curStack[j];
            const QRegExpAutomatonState &scur = eng->s.at(cur);
            const QVector<int> &outs = scur.outs;
            for (k = 0; k < outs.size(); k++) {
                int next = outs.at(k);
                const QRegExpAutomatonState &snext = eng->s.at(next);
                bool inside = true;
#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
                int needSomeSleep = 0;
#endif

                /*
                  First, check if the anchors are anchored properly.
                */
                int a = scur.anchors.value(next);
                if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))
                    inside = false;

                /*
                  If indeed they are, check if the input character is
                  correct for this transition.
                */
                if (inside) {
                    m = snext.match;
                    if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {
                        // plain character state
                        if (eng->cs)
                            inside = (m == ch);
                        else
                            inside = (QChar(m).toLower() == QChar(ch).toLower());
                    } else if (next == QRegExpEngine::FinalState) {
                        // with minimal matching, the first time the
                        // final state is reached ends the search
                        matchLen = i;
                        stop = minimal;
                        inside = true;
                    } else if ((m & QRegExpEngine::CharClassBit) != 0) {
#ifndef QT_NO_REGEXP_CCLASS
                        const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);
                        if (eng->cs)
                            inside = cc.in(ch);
                        else if (cc.negative())
                            inside = cc.in(QChar(ch).toLower()) &&
                                     cc.in(QChar(ch).toUpper());
                        else
                            inside = cc.in(QChar(ch).toLower()) ||
                                     cc.in(QChar(ch).toUpper());
#endif
#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)
                    } else { /* ((m & QRegExpEngine::BackRefBit) != 0) */
                        int bref = m ^ QRegExpEngine::BackRefBit;
                        int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);

                        inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;
                        if (inside) {
                            // compare the first character of the captured text
                            if (eng->cs)
                                inside = (in[pos + curCapBegin[ell]] == QChar(ch));
                            else
                                inside = (in[pos + curCapBegin[ell]].toLower()
                                       == QChar(ch).toLower());
                        }

                        if (inside) {
                            // delta is the length of the captured text; a
                            // capture that is still open ends at i
                            int delta;
                            if (curCapEnd[ell] == EmptyCapture)
                                delta = i - curCapBegin[ell];
                            else
                                delta = curCapEnd[ell] - curCapBegin[ell];

                            inside = (delta <= len - (pos + i));
                            if (inside && delta > 1) {
                                // compare the remaining characters
                                int n = 1;
                                if (eng->cs) {
                                    while (n < delta) {
                                        if (in[pos + curCapBegin[ell] + n]
                                            != in[pos + i + n])
                                            break;
                                        ++n;
                                    }
                                } else {
                                    while (n < delta) {
                                        QChar a = in[pos + curCapBegin[ell] + n];
                                        QChar b = in[pos + i + n];
                                        if (a.toLower() != b.toLower())
                                            break;
                                        ++n;
                                    }
                                }
                                inside = (n == delta);
                                // the state sleeps while the rest of the
                                // back-referenced text goes by
                                if (inside)
                                    needSomeSleep = delta - 1;
                            }
                        }
#endif
                    }
                }

                /*
                  We must now update our data structures.
                */
                if (inside) {
#ifndef QT_NO_REGEXP_CAPTURE
                    // NOTE: these locals hide the capBegin/capEnd members
                    // for the rest of this scope
                    int *capBegin, *capEnd;
#endif
                    /*
                      If the next state was not encountered yet, all
                      is fine.
                    */
                    if ((m = inNextStack[next]) == -1) {
                        m = nnext++;
                        nextStack[m] = next;
                        inNextStack[next] = m;
#ifndef QT_NO_REGEXP_CAPTURE
                        capBegin = nextCapBegin + m * ncap;
                        capEnd = nextCapEnd + m * ncap;

                    /*
                      Otherwise, we'll first maintain captures in
                      temporary arrays, and decide at the end whether
                      it's best to keep the previous capture zones or
                      the new ones.
                    */
                    } else {
                        capBegin = tempCapBegin;
                        capEnd = tempCapEnd;
#endif
                    }

#ifndef QT_NO_REGEXP_CAPTURE
                    /*
                      Updating the capture zones is much of a task.
                    */
                    if (ncap > 0) {
                        memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));
                        memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));
                        int c = scur.atom, n = snext.atom;
                        int p = -1, q = -1;
                        int cap;

                        /*
                          Lemma 1. For any x in the range [0..nf), we
                          have f[x].parent < x.

                          Proof. By looking at startAtom(), it is
                          clear that cf < nf holds all the time, and
                          thus that f[nf].parent < nf.
                        */

                        /*
                          If we are reentering an atom, we empty all
                          capture zones inside it.
                        */
                        if ((q = scur.reenter.value(next)) != 0) {
                            // b marks q and, thanks to Lemma 1, every
                            // descendant of q in one forward pass
                            QBitArray b(eng->nf, false);
                            b.setBit(q, true);
                            for (int ell = q + 1; ell < eng->nf; ell++) {
                                if (b.testBit(eng->f.at(ell).parent)) {
                                    b.setBit(ell, true);
                                    cap = eng->f.at(ell).capture;
                                    if (cap >= 0) {
                                        capBegin[cap] = EmptyCapture;
                                        capEnd[cap] = EmptyCapture;
                                    }
                                }
                            }
                            p = eng->f.at(q).parent;

                        /*
                          Otherwise, close the capture zones we are
                          leaving. We are leaving f[c].capture,
                          f[f[c].parent].capture,
                          f[f[f[c].parent].parent].capture, ...,
                          until f[x].capture, with x such that
                          f[x].parent is the youngest common ancestor
                          for c and n.

                          We go up along c's and n's ancestry until
                          we find x.
                        */
                        } else {
                            p = c;
                            q = n;
                            while (p != q) {
                                if (p > q) {
                                    cap = eng->f.at(p).capture;
                                    if (cap >= 0) {
                                        // a zone opened and closed at the
                                        // same offset is reset to empty
                                        if (capBegin[cap] == i) {
                                            capBegin[cap] = EmptyCapture;
                                            capEnd[cap] = EmptyCapture;
                                        } else {
                                            capEnd[cap] = i;
                                        }
                                    }
                                    p = eng->f.at(p).parent;
                                } else {
                                    q = eng->f.at(q).parent;
                                }
                            }
                        }

                        /*
                          In any case, we now open the capture zones
                          we are entering. We work upwards from n
                          until we reach p (the parent of the atom we
                          reenter or the youngest common ancestor).
                        */
                        while (n > p) {
                            cap = eng->f.at(n).capture;
                            if (cap >= 0) {
                                capBegin[cap] = i;
                                capEnd[cap] = EmptyCapture;
                            }
                            n = eng->f.at(n).parent;
                        }
                        /*
                          If the next state was already in
                          nextStack, we must choose carefully which
                          capture zones we want to keep.
                        */
                        if (capBegin == tempCapBegin &&
                                isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,
                                                nextCapEnd + m * ncap)) {
                            memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
                            memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
                        }
                    }
#ifndef QT_NO_REGEXP_BACKREF
                    /*
                      We are done with updating the capture zones.
                      It's now time to put the next state to sleep,
                      if it needs to, and to remove it from
                      nextStack.
                    */
                    if (needSomeSleep > 0) {
                        // sleeper record: [wake-up offset, state,
                        // ncap capture begins, ncap capture ends]
                        QVector<int> zzZ(2 + 2 * ncap);
                        zzZ[0] = i + needSomeSleep;
                        zzZ[1] = next;
                        if (ncap > 0) {
                            memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));
                            memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));
                        }
                        inNextStack[nextStack[--nnext]] = -1;
                        sleeping.append(zzZ);
                    }
#endif
#endif
                }
            }
        }
#ifndef QT_NO_REGEXP_CAPTURE
        /*
          If we reached the final state, hurray! Copy the captured
          zone.
        */
        // capBegin and capEnd are the members again at this point
        if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {
            memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));
            memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));
        }
#ifndef QT_NO_REGEXP_BACKREF
        /*
          It's time to wake up the sleepers.
        */
        j = 0;
        while (j < sleeping.count()) {
            if (sleeping.at(j)[0] == i) {
                const QVector<int> &zzZ = sleeping.at(j);
                int next = zzZ[1];
                const int *capBegin = zzZ.data() + 2;
                const int *capEnd = zzZ.data() + 2 + ncap;
                bool copyOver = true;

                if ((m = inNextStack[next]) == -1) {
                    m = nnext++;
                    nextStack[m] = next;
                    inNextStack[next] = m;
                } else {
                    copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,
                                               capBegin, capEnd);
                }
                if (copyOver) {
                    memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));
                    memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));
                }

                // j is not advanced: removeAt() moves the next sleeper
                // into slot j
                sleeping.removeAt(j);
            } else {
                ++j;
            }
        }
#endif
#endif
        // reset the membership table for the next step
        for (j = 0; j < nnext; j++)
            inNextStack[nextStack[j]] = -1;

        // avoid needless iteration that confuses oneTestMatchedLen
        if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState
#ifndef QT_NO_REGEXP_BACKREF
             && sleeping.isEmpty()
#endif
           )
            stop = true;

        // the states just collected become the active set
        qSwap(curStack, nextStack);
#ifndef QT_NO_REGEXP_CAPTURE
        qSwap(curCapBegin, nextCapBegin);
        qSwap(curCapEnd, nextCapEnd);
#endif
        ncur = nnext;
        nnext = 0;
        ++i;
    }

#ifndef QT_NO_REGEXP_BACKREF
    /*
      If minimal matching is enabled, we might have some sleepers
      left.
    */
    if (!sleeping.isEmpty())
        sleeping.clear();
#endif

    oneTestMatchedLen = i - 1;
    return (matchLen >= 0);
}
2324 | - | |
2325 | #ifndef QT_NO_REGEXP_CCLASS | - |
2326 | - | |
/*
  Constructs a character class with no category bits (c == 0) that is
  not negated (n == false). When QT_NO_REGEXP_OPTIM is not defined,
  occ1 is sized to NumBadChars entries, each set to NoOccurrence.
*/
QRegExpCharClass::QRegExpCharClass()
    : c(0), n(false)
{
#ifndef QT_NO_REGEXP_OPTIM
    occ1.fill(NoOccurrence, NumBadChars);
#endif
}
2334 | - | |
2335 | void QRegExpCharClass::clear() | - |
2336 | { | - |
2337 | c = 0; | - |
2338 | r.resize(0);clear(); | - |
2339 | n = false; | - |
2340 | } executed 34392 times by 116 tests: end of block Executed by:
| 34392 |
2341 | - | |
2342 | void QRegExpCharClass::setNegative(bool negative) | - |
2343 | { | - |
2344 | n = negative; | - |
2345 | #ifndef QT_NO_REGEXP_OPTIM | - |
2346 | occ1.fill(0, NumBadChars); | - |
2347 | #endif | - |
2348 | } | - |
2349 | - | |
2350 | void QRegExpCharClass::addCategories(uint cats) | - |
2351 | { | - |
2352 | static const int all_cats = FLAG(QChar::Mark_NonSpacing) | | - |
2353 | FLAG(QChar::Mark_SpacingCombining) | | - |
2354 | FLAG(QChar::Mark_Enclosing) | | - |
2355 | FLAG(QChar::Number_DecimalDigit) | | - |
2356 | FLAG(QChar::Number_Letter) | | - |
2357 | FLAG(QChar::Number_Other) | | - |
2358 | FLAG(QChar::Separator_Space) | | - |
2359 | FLAG(QChar::Separator_Line) | | - |
2360 | FLAG(QChar::Separator_Paragraph) | | - |
2361 | FLAG(QChar::Other_Control) | | - |
2362 | FLAG(QChar::Other_Format) | | - |
2363 | FLAG(QChar::Other_Surrogate) | | - |
2364 | FLAG(QChar::Other_PrivateUse) | | - |
2365 | FLAG(QChar::Other_NotAssigned) | | - |
2366 | FLAG(QChar::Letter_Uppercase) | | - |
2367 | FLAG(QChar::Letter_Lowercase) | | - |
2368 | FLAG(QChar::Letter_Titlecase) | | - |
2369 | FLAG(QChar::Letter_Modifier) | | - |
2370 | FLAG(QChar::Letter_Other) | | - |
2371 | FLAG(QChar::Punctuation_Connector) | | - |
2372 | FLAG(QChar::Punctuation_Dash) | | - |
2373 | FLAG(QChar::Punctuation_Open) | | - |
2374 | FLAG(QChar::Punctuation_Close) | | - |
2375 | FLAG(QChar::Punctuation_InitialQuote) | | - |
2376 | FLAG(QChar::Punctuation_FinalQuote) | | - |
2377 | FLAG(QChar::Punctuation_Other) | | - |
2378 | FLAG(QChar::Symbol_Math) | | - |
2379 | FLAG(QChar::Symbol_Currency) | | - |
2380 | FLAG(QChar::Symbol_Modifier) | | - |
2381 | FLAG(QChar::Symbol_Other); | - |
2382 | c |= (all_cats & cats); | - |
2383 | #ifndef QT_NO_REGEXP_OPTIM | - |
2384 | occ1.fill(0, NumBadChars); | - |
2385 | #endif | - |
2386 | } | - |
2387 | - | |
2388 | void QRegExpCharClass::addRange(ushort from, ushort to) | - |
2389 | { | - |
2390 | if (from > to) | - |
2391 | qSwap(from, to); | - |
2392 | int m = r.size(); | - |
2393 | r.resize(m + 1); | - |
2394 | r[m].from = from; | - |
2395 | r[m].len = to - from + 1; | - |
2396 | - | |
2397 | #ifndef QT_NO_REGEXP_OPTIM | - |
2398 | int i; | - |
2399 | - | |
2400 | if (to - from < NumBadChars) { | - |
2401 | if (from % NumBadChars <= to % NumBadChars) { | - |
2402 | for (i = from % NumBadChars; i <= to % NumBadChars; i++) | - |
2403 | occ1[i] = 0; | - |
2404 | } else { | - |
2405 | for (i = 0; i <= to % NumBadChars; i++) | - |
2406 | occ1[i] = 0; | - |
2407 | for (i = from % NumBadChars; i < NumBadChars; i++) | - |
2408 | occ1[i] = 0; | - |
2409 | } | - |
2410 | } else { | - |
2411 | occ1.fill(0, NumBadChars); | - |
2412 | } | - |
2413 | #endif | - |
2414 | } | - |
2415 | - | |
2416 | bool QRegExpCharClass::in(QChar ch) const | - |
2417 | { | - |
2418 | #ifndef QT_NO_REGEXP_OPTIM | - |
2419 | if (occ1.at(BadChar(ch)) == NoOccurrence) | - |
2420 | return n; | - |
2421 | #endif | - |
2422 | - | |
2423 | if (c != 0 && (c & FLAG(ch.category())) != 0) | - |
2424 | return !n; | - |
2425 | - | |
2426 | const int uc = ch.unicode(); | - |
2427 | int size = r.size(); | - |
2428 | - | |
2429 | for (int i = 0; i < size; ++i) { | - |
2430 | const QRegExpCharClassRange &range = r.at(i); | - |
2431 | if (uint(uc - range.from) < uint(r.at(i).len)) | - |
2432 | return !n; | - |
2433 | } | - |
2434 | return n; | - |
2435 | } | - |
2436 | - | |
2437 | #if defined(QT_DEBUG) | - |
2438 | void QRegExpCharClass::dump() const | - |
2439 | { | - |
2440 | int i; | - |
2441 | qDebug(" %stive character class", n ? "nega" : "posi"); | - |
2442 | #ifndef QT_NO_REGEXP_CCLASS | - |
2443 | if (c != 0) | - |
2444 | qDebug(" categories 0x%.8x", c); | - |
2445 | #endif | - |
2446 | for (i = 0; i < r.size(); i++) | - |
2447 | qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1); | - |
2448 | } | - |
2449 | #endif | - |
2450 | #endif | - |
2451 | - | |
2452 | QRegExpEngine::Box::Box(QRegExpEngine *engine) | - |
2453 | : eng(engine), skipanchors(0) | - |
2454 | #ifndef QT_NO_REGEXP_OPTIM | - |
2455 | , earlyStart(0), lateStart(0), maxl(0) | - |
2456 | #endif | - |
2457 | { | - |
2458 | #ifndef QT_NO_REGEXP_OPTIM | - |
2459 | occ1.fill(NoOccurrence, NumBadChars); | - |
2460 | #endif | - |
2461 | minl = 0; | - |
2462 | } | - |
2463 | - | |
2464 | QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b) | - |
2465 | { | - |
2466 | eng = b.eng; | - |
2467 | ls = b.ls; | - |
2468 | rs = b.rs; | - |
2469 | lanchors = b.lanchors; | - |
2470 | ranchors = b.ranchors; | - |
2471 | skipanchors = b.skipanchors; | - |
2472 | #ifndef QT_NO_REGEXP_OPTIM | - |
2473 | earlyStart = b.earlyStart; | - |
2474 | lateStart = b.lateStart; | - |
2475 | str = b.str; | - |
2476 | leftStr = b.leftStr; | - |
2477 | rightStr = b.rightStr; | - |
2478 | maxl = b.maxl; | - |
2479 | occ1 = b.occ1; | - |
2480 | #endif | - |
2481 | minl = b.minl; | - |
2482 | return *this; | - |
2483 | } | - |
2484 | - | |
2485 | void QRegExpEngine::Box::set(QChar ch) | - |
2486 | { | - |
2487 | ls.resize(1); | - |
2488 | ls[0] = eng->createState(ch); | - |
2489 | rs = ls; | - |
2490 | #ifndef QT_NO_REGEXP_OPTIM | - |
2491 | str = ch; | - |
2492 | leftStr = ch; | - |
2493 | rightStr = ch; | - |
2494 | maxl = 1; | - |
2495 | occ1[BadChar(ch)] = 0; | - |
2496 | #endif | - |
2497 | minl = 1; | - |
2498 | } | - |
2499 | - | |
2500 | void QRegExpEngine::Box::set(const QRegExpCharClass &cc) | - |
2501 | { | - |
2502 | ls.resize(1); | - |
2503 | ls[0] = eng->createState(cc); | - |
2504 | rs = ls; | - |
2505 | #ifndef QT_NO_REGEXP_OPTIM | - |
2506 | maxl = 1; | - |
2507 | occ1 = cc.firstOccurrence(); | - |
2508 | #endif | - |
2509 | minl = 1; | - |
2510 | } | - |
2511 | - | |
2512 | #ifndef QT_NO_REGEXP_BACKREF | - |
2513 | void QRegExpEngine::Box::set(int bref) | - |
2514 | { | - |
2515 | ls.resize(1); | - |
2516 | ls[0] = eng->createState(bref); | - |
2517 | rs = ls; | - |
2518 | if (bref >= 1 && bref <= MaxBackRefs) | - |
2519 | skipanchors = Anchor_BackRef0Empty << bref; | - |
2520 | #ifndef QT_NO_REGEXP_OPTIM | - |
2521 | maxl = InftyLen; | - |
2522 | #endif | - |
2523 | minl = 0; | - |
2524 | } | - |
2525 | #endif | - |
2526 | - | |
2527 | void QRegExpEngine::Box::cat(const Box &b) | - |
2528 | { | - |
2529 | eng->addCatTransitions(rs, b.ls); | - |
2530 | addAnchorsToEngine(b); | - |
2531 | if (minl == 0) { | - |
2532 | lanchors.unite(b.lanchors); | - |
2533 | if (skipanchors != 0) { | - |
2534 | for (int i = 0; i < b.ls.size(); i++) { | - |
2535 | int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors); | - |
2536 | lanchors.insert(b.ls.at(i), a); | - |
2537 | } | - |
2538 | } | - |
2539 | mergeInto(&ls, b.ls); | - |
2540 | } | - |
2541 | if (b.minl == 0) { | - |
2542 | ranchors.unite(b.ranchors); | - |
2543 | if (b.skipanchors != 0) { | - |
2544 | for (int i = 0; i < rs.size(); i++) { | - |
2545 | int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors); | - |
2546 | ranchors.insert(rs.at(i), a); | - |
2547 | } | - |
2548 | } | - |
2549 | mergeInto(&rs, b.rs); | - |
2550 | } else { | - |
2551 | ranchors = b.ranchors; | - |
2552 | rs = b.rs; | - |
2553 | } | - |
2554 | - | |
2555 | #ifndef QT_NO_REGEXP_OPTIM | - |
2556 | if (maxl != InftyLen) { | - |
2557 | if (rightStr.length() + b.leftStr.length() > | - |
2558 | qMax(str.length(), b.str.length())) { | - |
2559 | earlyStart = minl - rightStr.length(); | - |
2560 | lateStart = maxl - rightStr.length(); | - |
2561 | str = rightStr + b.leftStr; | - |
2562 | } else if (b.str.length() > str.length()) { | - |
2563 | earlyStart = minl + b.earlyStart; | - |
2564 | lateStart = maxl + b.lateStart; | - |
2565 | str = b.str; | - |
2566 | } | - |
2567 | } | - |
2568 | - | |
2569 | if (leftStr.length() == maxl) | - |
2570 | leftStr += b.leftStr; | - |
2571 | - | |
2572 | if (b.rightStr.length() == b.maxl) { | - |
2573 | rightStr += b.rightStr; | - |
2574 | } else { | - |
2575 | rightStr = b.rightStr; | - |
2576 | } | - |
2577 | - | |
2578 | if (maxl == InftyLen || b.maxl == InftyLen) { | - |
2579 | maxl = InftyLen; | - |
2580 | } else { | - |
2581 | maxl += b.maxl; | - |
2582 | } | - |
2583 | - | |
2584 | for (int i = 0; i < NumBadChars; i++) { | - |
2585 | if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i)) | - |
2586 | occ1[i] = minl + b.occ1.at(i); | - |
2587 | } | - |
2588 | #endif | - |
2589 | - | |
2590 | minl += b.minl; | - |
2591 | if (minl == 0) | - |
2592 | skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors); | - |
2593 | else | - |
2594 | skipanchors = 0; | - |
2595 | } | - |
2596 | - | |
2597 | void QRegExpEngine::Box::orx(const Box &b) | - |
2598 | { | - |
2599 | mergeInto(&ls, b.ls); | - |
2600 | lanchors.unite(b.lanchors); | - |
2601 | mergeInto(&rs, b.rs); | - |
2602 | ranchors.unite(b.ranchors); | - |
2603 | - | |
2604 | if (b.minl == 0) { | - |
2605 | if (minl == 0) | - |
2606 | skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors); | - |
2607 | else | - |
2608 | skipanchors = b.skipanchors; | - |
2609 | } | - |
2610 | - | |
2611 | #ifndef QT_NO_REGEXP_OPTIM | - |
2612 | for (int i = 0; i < NumBadChars; i++) { | - |
2613 | if (occ1.at(i) > b.occ1.at(i)) | - |
2614 | occ1[i] = b.occ1.at(i); | - |
2615 | } | - |
2616 | earlyStart = 0; | - |
2617 | lateStart = 0; | - |
2618 | str = QString(); | - |
2619 | leftStr = QString(); | - |
2620 | rightStr = QString(); | - |
2621 | if (b.maxl > maxl) | - |
2622 | maxl = b.maxl; | - |
2623 | #endif | - |
2624 | if (b.minl < minl) | - |
2625 | minl = b.minl; | - |
2626 | } | - |
2627 | - | |
2628 | void QRegExpEngine::Box::plus(int atom) | - |
2629 | { | - |
2630 | #ifndef QT_NO_REGEXP_CAPTURE | - |
2631 | eng->addPlusTransitions(rs, ls, atom); | - |
2632 | #else | - |
2633 | Q_UNUSED(atom); | - |
2634 | eng->addCatTransitions(rs, ls); | - |
2635 | #endif | - |
2636 | addAnchorsToEngine(*this); | - |
2637 | #ifndef QT_NO_REGEXP_OPTIM | - |
2638 | maxl = InftyLen; | - |
2639 | #endif | - |
2640 | } | - |
2641 | - | |
2642 | void QRegExpEngine::Box::opt() | - |
2643 | { | - |
2644 | #ifndef QT_NO_REGEXP_OPTIM | - |
2645 | earlyStart = 0; | - |
2646 | lateStart = 0; | - |
2647 | str = QString(); | - |
2648 | leftStr = QString(); | - |
2649 | rightStr = QString(); | - |
2650 | #endif | - |
2651 | skipanchors = 0; | - |
2652 | minl = 0; | - |
2653 | } | - |
2654 | - | |
2655 | void QRegExpEngine::Box::catAnchor(int a) | - |
2656 | { | - |
2657 | if (a != 0) { | - |
2658 | for (int i = 0; i < rs.size(); i++) { | - |
2659 | a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a); | - |
2660 | ranchors.insert(rs.at(i), a); | - |
2661 | } | - |
2662 | if (minl == 0) | - |
2663 | skipanchors = eng->anchorConcatenation(skipanchors, a); | - |
2664 | } | - |
2665 | } | - |
2666 | - | |
2667 | #ifndef QT_NO_REGEXP_OPTIM | - |
2668 | void QRegExpEngine::Box::setupHeuristics() | - |
2669 | { | - |
2670 | eng->goodEarlyStart = earlyStart; | - |
2671 | eng->goodLateStart = lateStart; | - |
2672 | eng->goodStr = eng->cs ? str : str.toLower(); | - |
2673 | - | |
2674 | eng->minl = minl; | - |
2675 | if (eng->cs) { | - |
2676 | /* | - |
2677 | A regular expression such as 112|1 has occ1['2'] = 2 and minl = | - |
2678 | 1 at this point. An entry of occ1 has to be at most minl or | - |
2679 | infinity for the rest of the algorithm to go well. | - |
2680 | - | |
2681 | We waited until here before normalizing these cases (instead of | - |
2682 | doing it in Box::orx()) because sometimes things improve by | - |
2683 | themselves. Consider for example (112|1)34. | - |
2684 | */ | - |
2685 | for (int i = 0; i < NumBadChars; i++) { | - |
2686 | if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl) | - |
2687 | occ1[i] = minl; | - |
2688 | } | - |
2689 | eng->occ1 = occ1; | - |
2690 | } else { | - |
2691 | eng->occ1.fill(0, NumBadChars); | - |
2692 | } | - |
2693 | - | |
2694 | eng->heuristicallyChooseHeuristic(); | - |
2695 | } | - |
2696 | #endif | - |
2697 | - | |
2698 | #if defined(QT_DEBUG) | - |
2699 | void QRegExpEngine::Box::dump() const | - |
2700 | { | - |
2701 | int i; | - |
2702 | qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s"); | - |
2703 | qDebug(" Left states:"); | - |
2704 | for (i = 0; i < ls.size(); i++) { | - |
2705 | if (lanchors.value(ls[i], 0) == 0) | - |
2706 | qDebug(" %d", ls[i]); | - |
2707 | else | - |
2708 | qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]); | - |
2709 | } | - |
2710 | qDebug(" Right states:"); | - |
2711 | for (i = 0; i < rs.size(); i++) { | - |
2712 | if (ranchors.value(rs[i], 0) == 0) | - |
2713 | qDebug(" %d", rs[i]); | - |
2714 | else | - |
2715 | qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]); | - |
2716 | } | - |
2717 | qDebug(" Skip anchors: 0x%.8x", skipanchors); | - |
2718 | } | - |
2719 | #endif | - |
2720 | - | |
2721 | void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const | - |
2722 | { | - |
2723 | for (int i = 0; i < to.ls.size(); i++) { | - |
2724 | for (int j = 0; j < rs.size(); j++) { | - |
2725 | int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0), | - |
2726 | to.lanchors.value(to.ls.at(i), 0)); | - |
2727 | eng->addAnchors(rs[j], to.ls[i], a); | - |
2728 | } | - |
2729 | } | - |
2730 | } | - |
2731 | - | |
2732 | #ifndef QT_NO_REGEXP_CCLASS | - |
// Lookup table for the XML Schema extensions: maps a block name to a
// pair of code point values, first and second (presumably the first
// and last code point of the block, e.g. BasicLatin is 0x0000, 0x007F).
//
// The entries are sorted by name in ascending byte order (the order
// qstrcmp() yields) so that the table can be binary-searched. Keep
// that order when adding entries; names must fit in name[40]
// including the terminating '\0'.
static const struct CategoriesRangeMapEntry {
    const char name[40];
    uint first, second;
} categoriesRangeMap[] = {
    { "AegeanNumbers", 0x10100, 0x1013F },
    { "AlphabeticPresentationForms", 0xFB00, 0xFB4F },
    { "AncientGreekMusicalNotation", 0x1D200, 0x1D24F },
    { "AncientGreekNumbers", 0x10140, 0x1018F },
    { "Arabic", 0x0600, 0x06FF },
    { "ArabicPresentationForms-A", 0xFB50, 0xFDFF },
    { "ArabicPresentationForms-B", 0xFE70, 0xFEFF },
    { "ArabicSupplement", 0x0750, 0x077F },
    { "Armenian", 0x0530, 0x058F },
    { "Arrows", 0x2190, 0x21FF },
    { "BasicLatin", 0x0000, 0x007F },
    { "Bengali", 0x0980, 0x09FF },
    { "BlockElements", 0x2580, 0x259F },
    { "Bopomofo", 0x3100, 0x312F },
    { "BopomofoExtended", 0x31A0, 0x31BF },
    { "BoxDrawing", 0x2500, 0x257F },
    { "BraillePatterns", 0x2800, 0x28FF },
    { "Buginese", 0x1A00, 0x1A1F },
    { "Buhid", 0x1740, 0x175F },
    { "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF },
    { "CJKCompatibility", 0x3300, 0x33FF },
    { "CJKCompatibilityForms", 0xFE30, 0xFE4F },
    { "CJKCompatibilityIdeographs", 0xF900, 0xFAFF },
    { "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },
    { "CJKRadicalsSupplement", 0x2E80, 0x2EFF },
    { "CJKStrokes", 0x31C0, 0x31EF },
    { "CJKSymbolsandPunctuation", 0x3000, 0x303F },
    { "CJKUnifiedIdeographs", 0x4E00, 0x9FFF },
    { "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 },
    { "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF },
    { "Cherokee", 0x13A0, 0x13FF },
    { "CombiningDiacriticalMarks", 0x0300, 0x036F },
    { "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF },
    { "CombiningHalfMarks", 0xFE20, 0xFE2F },
    { "CombiningMarksforSymbols", 0x20D0, 0x20FF },
    { "ControlPictures", 0x2400, 0x243F },
    { "Coptic", 0x2C80, 0x2CFF },
    { "CurrencySymbols", 0x20A0, 0x20CF },
    { "CypriotSyllabary", 0x10800, 0x1083F },
    { "Cyrillic", 0x0400, 0x04FF },
    { "CyrillicSupplement", 0x0500, 0x052F },
    { "Deseret", 0x10400, 0x1044F },
    { "Devanagari", 0x0900, 0x097F },
    { "Dingbats", 0x2700, 0x27BF },
    { "EnclosedAlphanumerics", 0x2460, 0x24FF },
    { "EnclosedCJKLettersandMonths", 0x3200, 0x32FF },
    { "Ethiopic", 0x1200, 0x137F },
    { "EthiopicExtended", 0x2D80, 0x2DDF },
    { "EthiopicSupplement", 0x1380, 0x139F },
    { "GeneralPunctuation", 0x2000, 0x206F },
    { "GeometricShapes", 0x25A0, 0x25FF },
    { "Georgian", 0x10A0, 0x10FF },
    { "GeorgianSupplement", 0x2D00, 0x2D2F },
    { "Glagolitic", 0x2C00, 0x2C5F },
    { "Gothic", 0x10330, 0x1034F },
    { "Greek", 0x0370, 0x03FF },
    { "GreekExtended", 0x1F00, 0x1FFF },
    { "Gujarati", 0x0A80, 0x0AFF },
    { "Gurmukhi", 0x0A00, 0x0A7F },
    { "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF },
    { "HangulCompatibilityJamo", 0x3130, 0x318F },
    { "HangulJamo", 0x1100, 0x11FF },
    { "HangulSyllables", 0xAC00, 0xD7A3 },
    { "Hanunoo", 0x1720, 0x173F },
    { "Hebrew", 0x0590, 0x05FF },
    { "Hiragana", 0x3040, 0x309F },
    { "IPAExtensions", 0x0250, 0x02AF },
    { "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF },
    { "Kanbun", 0x3190, 0x319F },
    { "KangxiRadicals", 0x2F00, 0x2FDF },
    { "Kannada", 0x0C80, 0x0CFF },
    { "Katakana", 0x30A0, 0x30FF },
    { "KatakanaPhoneticExtensions", 0x31F0, 0x31FF },
    { "Kharoshthi", 0x10A00, 0x10A5F },
    { "Khmer", 0x1780, 0x17FF },
    { "KhmerSymbols", 0x19E0, 0x19FF },
    { "Lao", 0x0E80, 0x0EFF },
    { "Latin-1Supplement", 0x0080, 0x00FF },
    { "LatinExtended-A", 0x0100, 0x017F },
    { "LatinExtended-B", 0x0180, 0x024F },
    { "LatinExtendedAdditional", 0x1E00, 0x1EFF },
    { "LetterlikeSymbols", 0x2100, 0x214F },
    { "Limbu", 0x1900, 0x194F },
    { "LinearBIdeograms", 0x10080, 0x100FF },
    { "LinearBSyllabary", 0x10000, 0x1007F },
    { "Malayalam", 0x0D00, 0x0D7F },
    { "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF },
    { "MathematicalOperators", 0x2200, 0x22FF },
    { "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF },
    { "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF },
    { "MiscellaneousSymbols", 0x2600, 0x26FF },
    { "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF },
    { "MiscellaneousTechnical", 0x2300, 0x23FF },
    { "ModifierToneLetters", 0xA700, 0xA71F },
    { "Mongolian", 0x1800, 0x18AF },
    { "MusicalSymbols", 0x1D100, 0x1D1FF },
    { "Myanmar", 0x1000, 0x109F },
    { "NewTaiLue", 0x1980, 0x19DF },
    { "NumberForms", 0x2150, 0x218F },
    { "Ogham", 0x1680, 0x169F },
    { "OldItalic", 0x10300, 0x1032F },
    { "OldPersian", 0x103A0, 0x103DF },
    { "OpticalCharacterRecognition", 0x2440, 0x245F },
    { "Oriya", 0x0B00, 0x0B7F },
    { "Osmanya", 0x10480, 0x104AF },
    { "PhoneticExtensions", 0x1D00, 0x1D7F },
    { "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF },
    { "PrivateUse", 0xE000, 0xF8FF },
    { "Runic", 0x16A0, 0x16FF },
    { "Shavian", 0x10450, 0x1047F },
    { "Sinhala", 0x0D80, 0x0DFF },
    { "SmallFormVariants", 0xFE50, 0xFE6F },
    { "SpacingModifierLetters", 0x02B0, 0x02FF },
    { "Specials", 0xFFF0, 0xFFFF },
    { "SuperscriptsandSubscripts", 0x2070, 0x209F },
    { "SupplementalArrows-A", 0x27F0, 0x27FF },
    { "SupplementalArrows-B", 0x2900, 0x297F },
    { "SupplementalMathematicalOperators", 0x2A00, 0x2AFF },
    { "SupplementalPunctuation", 0x2E00, 0x2E7F },
    { "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF },
    { "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF },
    { "SylotiNagri", 0xA800, 0xA82F },
    { "Syriac", 0x0700, 0x074F },
    { "Tagalog", 0x1700, 0x171F },
    { "Tagbanwa", 0x1760, 0x177F },
    { "Tags", 0xE0000, 0xE007F },
    { "TaiLe", 0x1950, 0x197F },
    { "TaiXuanJingSymbols", 0x1D300, 0x1D35F },
    { "Tamil", 0x0B80, 0x0BFF },
    { "Telugu", 0x0C00, 0x0C7F },
    { "Thaana", 0x0780, 0x07BF },
    { "Thai", 0x0E00, 0x0E7F },
    { "Tibetan", 0x0F00, 0x0FFF },
    { "Tifinagh", 0x2D30, 0x2D7F },
    { "Ugaritic", 0x10380, 0x1039F },
    { "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F },
    { "VariationSelectors", 0xFE00, 0xFE0F },
    { "VariationSelectorsSupplement", 0xE0100, 0xE01EF },
    { "VerticalForms", 0xFE10, 0xFE1F },
    { "YiRadicals", 0xA490, 0xA4CF },
    { "YiSyllables", 0xA000, 0xA48F },
    { "YijingHexagramSymbols", 0x4DC0, 0x4DFF }
};
2882 | - | |
2883 | inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2) | - |
2884 | { return qstrcmp(entry1.name, entry2.name) < 0; } | - |
2885 | inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry) | - |
2886 | { return qstrcmp(name, entry.name) < 0; } | - |
2887 | inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name) | - |
2888 | { return qstrcmp(entry.name, name) < 0; } | - |
2889 | #endif // QT_NO_REGEXP_CCLASS | - |
2890 | - | |
2891 | int QRegExpEngine::getChar() | - |
2892 | { | - |
2893 | return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode(); | - |
2894 | } | - |
2895 | - | |
2896 | int QRegExpEngine::getEscape() | - |
2897 | { | - |
2898 | #ifndef QT_NO_REGEXP_ESCAPE | - |
2899 | const char tab[] = "afnrtv"; // no b, as \b means word boundary | - |
2900 | const char backTab[] = "\a\f\n\r\t\v"; | - |
2901 | ushort low; | - |
2902 | int i; | - |
2903 | #endif | - |
2904 | ushort val; | - |
2905 | int prevCh = yyCh; | - |
2906 | - | |
2907 | if (prevCh == EOS) { | - |
2908 | error(RXERR_END); | - |
2909 | return Tok_Char | '\\'; | - |
2910 | } | - |
2911 | yyCh = getChar(); | - |
2912 | #ifndef QT_NO_REGEXP_ESCAPE | - |
2913 | if ((prevCh & ~0xff) == 0) { | - |
2914 | const char *p = strchr(tab, prevCh); | - |
2915 | if (p != 0) | - |
2916 | return Tok_Char | backTab[p - tab]; | - |
2917 | } | - |
2918 | #endif | - |
2919 | - | |
2920 | switch (prevCh) { | - |
2921 | #ifndef QT_NO_REGEXP_ESCAPE | - |
2922 | case '0': | - |
2923 | val = 0; | - |
2924 | for (i = 0; i < 3; i++) { | - |
2925 | if (yyCh >= '0' && yyCh <= '7') | - |
2926 | val = (val << 3) | (yyCh - '0'); | - |
2927 | else | - |
2928 | break; | - |
2929 | yyCh = getChar(); | - |
2930 | } | - |
2931 | if ((val & ~0377) != 0) | - |
2932 | error(RXERR_OCTAL); | - |
2933 | return Tok_Char | val; | - |
2934 | #endif | - |
2935 | #ifndef QT_NO_REGEXP_ESCAPE | - |
2936 | case 'B': | - |
2937 | return Tok_NonWord; | - |
2938 | #endif | - |
2939 | #ifndef QT_NO_REGEXP_CCLASS | - |
2940 | case 'D': | - |
2941 | // see QChar::isDigit() | - |
2942 | yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit)); | - |
2943 | return Tok_CharClass; | - |
2944 | case 'S': | - |
2945 | // see QChar::isSpace() | - |
2946 | yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) | | - |
2947 | FLAG(QChar::Separator_Line) | | - |
2948 | FLAG(QChar::Separator_Paragraph) | | - |
2949 | FLAG(QChar::Other_Control))); | - |
2950 | yyCharClass->addRange(0x0000, 0x0008); | - |
2951 | yyCharClass->addRange(0x000e, 0x001f); | - |
2952 | yyCharClass->addRange(0x007f, 0x0084); | - |
2953 | yyCharClass->addRange(0x0086, 0x009f); | - |
2954 | return Tok_CharClass; | - |
2955 | case 'W': | - |
2956 | // see QChar::isLetterOrNumber() and QChar::isMark() | - |
2957 | yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) | | - |
2958 | FLAG(QChar::Mark_SpacingCombining) | | - |
2959 | FLAG(QChar::Mark_Enclosing) | | - |
2960 | FLAG(QChar::Number_DecimalDigit) | | - |
2961 | FLAG(QChar::Number_Letter) | | - |
2962 | FLAG(QChar::Number_Other) | | - |
2963 | FLAG(QChar::Letter_Uppercase) | | - |
2964 | FLAG(QChar::Letter_Lowercase) | | - |
2965 | FLAG(QChar::Letter_Titlecase) | | - |
2966 | FLAG(QChar::Letter_Modifier) | | - |
2967 | FLAG(QChar::Letter_Other) | | - |
2968 | FLAG(QChar::Punctuation_Connector))); | - |
2969 | yyCharClass->addRange(0x203f, 0x2040); | - |
2970 | yyCharClass->addSingleton(0x2040); | - |
2971 | yyCharClass->addSingleton(0x2054); | - |
2972 | yyCharClass->addSingleton(0x30fb); | - |
2973 | yyCharClass->addRange(0xfe33, 0xfe34); | - |
2974 | yyCharClass->addRange(0xfe4d, 0xfe4f); | - |
2975 | yyCharClass->addSingleton(0xff3f); | - |
2976 | yyCharClass->addSingleton(0xff65); | - |
2977 | return Tok_CharClass; | - |
2978 | #endif | - |
2979 | #ifndef QT_NO_REGEXP_ESCAPE | - |
2980 | case 'b': | - |
2981 | return Tok_Word; | - |
2982 | #endif | - |
2983 | #ifndef QT_NO_REGEXP_CCLASS | - |
2984 | case 'd': | - |
2985 | // see QChar::isDigit() | - |
2986 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); | - |
2987 | return Tok_CharClass; | - |
2988 | case 's': | - |
2989 | // see QChar::isSpace() | - |
2990 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | | - |
2991 | FLAG(QChar::Separator_Line) | | - |
2992 | FLAG(QChar::Separator_Paragraph)); | - |
2993 | yyCharClass->addRange(0x0009, 0x000d); | - |
2994 | yyCharClass->addSingleton(0x0085); | - |
2995 | return Tok_CharClass; | - |
2996 | case 'w': | - |
2997 | // see QChar::isLetterOrNumber() and QChar::isMark() | - |
2998 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | | - |
2999 | FLAG(QChar::Mark_SpacingCombining) | | - |
3000 | FLAG(QChar::Mark_Enclosing) | | - |
3001 | FLAG(QChar::Number_DecimalDigit) | | - |
3002 | FLAG(QChar::Number_Letter) | | - |
3003 | FLAG(QChar::Number_Other) | | - |
3004 | FLAG(QChar::Letter_Uppercase) | | - |
3005 | FLAG(QChar::Letter_Lowercase) | | - |
3006 | FLAG(QChar::Letter_Titlecase) | | - |
3007 | FLAG(QChar::Letter_Modifier) | | - |
3008 | FLAG(QChar::Letter_Other)); | - |
3009 | yyCharClass->addSingleton(0x005f); // '_' | - |
3010 | return Tok_CharClass; | - |
3011 | case 'I': | - |
3012 | if (xmlSchemaExtensions) { | - |
3013 | yyCharClass->setNegative(!yyCharClass->negative()); | - |
3014 | // fall through | - |
3015 | } else { | - |
3016 | break; | - |
3017 | } | - |
3018 | case 'i': | - |
3019 | if (xmlSchemaExtensions) { | - |
3020 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | | - |
3021 | FLAG(QChar::Mark_SpacingCombining) | | - |
3022 | FLAG(QChar::Mark_Enclosing) | | - |
3023 | FLAG(QChar::Number_DecimalDigit) | | - |
3024 | FLAG(QChar::Number_Letter) | | - |
3025 | FLAG(QChar::Number_Other) | | - |
3026 | FLAG(QChar::Letter_Uppercase) | | - |
3027 | FLAG(QChar::Letter_Lowercase) | | - |
3028 | FLAG(QChar::Letter_Titlecase) | | - |
3029 | FLAG(QChar::Letter_Modifier) | | - |
3030 | FLAG(QChar::Letter_Other)); | - |
3031 | yyCharClass->addSingleton(0x003a); // ':' | - |
3032 | yyCharClass->addSingleton(0x005f); // '_' | - |
3033 | yyCharClass->addRange(0x0041, 0x005a); // [A-Z] | - |
3034 | yyCharClass->addRange(0x0061, 0x007a); // [a-z] | - |
3035 | yyCharClass->addRange(0xc0, 0xd6); | - |
3036 | yyCharClass->addRange(0xd8, 0xf6); | - |
3037 | yyCharClass->addRange(0xf8, 0x2ff); | - |
3038 | yyCharClass->addRange(0x370, 0x37d); | - |
3039 | yyCharClass->addRange(0x37f, 0x1fff); | - |
3040 | yyCharClass->addRange(0x200c, 0x200d); | - |
3041 | yyCharClass->addRange(0x2070, 0x218f); | - |
3042 | yyCharClass->addRange(0x2c00, 0x2fef); | - |
3043 | yyCharClass->addRange(0x3001, 0xd7ff); | - |
3044 | yyCharClass->addRange(0xf900, 0xfdcf); | - |
3045 | yyCharClass->addRange(0xfdf0, 0xfffd); | - |
3046 | yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); | - |
3047 | return Tok_CharClass; | - |
3048 | } else { | - |
3049 | break; | - |
3050 | } | - |
3051 | case 'C': | - |
3052 | if (xmlSchemaExtensions) { | - |
3053 | yyCharClass->setNegative(!yyCharClass->negative()); | - |
3054 | // fall through | - |
3055 | } else { | - |
3056 | break; | - |
3057 | } | - |
3058 | case 'c': | - |
3059 | if (xmlSchemaExtensions) { | - |
3060 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | | - |
3061 | FLAG(QChar::Mark_SpacingCombining) | | - |
3062 | FLAG(QChar::Mark_Enclosing) | | - |
3063 | FLAG(QChar::Number_DecimalDigit) | | - |
3064 | FLAG(QChar::Number_Letter) | | - |
3065 | FLAG(QChar::Number_Other) | | - |
3066 | FLAG(QChar::Letter_Uppercase) | | - |
3067 | FLAG(QChar::Letter_Lowercase) | | - |
3068 | FLAG(QChar::Letter_Titlecase) | | - |
3069 | FLAG(QChar::Letter_Modifier) | | - |
3070 | FLAG(QChar::Letter_Other)); | - |
3071 | yyCharClass->addSingleton(0x002d); // '-' | - |
3072 | yyCharClass->addSingleton(0x002e); // '.' | - |
3073 | yyCharClass->addSingleton(0x003a); // ':' | - |
3074 | yyCharClass->addSingleton(0x005f); // '_' | - |
3075 | yyCharClass->addSingleton(0xb7); | - |
3076 | yyCharClass->addRange(0x0030, 0x0039); // [0-9] | - |
3077 | yyCharClass->addRange(0x0041, 0x005a); // [A-Z] | - |
3078 | yyCharClass->addRange(0x0061, 0x007a); // [a-z] | - |
3079 | yyCharClass->addRange(0xc0, 0xd6); | - |
3080 | yyCharClass->addRange(0xd8, 0xf6); | - |
3081 | yyCharClass->addRange(0xf8, 0x2ff); | - |
3082 | yyCharClass->addRange(0x370, 0x37d); | - |
3083 | yyCharClass->addRange(0x37f, 0x1fff); | - |
3084 | yyCharClass->addRange(0x200c, 0x200d); | - |
3085 | yyCharClass->addRange(0x2070, 0x218f); | - |
3086 | yyCharClass->addRange(0x2c00, 0x2fef); | - |
3087 | yyCharClass->addRange(0x3001, 0xd7ff); | - |
3088 | yyCharClass->addRange(0xf900, 0xfdcf); | - |
3089 | yyCharClass->addRange(0xfdf0, 0xfffd); | - |
3090 | yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff); | - |
3091 | yyCharClass->addRange(0x0300, 0x036f); | - |
3092 | yyCharClass->addRange(0x203f, 0x2040); | - |
3093 | return Tok_CharClass; | - |
3094 | } else { | - |
3095 | break; | - |
3096 | } | - |
3097 | case 'P': | - |
3098 | if (xmlSchemaExtensions) { | - |
3099 | yyCharClass->setNegative(!yyCharClass->negative()); | - |
3100 | // fall through | - |
3101 | } else { | - |
3102 | break; | - |
3103 | } | - |
3104 | case 'p': | - |
3105 | if (xmlSchemaExtensions) { | - |
3106 | if (yyCh != '{') { | - |
3107 | error(RXERR_CHARCLASS); | - |
3108 | return Tok_CharClass; | - |
3109 | } | - |
3110 | - | |
3111 | QByteArray category; | - |
3112 | yyCh = getChar(); | - |
3113 | while (yyCh != '}') { | - |
3114 | if (yyCh == EOS) { | - |
3115 | error(RXERR_END); | - |
3116 | return Tok_CharClass; | - |
3117 | } | - |
3118 | category.append(yyCh); | - |
3119 | yyCh = getChar(); | - |
3120 | } | - |
3121 | yyCh = getChar(); // skip closing '}' | - |
3122 | - | |
3123 | int catlen = category.length(); | - |
3124 | if (catlen == 1 || catlen == 2) { | - |
3125 | switch (category.at(0)) { | - |
3126 | case 'M': | - |
3127 | if (catlen == 1) { | - |
3128 | yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) | | - |
3129 | FLAG(QChar::Mark_SpacingCombining) | | - |
3130 | FLAG(QChar::Mark_Enclosing)); | - |
3131 | } else { | - |
3132 | switch (category.at(1)) { | - |
3133 | case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn | - |
3134 | case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc | - |
3135 | case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me | - |
3136 | default: error(RXERR_CATEGORY); break; | - |
3137 | } | - |
3138 | } | - |
3139 | break; | - |
3140 | case 'N': | - |
3141 | if (catlen == 1) { | - |
3142 | yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) | | - |
3143 | FLAG(QChar::Number_Letter) | | - |
3144 | FLAG(QChar::Number_Other)); | - |
3145 | } else { | - |
3146 | switch (category.at(1)) { | - |
3147 | case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd | - |
3148 | case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl | - |
3149 | case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No | - |
3150 | default: error(RXERR_CATEGORY); break; | - |
3151 | } | - |
3152 | } | - |
3153 | break; | - |
3154 | case 'Z': | - |
3155 | if (catlen == 1) { | - |
3156 | yyCharClass->addCategories(FLAG(QChar::Separator_Space) | | - |
3157 | FLAG(QChar::Separator_Line) | | - |
3158 | FLAG(QChar::Separator_Paragraph)); | - |
3159 | } else { | - |
3160 | switch (category.at(1)) { | - |
3161 | case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs | - |
3162 | case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl | - |
3163 | case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp | - |
3164 | default: error(RXERR_CATEGORY); break; | - |
3165 | } | - |
3166 | } | - |
3167 | break; | - |
3168 | case 'C': | - |
3169 | if (catlen == 1) { | - |
3170 | yyCharClass->addCategories(FLAG(QChar::Other_Control) | | - |
3171 | FLAG(QChar::Other_Format) | | - |
3172 | FLAG(QChar::Other_Surrogate) | | - |
3173 | FLAG(QChar::Other_PrivateUse) | | - |
3174 | FLAG(QChar::Other_NotAssigned)); | - |
3175 | } else { | - |
3176 | switch (category.at(1)) { | - |
3177 | case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc | - |
3178 | case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf | - |
3179 | case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs | - |
3180 | case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co | - |
3181 | case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn | - |
3182 | default: error(RXERR_CATEGORY); break; | - |
3183 | } | - |
3184 | } | - |
3185 | break; | - |
3186 | case 'L': | - |
3187 | if (catlen == 1) { | - |
3188 | yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) | | - |
3189 | FLAG(QChar::Letter_Lowercase) | | - |
3190 | FLAG(QChar::Letter_Titlecase) | | - |
3191 | FLAG(QChar::Letter_Modifier) | | - |
3192 | FLAG(QChar::Letter_Other)); | - |
3193 | } else { | - |
3194 | switch (category.at(1)) { | - |
3195 | case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu | - |
3196 | case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll | - |
3197 | case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt | - |
3198 | case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm | - |
3199 | case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo | - |
3200 | default: error(RXERR_CATEGORY); break; | - |
3201 | } | - |
3202 | } | - |
3203 | break; | - |
3204 | case 'P': | - |
3205 | if (catlen == 1) { | - |
3206 | yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) | | - |
3207 | FLAG(QChar::Punctuation_Dash) | | - |
3208 | FLAG(QChar::Punctuation_Open) | | - |
3209 | FLAG(QChar::Punctuation_Close) | | - |
3210 | FLAG(QChar::Punctuation_InitialQuote) | | - |
3211 | FLAG(QChar::Punctuation_FinalQuote) | | - |
3212 | FLAG(QChar::Punctuation_Other)); | - |
3213 | } else { | - |
3214 | switch (category.at(1)) { | - |
3215 | case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc | - |
3216 | case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd | - |
3217 | case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps | - |
3218 | case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe | - |
3219 | case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi | - |
3220 | case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf | - |
3221 | case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po | - |
3222 | default: error(RXERR_CATEGORY); break; | - |
3223 | } | - |
3224 | } | - |
3225 | break; | - |
3226 | case 'S': | - |
3227 | if (catlen == 1) { | - |
3228 | yyCharClass->addCategories(FLAG(QChar::Symbol_Math) | | - |
3229 | FLAG(QChar::Symbol_Currency) | | - |
3230 | FLAG(QChar::Symbol_Modifier) | | - |
3231 | FLAG(QChar::Symbol_Other)); | - |
3232 | } else { | - |
3233 | switch (category.at(1)) { | - |
3234 | case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm | - |
3235 | case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc | - |
3236 | case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk | - |
3237 | case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So | - |
3238 | default: error(RXERR_CATEGORY); break; | - |
3239 | } | - |
3240 | } | - |
3241 | break; | - |
3242 | default: | - |
3243 | error(RXERR_CATEGORY); | - |
3244 | break; | - |
3245 | } | - |
3246 | } else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') { | - |
3247 | static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]); | - |
3248 | const char * const categoryFamily = category.constData() + 2; | - |
3249 | const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily); | - |
3250 | if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0) | - |
3251 | yyCharClass->addRange(r->first, r->second); | - |
3252 | else | - |
3253 | error(RXERR_CATEGORY); | - |
3254 | } else { | - |
3255 | error(RXERR_CATEGORY); | - |
3256 | } | - |
3257 | return Tok_CharClass; | - |
3258 | } else { | - |
3259 | break; | - |
3260 | } | - |
3261 | #endif | - |
3262 | #ifndef QT_NO_REGEXP_ESCAPE | - |
3263 | case 'x': | - |
3264 | val = 0; | - |
3265 | for (i = 0; i < 4; i++) { | - |
3266 | low = QChar(yyCh).toLower().unicode(); | - |
3267 | if (low >= '0' && low <= '9') | - |
3268 | val = (val << 4) | (low - '0'); | - |
3269 | else if (low >= 'a' && low <= 'f') | - |
3270 | val = (val << 4) | (low - 'a' + 10); | - |
3271 | else | - |
3272 | break; | - |
3273 | yyCh = getChar(); | - |
3274 | } | - |
3275 | return Tok_Char | val; | - |
3276 | #endif | - |
3277 | default: | - |
3278 | break; | - |
3279 | } | - |
3280 | if (prevCh >= '1' && prevCh <= '9') { | - |
3281 | #ifndef QT_NO_REGEXP_BACKREF | - |
3282 | val = prevCh - '0'; | - |
3283 | while (yyCh >= '0' && yyCh <= '9') { | - |
3284 | val = (val * 10) + (yyCh - '0'); | - |
3285 | yyCh = getChar(); | - |
3286 | } | - |
3287 | return Tok_BackRef | val; | - |
3288 | #else | - |
3289 | error(RXERR_DISABLED); | - |
3290 | #endif | - |
3291 | } | - |
3292 | return Tok_Char | prevCh; | - |
3293 | } | - |
3294 | - | |
3295 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3296 | int QRegExpEngine::getRep(int def) | - |
3297 | { | - |
3298 | if (yyCh >= '0' && yyCh <= '9') { | - |
3299 | int rep = 0; | - |
3300 | do { | - |
3301 | rep = 10 * rep + yyCh - '0'; | - |
3302 | if (rep >= InftyRep) { | - |
3303 | error(RXERR_REPETITION); | - |
3304 | rep = def; | - |
3305 | } | - |
3306 | yyCh = getChar(); | - |
3307 | } while (yyCh >= '0' && yyCh <= '9'); | - |
3308 | return rep; | - |
3309 | } else { | - |
3310 | return def; | - |
3311 | } | - |
3312 | } | - |
3313 | #endif | - |
3314 | - | |
3315 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
3316 | void QRegExpEngine::skipChars(int n) | - |
3317 | { | - |
3318 | if (n > 0) { | - |
3319 | yyPos += n - 1; | - |
3320 | yyCh = getChar(); | - |
3321 | } | - |
3322 | } | - |
3323 | #endif | - |
3324 | - | |
3325 | void QRegExpEngine::error(const char *msg) | - |
3326 | { | - |
3327 | if (yyError.isEmpty()) | - |
3328 | yyError = QLatin1String(msg); | - |
3329 | } | - |
3330 | - | |
3331 | void QRegExpEngine::startTokenizer(const QChar *rx, int len) | - |
3332 | { | - |
3333 | yyIn = rx; | - |
3334 | yyPos0 = 0; | - |
3335 | yyPos = 0; | - |
3336 | yyLen = len; | - |
3337 | yyCh = getChar(); | - |
3338 | yyCharClass.reset(new QRegExpCharClass); | - |
3339 | yyMinRep = 0; | - |
3340 | yyMaxRep = 0; | - |
3341 | yyError = QString(); | - |
3342 | } | - |
3343 | - | |
3344 | int QRegExpEngine::getToken() | - |
3345 | { | - |
3346 | #ifndef QT_NO_REGEXP_CCLASS | - |
3347 | ushort pendingCh = 0; | - |
3348 | bool charPending; | - |
3349 | bool rangePending; | - |
3350 | int tok; | - |
3351 | #endif | - |
3352 | int prevCh = yyCh; | - |
3353 | - | |
3354 | yyPos0 = yyPos - 1; | - |
3355 | #ifndef QT_NO_REGEXP_CCLASS | - |
3356 | yyCharClass->clear(); | - |
3357 | #endif | - |
3358 | yyMinRep = 0; | - |
3359 | yyMaxRep = 0; | - |
3360 | yyCh = getChar(); | - |
3361 | - | |
3362 | switch (prevCh) { | - |
3363 | case EOS: | - |
3364 | yyPos0 = yyPos; | - |
3365 | return Tok_Eos; | - |
3366 | case '$': | - |
3367 | return Tok_Dollar; | - |
3368 | case '(': | - |
3369 | if (yyCh == '?') { | - |
3370 | prevCh = getChar(); | - |
3371 | yyCh = getChar(); | - |
3372 | switch (prevCh) { | - |
3373 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
3374 | case '!': | - |
3375 | return Tok_NegLookahead; | - |
3376 | case '=': | - |
3377 | return Tok_PosLookahead; | - |
3378 | #endif | - |
3379 | case ':': | - |
3380 | return Tok_MagicLeftParen; | - |
3381 | case '<': | - |
3382 | error(RXERR_LOOKBEHIND); | - |
3383 | return Tok_MagicLeftParen; | - |
3384 | default: | - |
3385 | error(RXERR_LOOKAHEAD); | - |
3386 | return Tok_MagicLeftParen; | - |
3387 | } | - |
3388 | } else { | - |
3389 | return Tok_LeftParen; | - |
3390 | } | - |
3391 | case ')': | - |
3392 | return Tok_RightParen; | - |
3393 | case '*': | - |
3394 | yyMinRep = 0; | - |
3395 | yyMaxRep = InftyRep; | - |
3396 | return Tok_Quantifier; | - |
3397 | case '+': | - |
3398 | yyMinRep = 1; | - |
3399 | yyMaxRep = InftyRep; | - |
3400 | return Tok_Quantifier; | - |
3401 | case '.': | - |
3402 | #ifndef QT_NO_REGEXP_CCLASS | - |
3403 | yyCharClass->setNegative(true); | - |
3404 | #endif | - |
3405 | return Tok_CharClass; | - |
3406 | case '?': | - |
3407 | yyMinRep = 0; | - |
3408 | yyMaxRep = 1; | - |
3409 | return Tok_Quantifier; | - |
3410 | case '[': | - |
3411 | #ifndef QT_NO_REGEXP_CCLASS | - |
3412 | if (yyCh == '^') { | - |
3413 | yyCharClass->setNegative(true); | - |
3414 | yyCh = getChar(); | - |
3415 | } | - |
3416 | charPending = false; | - |
3417 | rangePending = false; | - |
3418 | do { | - |
3419 | if (yyCh == '-' && charPending && !rangePending) { | - |
3420 | rangePending = true; | - |
3421 | yyCh = getChar(); | - |
3422 | } else { | - |
3423 | if (charPending && !rangePending) { | - |
3424 | yyCharClass->addSingleton(pendingCh); | - |
3425 | charPending = false; | - |
3426 | } | - |
3427 | if (yyCh == '\\') { | - |
3428 | yyCh = getChar(); | - |
3429 | tok = getEscape(); | - |
3430 | if (tok == Tok_Word) | - |
3431 | tok = '\b'; | - |
3432 | } else { | - |
3433 | tok = Tok_Char | yyCh; | - |
3434 | yyCh = getChar(); | - |
3435 | } | - |
3436 | if (tok == Tok_CharClass) { | - |
3437 | if (rangePending) { | - |
3438 | yyCharClass->addSingleton('-'); | - |
3439 | yyCharClass->addSingleton(pendingCh); | - |
3440 | charPending = false; | - |
3441 | rangePending = false; | - |
3442 | } | - |
3443 | } else if ((tok & Tok_Char) != 0) { | - |
3444 | if (rangePending) { | - |
3445 | yyCharClass->addRange(pendingCh, tok ^ Tok_Char); | - |
3446 | charPending = false; | - |
3447 | rangePending = false; | - |
3448 | } else { | - |
3449 | pendingCh = tok ^ Tok_Char; | - |
3450 | charPending = true; | - |
3451 | } | - |
3452 | } else { | - |
3453 | error(RXERR_CHARCLASS); | - |
3454 | } | - |
3455 | } | - |
3456 | } while (yyCh != ']' && yyCh != EOS); | - |
3457 | if (rangePending) | - |
3458 | yyCharClass->addSingleton('-'); | - |
3459 | if (charPending) | - |
3460 | yyCharClass->addSingleton(pendingCh); | - |
3461 | if (yyCh == EOS) | - |
3462 | error(RXERR_END); | - |
3463 | else | - |
3464 | yyCh = getChar(); | - |
3465 | return Tok_CharClass; | - |
3466 | #else | - |
3467 | error(RXERR_END); | - |
3468 | return Tok_Char | '['; | - |
3469 | #endif | - |
3470 | case '\\': | - |
3471 | return getEscape(); | - |
3472 | case ']': | - |
3473 | error(RXERR_LEFTDELIM); | - |
3474 | return Tok_Char | ']'; | - |
3475 | case '^': | - |
3476 | return Tok_Caret; | - |
3477 | case '{': | - |
3478 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3479 | yyMinRep = getRep(0); | - |
3480 | yyMaxRep = yyMinRep; | - |
3481 | if (yyCh == ',') { | - |
3482 | yyCh = getChar(); | - |
3483 | yyMaxRep = getRep(InftyRep); | - |
3484 | } | - |
3485 | if (yyMaxRep < yyMinRep) | - |
3486 | error(RXERR_INTERVAL); | - |
3487 | if (yyCh != '}') | - |
3488 | error(RXERR_REPETITION); | - |
3489 | yyCh = getChar(); | - |
3490 | return Tok_Quantifier; | - |
3491 | #else | - |
3492 | error(RXERR_DISABLED); | - |
3493 | return Tok_Char | '{'; | - |
3494 | #endif | - |
3495 | case '|': | - |
3496 | return Tok_Bar; | - |
3497 | case '}': | - |
3498 | error(RXERR_LEFTDELIM); | - |
3499 | return Tok_Char | '}'; | - |
3500 | default: | - |
3501 | return Tok_Char | prevCh; | - |
3502 | } | - |
3503 | } | - |
3504 | - | |
3505 | int QRegExpEngine::parse(const QChar *pattern, int len) | - |
3506 | { | - |
3507 | valid = true; | - |
3508 | startTokenizer(pattern, len); | - |
3509 | yyTok = getToken(); | - |
3510 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3511 | yyMayCapture = true; | - |
3512 | #else | - |
3513 | yyMayCapture = false; | - |
3514 | #endif | - |
3515 | - | |
3516 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3517 | int atom = startAtom(false); | - |
3518 | #endif | - |
3519 | QRegExpCharClass anything; | - |
3520 | Box box(this); // create InitialState | - |
3521 | box.set(anything); | - |
3522 | Box rightBox(this); // create FinalState | - |
3523 | rightBox.set(anything); | - |
3524 | - | |
3525 | Box middleBox(this); | - |
3526 | parseExpression(&middleBox); | - |
3527 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3528 | finishAtom(atom, false); | - |
3529 | #endif | - |
3530 | #ifndef QT_NO_REGEXP_OPTIM | - |
3531 | middleBox.setupHeuristics(); | - |
3532 | #endif | - |
3533 | box.cat(middleBox); | - |
3534 | box.cat(rightBox); | - |
3535 | yyCharClass.reset(0); | - |
3536 | - | |
3537 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3538 | for (int i = 0; i < nf; ++i) { | - |
3539 | switch (f[i].capture) { | - |
3540 | case QRegExpAtom::NoCapture: | - |
3541 | break; | - |
3542 | case QRegExpAtom::OfficialCapture: | - |
3543 | f[i].capture = ncap; | - |
3544 | captureForOfficialCapture.append(ncap); | - |
3545 | ++ncap; | - |
3546 | ++officialncap; | - |
3547 | break; | - |
3548 | case QRegExpAtom::UnofficialCapture: | - |
3549 | f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture; | - |
3550 | } | - |
3551 | } | - |
3552 | - | |
3553 | #ifndef QT_NO_REGEXP_BACKREF | - |
3554 | #ifndef QT_NO_REGEXP_OPTIM | - |
3555 | if (officialncap == 0 && nbrefs == 0) { | - |
3556 | ncap = nf = 0; | - |
3557 | f.clear(); | - |
3558 | } | - |
3559 | #endif | - |
3560 | // handle the case where there's a \5 with no corresponding capture | - |
3561 | // (captureForOfficialCapture.size() != officialncap) | - |
3562 | for (int i = 0; i < nbrefs - officialncap; ++i) { | - |
3563 | captureForOfficialCapture.append(ncap); | - |
3564 | ++ncap; | - |
3565 | } | - |
3566 | #endif | - |
3567 | #endif | - |
3568 | - | |
3569 | if (!yyError.isEmpty()) | - |
3570 | return -1; | - |
3571 | - | |
3572 | #ifndef QT_NO_REGEXP_OPTIM | - |
3573 | const QRegExpAutomatonState &sinit = s.at(InitialState); | - |
3574 | caretAnchored = !sinit.anchors.isEmpty(); | - |
3575 | if (caretAnchored) { | - |
3576 | const QMap<int, int> &anchors = sinit.anchors; | - |
3577 | QMap<int, int>::const_iterator a; | - |
3578 | for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) { | - |
3579 | if ( | - |
3580 | #ifndef QT_NO_REGEXP_ANCHOR_ALT | - |
3581 | (*a & Anchor_Alternation) != 0 || | - |
3582 | #endif | - |
3583 | (*a & Anchor_Caret) == 0) | - |
3584 | { | - |
3585 | caretAnchored = false; | - |
3586 | break; | - |
3587 | } | - |
3588 | } | - |
3589 | } | - |
3590 | #endif | - |
3591 | - | |
3592 | // cleanup anchors | - |
3593 | int numStates = s.count(); | - |
3594 | for (int i = 0; i < numStates; ++i) { | - |
3595 | QRegExpAutomatonState &state = s[i]; | - |
3596 | if (!state.anchors.isEmpty()) { | - |
3597 | QMap<int, int>::iterator a = state.anchors.begin(); | - |
3598 | while (a != state.anchors.end()) { | - |
3599 | if (a.value() == 0) | - |
3600 | a = state.anchors.erase(a); | - |
3601 | else | - |
3602 | ++a; | - |
3603 | } | - |
3604 | } | - |
3605 | } | - |
3606 | - | |
3607 | return yyPos0; | - |
3608 | } | - |
3609 | - | |
3610 | void QRegExpEngine::parseAtom(Box *box) | - |
3611 | { | - |
3612 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
3613 | QRegExpEngine *eng = 0; | - |
3614 | bool neg; | - |
3615 | int len; | - |
3616 | #endif | - |
3617 | - | |
3618 | if ((yyTok & Tok_Char) != 0) { | - |
3619 | box->set(QChar(yyTok ^ Tok_Char)); | - |
3620 | } else { | - |
3621 | #ifndef QT_NO_REGEXP_OPTIM | - |
3622 | trivial = false; | - |
3623 | #endif | - |
3624 | switch (yyTok) { | - |
3625 | case Tok_Dollar: | - |
3626 | box->catAnchor(Anchor_Dollar); | - |
3627 | break; | - |
3628 | case Tok_Caret: | - |
3629 | box->catAnchor(Anchor_Caret); | - |
3630 | break; | - |
3631 | #ifndef QT_NO_REGEXP_LOOKAHEAD | - |
3632 | case Tok_PosLookahead: | - |
3633 | case Tok_NegLookahead: | - |
3634 | neg = (yyTok == Tok_NegLookahead); | - |
3635 | eng = new QRegExpEngine(cs, greedyQuantifiers); | - |
3636 | len = eng->parse(yyIn + yyPos - 1, yyLen - yyPos + 1); | - |
3637 | if (len >= 0) | - |
3638 | skipChars(len); | - |
3639 | else | - |
3640 | error(RXERR_LOOKAHEAD); | - |
3641 | box->catAnchor(addLookahead(eng, neg)); | - |
3642 | yyTok = getToken(); | - |
3643 | if (yyTok != Tok_RightParen) | - |
3644 | error(RXERR_LOOKAHEAD); | - |
3645 | break; | - |
3646 | #endif | - |
3647 | #ifndef QT_NO_REGEXP_ESCAPE | - |
3648 | case Tok_Word: | - |
3649 | box->catAnchor(Anchor_Word); | - |
3650 | break; | - |
3651 | case Tok_NonWord: | - |
3652 | box->catAnchor(Anchor_NonWord); | - |
3653 | break; | - |
3654 | #endif | - |
3655 | case Tok_LeftParen: | - |
3656 | case Tok_MagicLeftParen: | - |
3657 | yyTok = getToken(); | - |
3658 | parseExpression(box); | - |
3659 | if (yyTok != Tok_RightParen) | - |
3660 | error(RXERR_END); | - |
3661 | break; | - |
3662 | case Tok_CharClass: | - |
3663 | box->set(*yyCharClass); | - |
3664 | break; | - |
3665 | case Tok_Quantifier: | - |
3666 | error(RXERR_REPETITION); | - |
3667 | break; | - |
3668 | default: | - |
3669 | #ifndef QT_NO_REGEXP_BACKREF | - |
3670 | if ((yyTok & Tok_BackRef) != 0) | - |
3671 | box->set(yyTok ^ Tok_BackRef); | - |
3672 | else | - |
3673 | #endif | - |
3674 | error(RXERR_DISABLED); | - |
3675 | } | - |
3676 | } | - |
3677 | yyTok = getToken(); | - |
3678 | } | - |
3679 | - | |
3680 | void QRegExpEngine::parseFactor(Box *box) | - |
3681 | { | - |
3682 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3683 | int outerAtom = greedyQuantifiers ? startAtom(false) : -1; | - |
3684 | int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen); | - |
3685 | bool magicLeftParen = (yyTok == Tok_MagicLeftParen); | - |
3686 | #else | - |
3687 | const int innerAtom = -1; | - |
3688 | #endif | - |
3689 | - | |
3690 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3691 | #define YYREDO() \ | - |
3692 | yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \ | - |
3693 | *yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok | - |
3694 | - | |
3695 | const QChar *in = yyIn; | - |
3696 | int pos0 = yyPos0; | - |
3697 | int pos = yyPos; | - |
3698 | int len = yyLen; | - |
3699 | int ch = yyCh; | - |
3700 | QRegExpCharClass charClass; | - |
3701 | if (yyTok == Tok_CharClass) | - |
3702 | charClass = *yyCharClass; | - |
3703 | int tok = yyTok; | - |
3704 | bool mayCapture = yyMayCapture; | - |
3705 | #endif | - |
3706 | - | |
3707 | parseAtom(box); | - |
3708 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3709 | finishAtom(innerAtom, magicLeftParen); | - |
3710 | #endif | - |
3711 | - | |
3712 | bool hasQuantifier = (yyTok == Tok_Quantifier); | - |
3713 | if (hasQuantifier) { | - |
3714 | #ifndef QT_NO_REGEXP_OPTIM | - |
3715 | trivial = false; | - |
3716 | #endif | - |
3717 | if (yyMaxRep == InftyRep) { | - |
3718 | box->plus(innerAtom); | - |
3719 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3720 | } else if (yyMaxRep == 0) { | - |
3721 | box->clear(); | - |
3722 | #endif | - |
3723 | } | - |
3724 | if (yyMinRep == 0) | - |
3725 | box->opt(); | - |
3726 | - | |
3727 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3728 | yyMayCapture = false; | - |
3729 | int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1; | - |
3730 | int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1); | - |
3731 | - | |
3732 | Box rightBox(this); | - |
3733 | int i; | - |
3734 | - | |
3735 | for (i = 0; i < beta; i++) { | - |
3736 | YYREDO(); | - |
3737 | Box leftBox(this); | - |
3738 | parseAtom(&leftBox); | - |
3739 | leftBox.cat(rightBox); | - |
3740 | leftBox.opt(); | - |
3741 | rightBox = leftBox; | - |
3742 | } | - |
3743 | for (i = 0; i < alpha; i++) { | - |
3744 | YYREDO(); | - |
3745 | Box leftBox(this); | - |
3746 | parseAtom(&leftBox); | - |
3747 | leftBox.cat(rightBox); | - |
3748 | rightBox = leftBox; | - |
3749 | } | - |
3750 | rightBox.cat(*box); | - |
3751 | *box = rightBox; | - |
3752 | #endif | - |
3753 | yyTok = getToken(); | - |
3754 | #ifndef QT_NO_REGEXP_INTERVAL | - |
3755 | yyMayCapture = mayCapture; | - |
3756 | #endif | - |
3757 | } | - |
3758 | #undef YYREDO | - |
3759 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3760 | if (greedyQuantifiers) | - |
3761 | finishAtom(outerAtom, hasQuantifier); | - |
3762 | #endif | - |
3763 | } | - |
3764 | - | |
3765 | void QRegExpEngine::parseTerm(Box *box) | - |
3766 | { | - |
3767 | #ifndef QT_NO_REGEXP_OPTIM | - |
3768 | if (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) | - |
3769 | parseFactor(box); | - |
3770 | #endif | - |
3771 | while (yyTok != Tok_Eos && yyTok != Tok_RightParen && yyTok != Tok_Bar) { | - |
3772 | Box rightBox(this); | - |
3773 | parseFactor(&rightBox); | - |
3774 | box->cat(rightBox); | - |
3775 | } | - |
3776 | } | - |
3777 | - | |
3778 | void QRegExpEngine::parseExpression(Box *box) | - |
3779 | { | - |
3780 | parseTerm(box); | - |
3781 | while (yyTok == Tok_Bar) { | - |
3782 | #ifndef QT_NO_REGEXP_OPTIM | - |
3783 | trivial = false; | - |
3784 | #endif | - |
3785 | Box rightBox(this); | - |
3786 | yyTok = getToken(); | - |
3787 | parseTerm(&rightBox); | - |
3788 | box->orx(rightBox); | - |
3789 | } | - |
3790 | } | - |
3791 | - | |
3792 | /* | - |
3793 | The struct QRegExpPrivate contains the private data of a regular | - |
3794 | expression other than the automaton. It makes it possible for many | - |
3795 | QRegExp objects to use the same QRegExpEngine object with different | - |
3796 | QRegExpPrivate objects. | - |
3797 | */ | - |
3798 | struct QRegExpPrivate | - |
3799 | { | - |
3800 | QRegExpEngine *eng; | - |
3801 | QRegExpEngineKey engineKey; | - |
3802 | bool minimal; | - |
3803 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3804 | QString t; // last string passed to QRegExp::indexIn() or lastIndexIn() | - |
3805 | QStringList capturedCache; // what QRegExp::capturedTexts() returned last | - |
3806 | #endif | - |
3807 | QRegExpMatchState matchState; | - |
3808 | - | |
3809 | inline QRegExpPrivate() | - |
3810 | : eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { } | - |
3811 | inline QRegExpPrivate(const QRegExpEngineKey &key) | - |
3812 | : eng(0), engineKey(key), minimal(false) {} | - |
3813 | }; | - |
3814 | - | |
3815 | #if !defined(QT_NO_REGEXP_OPTIM) | - |
3816 | typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache; | - |
3817 | Q_GLOBAL_STATIC(EngineCache, globalEngineCache) | - |
3818 | static QBasicMutex globalEngineCacheMutex; | - |
3819 | #endif // QT_NO_REGEXP_OPTIM | - |
3820 | - | |
3821 | static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key) | - |
3822 | { | - |
3823 | if (!eng->ref.deref()) { | - |
3824 | #if !defined(QT_NO_REGEXP_OPTIM) | - |
3825 | if (globalEngineCache()) { | - |
3826 | QMutexLocker locker(&globalEngineCacheMutex); | - |
3827 | QT_TRY { | - |
3828 | globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4); | - |
3829 | } QT_CATCH(const std::bad_alloc &) { | - |
3830 | // in case of an exception (e.g. oom), just delete the engine | - |
3831 | delete eng; | - |
3832 | } | - |
3833 | } else { | - |
3834 | delete eng; | - |
3835 | } | - |
3836 | #else | - |
3837 | Q_UNUSED(key); | - |
3838 | delete eng; | - |
3839 | #endif | - |
3840 | } | - |
3841 | } | - |
3842 | - | |
3843 | static void prepareEngine_helper(QRegExpPrivate *priv) | - |
3844 | { | - |
3845 | bool initMatchState = !priv->eng; | - |
3846 | #if !defined(QT_NO_REGEXP_OPTIM) | - |
3847 | if (!priv->eng && globalEngineCache()) { | - |
3848 | QMutexLocker locker(&globalEngineCacheMutex); | - |
3849 | priv->eng = globalEngineCache()->take(priv->engineKey); | - |
3850 | if (priv->eng != 0) | - |
3851 | priv->eng->ref.ref(); | - |
3852 | } | - |
3853 | #endif // QT_NO_REGEXP_OPTIM | - |
3854 | - | |
3855 | if (!priv->eng) | - |
3856 | priv->eng = new QRegExpEngine(priv->engineKey); | - |
3857 | - | |
3858 | if (initMatchState) | - |
3859 | priv->matchState.prepareForMatch(priv->eng); | - |
3860 | } | - |
3861 | - | |
3862 | inline static void prepareEngine(QRegExpPrivate *priv) | - |
3863 | { | - |
3864 | if (priv->eng) | - |
3865 | return; | - |
3866 | prepareEngine_helper(priv); | - |
3867 | } | - |
3868 | - | |
3869 | static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str) | - |
3870 | { | - |
3871 | prepareEngine(priv); | - |
3872 | priv->matchState.prepareForMatch(priv->eng); | - |
3873 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3874 | priv->t = str; | - |
3875 | priv->capturedCache.clear(); | - |
3876 | #else | - |
3877 | Q_UNUSED(str); | - |
3878 | #endif | - |
3879 | } | - |
3880 | - | |
3881 | static void invalidateEngine(QRegExpPrivate *priv) | - |
3882 | { | - |
3883 | if (priv->eng != 0) { | - |
3884 | derefEngine(priv->eng, priv->engineKey); | - |
3885 | priv->eng = 0; | - |
3886 | priv->matchState.drain(); | - |
3887 | } | - |
3888 | } | - |
3889 | - | |
3890 | /*! | - |
3891 | \enum QRegExp::CaretMode | - |
3892 | - | |
3893 | The CaretMode enum defines the different meanings of the caret | - |
3894 | (\b{^}) in a regular expression. The possible values are: | - |
3895 | - | |
3896 | \value CaretAtZero | - |
3897 | The caret corresponds to index 0 in the searched string. | - |
3898 | - | |
3899 | \value CaretAtOffset | - |
3900 | The caret corresponds to the start offset of the search. | - |
3901 | - | |
3902 | \value CaretWontMatch | - |
3903 | The caret never matches. | - |
3904 | */ | - |
3905 | - | |
3906 | /*! | - |
3907 | \enum QRegExp::PatternSyntax | - |
3908 | - | |
3909 | The syntax used to interpret the meaning of the pattern. | - |
3910 | - | |
3911 | \value RegExp A rich Perl-like pattern matching syntax. This is | - |
3912 | the default. | - |
3913 | - | |
3914 | \value RegExp2 Like RegExp, but with \l{greedy quantifiers}. | - |
3915 | (Introduced in Qt 4.2.) | - |
3916 | - | |
3917 | \value Wildcard This provides a simple pattern matching syntax | - |
3918 | similar to that used by shells (command interpreters) for "file | - |
3919 | globbing". See \l{QRegExp wildcard matching}. | - |
3920 | - | |
3921 | \value WildcardUnix This is similar to Wildcard but with the | - |
3922 | behavior of a Unix shell. The wildcard characters can be escaped | - |
3923 | with the character "\\". | - |
3924 | - | |
3925 | \value FixedString The pattern is a fixed string. This is | - |
3926 | equivalent to using the RegExp pattern on a string in | - |
3927 | which all metacharacters are escaped using escape(). | - |
3928 | - | |
3929 | \value W3CXmlSchema11 The pattern is a regular expression as | - |
3930 | defined by the W3C XML Schema 1.1 specification. | - |
3931 | - | |
3932 | \sa setPatternSyntax() | - |
3933 | */ | - |
3934 | - | |
3935 | /*! | - |
3936 | Constructs an empty regexp. | - |
3937 | - | |
3938 | \sa isValid(), errorString() | - |
3939 | */ | - |
3940 | QRegExp::QRegExp() | - |
3941 | { | - |
3942 | priv = new QRegExpPrivate; | - |
3943 | prepareEngine(priv); | - |
3944 | } | - |
3945 | - | |
3946 | /*! | - |
3947 | Constructs a regular expression object for the given \a pattern | - |
3948 | string. The pattern must be given using wildcard notation if \a | - |
3949 | syntax is \l Wildcard; the default is \l RegExp. The pattern is | - |
3950 | case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is | - |
3951 | greedy (maximal), but can be changed by calling | - |
3952 | setMinimal(). | - |
3953 | - | |
3954 | \sa setPattern(), setCaseSensitivity(), setPatternSyntax() | - |
3955 | */ | - |
3956 | QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax) | - |
3957 | { | - |
3958 | priv = new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)); | - |
3959 | prepareEngine(priv); | - |
3960 | } | - |
3961 | - | |
3962 | /*! | - |
3963 | Constructs a regular expression as a copy of \a rx. | - |
3964 | - | |
3965 | \sa operator=() | - |
3966 | */ | - |
3967 | QRegExp::QRegExp(const QRegExp &rx) | - |
3968 | { | - |
3969 | priv = new QRegExpPrivate; | - |
3970 | operator=(rx); | - |
3971 | } | - |
3972 | - | |
3973 | /*! | - |
3974 | Destroys the regular expression and cleans up its internal data. | - |
3975 | */ | - |
3976 | QRegExp::~QRegExp() | - |
3977 | { | - |
3978 | invalidateEngine(priv); | - |
3979 | delete priv; | - |
3980 | } | - |
3981 | - | |
3982 | /*! | - |
3983 | Copies the regular expression \a rx and returns a reference to the | - |
3984 | copy. The case sensitivity, wildcard, and minimal matching options | - |
3985 | are also copied. | - |
3986 | */ | - |
3987 | QRegExp &QRegExp::operator=(const QRegExp &rx) | - |
3988 | { | - |
3989 | prepareEngine(rx.priv); // to allow sharing | - |
3990 | QRegExpEngine *otherEng = rx.priv->eng; | - |
3991 | if (otherEng) | - |
3992 | otherEng->ref.ref(); | - |
3993 | invalidateEngine(priv); | - |
3994 | priv->eng = otherEng; | - |
3995 | priv->engineKey = rx.priv->engineKey; | - |
3996 | priv->minimal = rx.priv->minimal; | - |
3997 | #ifndef QT_NO_REGEXP_CAPTURE | - |
3998 | priv->t = rx.priv->t; | - |
3999 | priv->capturedCache = rx.priv->capturedCache; | - |
4000 | #endif | - |
4001 | if (priv->eng) | - |
4002 | priv->matchState.prepareForMatch(priv->eng); | - |
4003 | priv->matchState.captured = rx.priv->matchState.captured; | - |
4004 | return *this; | - |
4005 | } | - |
4006 | - | |
4007 | /*! | - |
4008 | \fn QRegExp &QRegExp::operator=(QRegExp &&other) | - |
4009 | - | |
4010 | Move-assigns \a other to this QRegExp instance. | - |
4011 | - | |
4012 | \since 5.2 | - |
4013 | */ | - |
4014 | - | |
4015 | /*! | - |
4016 | \fn void QRegExp::swap(QRegExp &other) | - |
4017 | \since 4.8 | - |
4018 | - | |
4019 | Swaps regular expression \a other with this regular | - |
4020 | expression. This operation is very fast and never fails. | - |
4021 | */ | - |
4022 | - | |
4023 | /*! | - |
4024 | Returns \c true if this regular expression is equal to \a rx; | - |
4025 | otherwise returns \c false. | - |
4026 | - | |
4027 | Two QRegExp objects are equal if they have the same pattern | - |
4028 | strings and the same settings for case sensitivity, wildcard and | - |
4029 | minimal matching. | - |
4030 | */ | - |
4031 | bool QRegExp::operator==(const QRegExp &rx) const | - |
4032 | { | - |
4033 | return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal; | - |
4034 | } | - |
4035 | - | |
4036 | /*! | - |
4037 | \since 5.6 | - |
4038 | \relates QRegExp | - |
4039 | - | |
4040 | Returns the hash value for \a key, using | - |
4041 | \a seed to seed the calculation. | - |
4042 | */ | - |
4043 | uint qHash(const QRegExp &key, uint seed) Q_DECL_NOTHROW | - |
4044 | { | - |
4045 | QtPrivate::QHashCombine hash; | - |
4046 | seed = hash(seed, key.priv->engineKey); | - |
4047 | seed = hash(seed, key.priv->minimal); | - |
4048 | return seed; executed 2048 times by 1 test: return seed; Executed by:
| 2048 |
4049 | } | - |
4050 | - | |
4051 | /*! | - |
4052 | \fn bool QRegExp::operator!=(const QRegExp &rx) const | - |
4053 | - | |
4054 | Returns \c true if this regular expression is not equal to \a rx; | - |
4055 | otherwise returns \c false. | - |
4056 | - | |
4057 | \sa operator==() | - |
4058 | */ | - |
4059 | - | |
4060 | /*! | - |
4061 | Returns \c true if the pattern string is empty; otherwise returns | - |
4062 | false. | - |
4063 | - | |
4064 | If you call exactMatch() with an empty pattern on an empty string | - |
4065 | it will return true; otherwise it returns \c false since it operates | - |
4066 | over the whole string. If you call indexIn() with an empty pattern | - |
4067 | on \e any string it will return the start offset (0 by default) | - |
4068 | because the empty pattern matches the 'emptiness' at the start of | - |
4069 | the string. In this case the length of the match returned by | - |
4070 | matchedLength() will be 0. | - |
4071 | - | |
4072 | See QString::isEmpty(). | - |
4073 | */ | - |
4074 | - | |
4075 | bool QRegExp::isEmpty() const | - |
4076 | { | - |
4077 | return priv->engineKey.pattern.isEmpty(); | - |
4078 | } | - |
4079 | - | |
4080 | /*! | - |
4081 | Returns \c true if the regular expression is valid; otherwise returns | - |
4082 | false. An invalid regular expression never matches. | - |
4083 | - | |
4084 | The pattern \b{[a-z} is an example of an invalid pattern, since | - |
4085 | it lacks a closing square bracket. | - |
4086 | - | |
4087 | Note that the validity of a regexp may also depend on the setting | - |
4088 | of the wildcard flag, for example \b{*.html} is a valid | - |
4089 | wildcard regexp but an invalid full regexp. | - |
4090 | - | |
4091 | \sa errorString() | - |
4092 | */ | - |
4093 | bool QRegExp::isValid() const | - |
4094 | { | - |
4095 | if (priv->engineKey.pattern.isEmpty()) { | - |
4096 | return true; | - |
4097 | } else { | - |
4098 | prepareEngine(priv); | - |
4099 | return priv->eng->isValid(); | - |
4100 | } | - |
4101 | } | - |
4102 | - | |
4103 | /*! | - |
4104 | Returns the pattern string of the regular expression. The pattern | - |
4105 | has either regular expression syntax or wildcard syntax, depending | - |
4106 | on patternSyntax(). | - |
4107 | - | |
4108 | \sa patternSyntax(), caseSensitivity() | - |
4109 | */ | - |
4110 | QString QRegExp::pattern() const | - |
4111 | { | - |
4112 | return priv->engineKey.pattern; | - |
4113 | } | - |
4114 | - | |
4115 | /*! | - |
4116 | Sets the pattern string to \a pattern. The case sensitivity, | - |
4117 | wildcard, and minimal matching options are not changed. | - |
4118 | - | |
4119 | \sa setPatternSyntax(), setCaseSensitivity() | - |
4120 | */ | - |
4121 | void QRegExp::setPattern(const QString &pattern) | - |
4122 | { | - |
4123 | if (priv->engineKey.pattern != pattern) { | - |
4124 | invalidateEngine(priv); | - |
4125 | priv->engineKey.pattern = pattern; | - |
4126 | } | - |
4127 | } | - |
4128 | - | |
4129 | /*! | - |
4130 | Returns Qt::CaseSensitive if the regexp is matched case | - |
4131 | sensitively; otherwise returns Qt::CaseInsensitive. | - |
4132 | - | |
4133 | \sa patternSyntax(), pattern(), isMinimal() | - |
4134 | */ | - |
4135 | Qt::CaseSensitivity QRegExp::caseSensitivity() const | - |
4136 | { | - |
4137 | return priv->engineKey.cs; | - |
4138 | } | - |
4139 | - | |
4140 | /*! | - |
4141 | Sets case sensitive matching to \a cs. | - |
4142 | - | |
4143 | If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches | - |
4144 | \c{readme.txt} but not \c{README.TXT}. | - |
4145 | - | |
4146 | \sa setPatternSyntax(), setPattern(), setMinimal() | - |
4147 | */ | - |
4148 | void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs) | - |
4149 | { | - |
4150 | if ((bool)cs != (bool)priv->engineKey.cs) { | - |
4151 | invalidateEngine(priv); | - |
4152 | priv->engineKey.cs = cs; | - |
4153 | } | - |
4154 | } | - |
4155 | - | |
4156 | /*! | - |
4157 | Returns the syntax used by the regular expression. The default is | - |
4158 | QRegExp::RegExp. | - |
4159 | - | |
4160 | \sa pattern(), caseSensitivity() | - |
4161 | */ | - |
4162 | QRegExp::PatternSyntax QRegExp::patternSyntax() const | - |
4163 | { | - |
4164 | return priv->engineKey.patternSyntax; | - |
4165 | } | - |
4166 | - | |
4167 | /*! | - |
4168 | Sets the syntax mode for the regular expression. The default is | - |
4169 | QRegExp::RegExp. | - |
4170 | - | |
4171 | Setting \a syntax to QRegExp::Wildcard enables simple shell-like | - |
4172 | \l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the | - |
4173 | string \c{readme.txt} in wildcard mode, but does not match | - |
4174 | \c{readme}. | - |
4175 | - | |
4176 | Setting \a syntax to QRegExp::FixedString means that the pattern | - |
4177 | is interpreted as a plain string. Special characters (e.g., | - |
4178 | backslash) don't need to be escaped then. | - |
4179 | - | |
4180 | \sa setPattern(), setCaseSensitivity(), escape() | - |
4181 | */ | - |
4182 | void QRegExp::setPatternSyntax(PatternSyntax syntax) | - |
4183 | { | - |
4184 | if (syntax != priv->engineKey.patternSyntax) { | - |
4185 | invalidateEngine(priv); | - |
4186 | priv->engineKey.patternSyntax = syntax; | - |
4187 | } | - |
4188 | } | - |
4189 | - | |
4190 | /*! | - |
4191 | Returns \c true if minimal (non-greedy) matching is enabled; | - |
4192 | otherwise returns \c false. | - |
4193 | - | |
4194 | \sa caseSensitivity(), setMinimal() | - |
4195 | */ | - |
4196 | bool QRegExp::isMinimal() const | - |
4197 | { | - |
4198 | return priv->minimal; | - |
4199 | } | - |
4200 | - | |
4201 | /*! | - |
4202 | Enables or disables minimal matching. If \a minimal is false, | - |
4203 | matching is greedy (maximal) which is the default. | - |
4204 | - | |
4205 | For example, suppose we have the input string "We must be | - |
4206 | <b>bold</b>, very <b>bold</b>!" and the pattern | - |
4207 | \b{<b>.*</b>}. With the default greedy (maximal) matching, | - |
4208 | the match is "We must be \underline{<b>bold</b>, very | - |
4209 | <b>bold</b>}!". But with minimal (non-greedy) matching, the | - |
4210 | first match is: "We must be \underline{<b>bold</b>}, very | - |
4211 | <b>bold</b>!" and the second match is "We must be <b>bold</b>, | - |
4212 | very \underline{<b>bold</b>}!". In practice we might use the pattern | - |
4213 | \b{<b>[^<]*\</b>} instead, although this will still fail for | - |
4214 | nested tags. | - |
4215 | - | |
4216 | \sa setCaseSensitivity() | - |
4217 | */ | - |
4218 | void QRegExp::setMinimal(bool minimal) | - |
4219 | { | - |
4220 | priv->minimal = minimal; | - |
4221 | } | - |
4222 | - | |
4223 | // ### Qt 5: make non-const | - |
4224 | /*! | - |
4225 | Returns \c true if \a str is matched exactly by this regular | - |
4226 | expression; otherwise returns \c false. You can determine how much of | - |
4227 | the string was matched by calling matchedLength(). | - |
4228 | - | |
4229 | For a given regexp string R, exactMatch("R") is the equivalent of | - |
4230 | indexIn("^R$") since exactMatch() effectively encloses the regexp | - |
4231 | in the start of string and end of string anchors, except that it | - |
4232 | sets matchedLength() differently. | - |
4233 | - | |
4234 | For example, if the regular expression is \b{blue}, then | - |
4235 | exactMatch() returns \c true only for input \c blue. For inputs \c | - |
4236 | bluebell, \c blutak and \c lightblue, exactMatch() returns \c false | - |
4237 | and matchedLength() will return 4, 3 and 0 respectively. | - |
4238 | - | |
4239 | Although const, this function sets matchedLength(), | - |
4240 | capturedTexts(), and pos(). | - |
4241 | - | |
4242 | \sa indexIn(), lastIndexIn() | - |
4243 | */ | - |
4244 | bool QRegExp::exactMatch(const QString &str) const | - |
4245 | { | - |
4246 | prepareEngineForMatch(priv, str); | - |
4247 | priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0); | - |
4248 | if (priv->matchState.captured[1] == str.length()) { | - |
4249 | return true; | - |
4250 | } else { | - |
4251 | priv->matchState.captured[0] = 0; | - |
4252 | priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen; | - |
4253 | return false; | - |
4254 | } | - |
4255 | } | - |
4256 | - | |
4257 | // ### Qt 5: make non-const | - |
4258 | /*! | - |
4259 | Attempts to find a match in \a str from position \a offset (0 by | - |
4260 | default). If \a offset is -1, the search starts at the last | - |
4261 | character; if -2, at the next to last character; etc. | - |
4262 | - | |
4263 | Returns the position of the first match, or -1 if there was no | - |
4264 | match. | - |
4265 | - | |
4266 | The \a caretMode parameter can be used to instruct whether \b{^} | - |
4267 | should match at index 0 or at \a offset. | - |
4268 | - | |
4269 | You might prefer to use QString::indexOf(), QString::contains(), | - |
4270 | or even QStringList::filter(). To replace matches use | - |
4271 | QString::replace(). | - |
4272 | - | |
4273 | Example: | - |
4274 | \snippet code/src_corelib_tools_qregexp.cpp 13 | - |
4275 | - | |
4276 | Although const, this function sets matchedLength(), | - |
4277 | capturedTexts() and pos(). | - |
4278 | - | |
4279 | If the QRegExp is a wildcard expression (see setPatternSyntax()) | - |
4280 | and you want to test a string against the whole wildcard expression, | - |
4281 | use exactMatch() instead of this function. | - |
4282 | - | |
4283 | \sa lastIndexIn(), exactMatch() | - |
4284 | */ | - |
4285 | - | |
4286 | int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const | - |
4287 | { | - |
4288 | prepareEngineForMatch(priv, str); | - |
4289 | if (offset < 0) | - |
4290 | offset += str.length(); | - |
4291 | priv->matchState.match(str.unicode(), str.length(), offset, | - |
4292 | priv->minimal, false, caretIndex(offset, caretMode)); | - |
4293 | return priv->matchState.captured[0]; | - |
4294 | } | - |
4295 | - | |
4296 | // ### Qt 5: make non-const | - |
4297 | /*! | - |
4298 | Attempts to find a match backwards in \a str from position \a | - |
4299 | offset. If \a offset is -1 (the default), the search starts at the | - |
4300 | last character; if -2, at the next to last character; etc. | - |
4301 | - | |
4302 | Returns the position of the first match, or -1 if there was no | - |
4303 | match. | - |
4304 | - | |
4305 | The \a caretMode parameter can be used to instruct whether \b{^} | - |
4306 | should match at index 0 or at \a offset. | - |
4307 | - | |
4308 | Although const, this function sets matchedLength(), | - |
4309 | capturedTexts() and pos(). | - |
4310 | - | |
4311 | \warning Searching backwards is much slower than searching | - |
4312 | forwards. | - |
4313 | - | |
4314 | \sa indexIn(), exactMatch() | - |
4315 | */ | - |
4316 | - | |
4317 | int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const | - |
4318 | { | - |
4319 | prepareEngineForMatch(priv, str); | - |
4320 | if (offset < 0) | - |
4321 | offset += str.length(); | - |
4322 | if (offset < 0 || offset > str.length()) { | - |
4323 | memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int)); | - |
4324 | return -1; | - |
4325 | } | - |
4326 | - | |
4327 | while (offset >= 0) { | - |
4328 | priv->matchState.match(str.unicode(), str.length(), offset, | - |
4329 | priv->minimal, true, caretIndex(offset, caretMode)); | - |
4330 | if (priv->matchState.captured[0] == offset) | - |
4331 | return offset; | - |
4332 | --offset; | - |
4333 | } | - |
4334 | return -1; | - |
4335 | } | - |
4336 | - | |
4337 | /*! | - |
4338 | Returns the length of the last matched string, or -1 if there was | - |
4339 | no match. | - |
4340 | - | |
4341 | \sa exactMatch(), indexIn(), lastIndexIn() | - |
4342 | */ | - |
4343 | int QRegExp::matchedLength() const | - |
4344 | { | - |
4345 | return priv->matchState.captured[1]; | - |
4346 | } | - |
4347 | - | |
4348 | #ifndef QT_NO_REGEXP_CAPTURE | - |
4349 | - | |
4350 | /*! | - |
4351 | \since 4.6 | - |
4352 | Returns the number of captures contained in the regular expression. | - |
4353 | */ | - |
4354 | int QRegExp::captureCount() const | - |
4355 | { | - |
4356 | prepareEngine(priv); | - |
4357 | return priv->eng->captureCount(); | - |
4358 | } | - |
4359 | - | |
4360 | /*! | - |
4361 | Returns a list of the captured text strings. | - |
4362 | - | |
4363 | The first string in the list is the entire matched string. Each | - |
4364 | subsequent list element contains a string that matched a | - |
4365 | (capturing) subexpression of the regexp. | - |
4366 | - | |
4367 | For example: | - |
4368 | \snippet code/src_corelib_tools_qregexp.cpp 14 | - |
4369 | - | |
4370 | The above example also captures elements that may be present but | - |
4371 | which we have no interest in. This problem can be solved by using | - |
4372 | non-capturing parentheses: | - |
4373 | - | |
4374 | \snippet code/src_corelib_tools_qregexp.cpp 15 | - |
4375 | - | |
4376 | Note that if you want to iterate over the list, you should iterate | - |
4377 | over a copy, e.g. | - |
4378 | \snippet code/src_corelib_tools_qregexp.cpp 16 | - |
4379 | - | |
4380 | Some regexps can match an indeterminate number of times. For | - |
4381 | example if the input string is "Offsets: 12 14 99 231 7" and the | - |
4382 | regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of | - |
4383 | all the numbers matched. However, after calling | - |
4384 | \c{rx.indexIn(str)}, capturedTexts() will return the list ("12", | - |
4385 | "12"), i.e. the entire match was "12" and the first subexpression | - |
4386 | matched was "12". The correct approach is to use cap() in a | - |
4387 | \l{QRegExp#cap_in_a_loop}{loop}. | - |
4388 | - | |
4389 | The order of elements in the string list is as follows. The first | - |
4390 | element is the entire matching string. Each subsequent element | - |
4391 | corresponds to the next capturing open left parentheses. Thus | - |
4392 | capturedTexts()[1] is the text of the first capturing parentheses, | - |
4393 | capturedTexts()[2] is the text of the second and so on | - |
4394 | (corresponding to $1, $2, etc., in some other regexp languages). | - |
4395 | - | |
4396 | \sa cap(), pos() | - |
4397 | */ | - |
4398 | QStringList QRegExp::capturedTexts() const | - |
4399 | { | - |
4400 | if (priv->capturedCache.isEmpty()) { | - |
4401 | prepareEngine(priv); | - |
4402 | const int *captured = priv->matchState.captured; | - |
4403 | int n = priv->matchState.capturedSize; | - |
4404 | - | |
4405 | for (int i = 0; i < n; i += 2) { | - |
4406 | QString m; | - |
4407 | if (captured[i + 1] == 0) | - |
4408 | m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty | - |
4409 | else if (captured[i] >= 0) | - |
4410 | m = priv->t.mid(captured[i], captured[i + 1]); | - |
4411 | priv->capturedCache.append(m); | - |
4412 | } | - |
4413 | priv->t.clear(); | - |
4414 | } | - |
4415 | return priv->capturedCache; | - |
4416 | } | - |
4417 | - | |
4418 | /*! | - |
4419 | \internal | - |
4420 | */ | - |
4421 | QStringList QRegExp::capturedTexts() | - |
4422 | { | - |
4423 | return const_cast<const QRegExp *>(this)->capturedTexts(); | - |
4424 | } | - |
4425 | - | |
4426 | /*! | - |
4427 | Returns the text captured by the \a nth subexpression. The entire | - |
4428 | match has index 0 and the parenthesized subexpressions have | - |
4429 | indexes starting from 1 (excluding non-capturing parentheses). | - |
4430 | - | |
4431 | \snippet code/src_corelib_tools_qregexp.cpp 17 | - |
4432 | - | |
4433 | The order of elements matched by cap() is as follows. The first | - |
4434 | element, cap(0), is the entire matching string. Each subsequent | - |
4435 | element corresponds to the next capturing open left parentheses. | - |
4436 | Thus cap(1) is the text of the first capturing parentheses, cap(2) | - |
4437 | is the text of the second, and so on. | - |
4438 | - | |
4439 | \sa capturedTexts(), pos() | - |
4440 | */ | - |
4441 | QString QRegExp::cap(int nth) const | - |
4442 | { | - |
4443 | return capturedTexts().value(nth); | - |
4444 | } | - |
4445 | - | |
4446 | /*! | - |
4447 | \internal | - |
4448 | */ | - |
4449 | QString QRegExp::cap(int nth) | - |
4450 | { | - |
4451 | return const_cast<const QRegExp *>(this)->cap(nth); | - |
4452 | } | - |
4453 | - | |
4454 | /*! | - |
4455 | Returns the position of the \a nth captured text in the searched | - |
4456 | string. If \a nth is 0 (the default), pos() returns the position | - |
4457 | of the whole match. | - |
4458 | - | |
4459 | Example: | - |
4460 | \snippet code/src_corelib_tools_qregexp.cpp 18 | - |
4461 | - | |
4462 | For zero-length matches, pos() always returns -1. (For example, if | - |
4463 | cap(4) would return an empty string, pos(4) returns -1.) This is | - |
4464 | a feature of the implementation. | - |
4465 | - | |
4466 | \sa cap(), capturedTexts() | - |
4467 | */ | - |
4468 | int QRegExp::pos(int nth) const | - |
4469 | { | - |
4470 | if (nth < 0 || nth >= priv->matchState.capturedSize / 2) | - |
4471 | return -1; | - |
4472 | else | - |
4473 | return priv->matchState.captured[2 * nth]; | - |
4474 | } | - |
4475 | - | |
4476 | /*! | - |
4477 | \internal | - |
4478 | */ | - |
4479 | int QRegExp::pos(int nth) | - |
4480 | { | - |
4481 | return const_cast<const QRegExp *>(this)->pos(nth); | - |
4482 | } | - |
4483 | - | |
4484 | /*! | - |
4485 | Returns a text string that explains why a regexp pattern is | - |
4486 | invalid, if that is the case; otherwise returns "no error occurred". | - |
4487 | - | |
4488 | \sa isValid() | - |
4489 | */ | - |
4490 | QString QRegExp::errorString() const | - |
4491 | { | - |
4492 | if (isValid()) { | - |
4493 | return QString::fromLatin1(RXERR_OK); | - |
4494 | } else { | - |
4495 | return priv->eng->errorString(); | - |
4496 | } | - |
4497 | } | - |
4498 | - | |
4499 | /*! | - |
4500 | \internal | - |
4501 | */ | - |
4502 | QString QRegExp::errorString() | - |
4503 | { | - |
4504 | return const_cast<const QRegExp *>(this)->errorString(); | - |
4505 | } | - |
4506 | #endif | - |
4507 | - | |
4508 | /*! | - |
4509 | Returns the string \a str with every regexp special character | - |
4510 | escaped with a backslash. The special characters are $, (, ), *, +, | - |
4511 | ., ?, [, \, ], ^, {, | and }. | - |
4512 | - | |
4513 | Example: | - |
4514 | - | |
4515 | \snippet code/src_corelib_tools_qregexp.cpp 19 | - |
4516 | - | |
4517 | This function is useful to construct regexp patterns dynamically: | - |
4518 | - | |
4519 | \snippet code/src_corelib_tools_qregexp.cpp 20 | - |
4520 | - | |
4521 | \sa setPatternSyntax() | - |
4522 | */ | - |
4523 | QString QRegExp::escape(const QString &str) | - |
4524 | { | - |
4525 | QString quoted; | - |
4526 | const int count = str.count(); | - |
4527 | quoted.reserve(count * 2); | - |
4528 | const QLatin1Char backslash('\\'); | - |
4529 | for (int i = 0; i < count; i++) { | - |
4530 | switch (str.at(i).toLatin1()) { | - |
4531 | case '$': | - |
4532 | case '(': | - |
4533 | case ')': | - |
4534 | case '*': | - |
4535 | case '+': | - |
4536 | case '.': | - |
4537 | case '?': | - |
4538 | case '[': | - |
4539 | case '\\': | - |
4540 | case ']': | - |
4541 | case '^': | - |
4542 | case '{': | - |
4543 | case '|': | - |
4544 | case '}': | - |
4545 | quoted.append(backslash); | - |
4546 | } | - |
4547 | quoted.append(str.at(i)); | - |
4548 | } | - |
4549 | return quoted; | - |
4550 | } | - |
4551 | - | |
4552 | - | |
4553 | #ifndef QT_NO_DATASTREAM | - |
4554 | /*! | - |
4555 | \relates QRegExp | - |
4556 | - | |
4557 | Writes the regular expression \a regExp to stream \a out. | - |
4558 | - | |
4559 | \sa {Serializing Qt Data Types} | - |
4560 | */ | - |
4561 | QDataStream &operator<<(QDataStream &out, const QRegExp ®Exp) | - |
4562 | { | - |
4563 | return out << regExp.pattern() << (quint8)regExp.caseSensitivity() | - |
4564 | << (quint8)regExp.patternSyntax() | - |
4565 | << (quint8)!!regExp.isMinimal(); | - |
4566 | } | - |
4567 | - | |
4568 | /*! | - |
4569 | \relates QRegExp | - |
4570 | - | |
4571 | Reads a regular expression from stream \a in into \a regExp. | - |
4572 | - | |
4573 | \sa {Serializing Qt Data Types} | - |
4574 | */ | - |
4575 | QDataStream &operator>>(QDataStream &in, QRegExp ®Exp) | - |
4576 | { | - |
4577 | QString pattern; | - |
4578 | quint8 cs; | - |
4579 | quint8 patternSyntax; | - |
4580 | quint8 isMinimal; | - |
4581 | - | |
4582 | in >> pattern >> cs >> patternSyntax >> isMinimal; | - |
4583 | - | |
4584 | QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs), | - |
4585 | QRegExp::PatternSyntax(patternSyntax)); | - |
4586 | - | |
4587 | newRegExp.setMinimal(isMinimal); | - |
4588 | regExp = newRegExp; | - |
4589 | return in; | - |
4590 | } | - |
4591 | #endif // QT_NO_DATASTREAM | - |
4592 | - | |
4593 | #ifndef QT_NO_DEBUG_STREAM | - |
4594 | QDebug operator<<(QDebug dbg, const QRegExp &r) | - |
4595 | { | - |
4596 | QDebugStateSaver saver(dbg); | - |
4597 | dbg.nospace() << "QRegExp(patternSyntax=" << r.patternSyntax() | - |
4598 | << ", pattern='"<< r.pattern() << "')"; | - |
4599 | return dbg; | - |
4600 | } | - |
4601 | #endif | - |
4602 | - | |
4603 | QT_END_NAMESPACE | - |
Source code | Switch to Preprocessed file |