qregexp.cpp

Jenkins

qregexp.cpp

Absolute File Name:

/home/qt/qt5_coco/qt5/qtbase/src/corelib/tools/qregexp.cpp

Line	Source	Count
1	/****************************************************************************	-
2	**	-
3	** Copyright (C) 2016 The Qt Company Ltd.	-
4	** Contact: https://www.qt.io/licensing/	-
5	**	-
6	** This file is part of the QtCore module of the Qt Toolkit.	-
7	**	-
8	** $QT_BEGIN_LICENSE:LGPL$	-
9	** Commercial License Usage	-
10	** Licensees holding valid commercial Qt licenses may use this file in	-
11	** accordance with the commercial license agreement provided with the	-
12	** Software or, alternatively, in accordance with the terms contained in	-
13	** a written agreement between you and The Qt Company. For licensing terms	-
14	** and conditions see https://www.qt.io/terms-conditions. For further	-
15	** information use the contact form at https://www.qt.io/contact-us.	-
16	**	-
17	** GNU Lesser General Public License Usage	-
18	** Alternatively, this file may be used under the terms of the GNU Lesser	-
19	** General Public License version 3 as published by the Free Software	-
20	** Foundation and appearing in the file LICENSE.LGPL3 included in the	-
21	** packaging of this file. Please review the following information to	-
22	** ensure the GNU Lesser General Public License version 3 requirements	-
23	** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.	-
24	**	-
25	** GNU General Public License Usage	-
26	** Alternatively, this file may be used under the terms of the GNU	-
27	** General Public License version 2.0 or (at your option) the GNU General	-
28	** Public license version 3 or any later version approved by the KDE Free	-
29	** Qt Foundation. The licenses are as published by the Free Software	-
30	** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3	-
31	** included in the packaging of this file. Please review the following	-
32	** information to ensure the GNU General Public License requirements will	-
33	** be met: https://www.gnu.org/licenses/gpl-2.0.html and	-
34	** https://www.gnu.org/licenses/gpl-3.0.html.	-
35	**	-
36	** $QT_END_LICENSE$	-
37	**	-
38	****************************************************************************/	-
39		-
40	#include "qregexp.h"	-
41		-
42	#include "qalgorithms.h"	-
43	#include "qbitarray.h"	-
44	#include "qcache.h"	-
45	#include "qdatastream.h"	-
46	#include "qdebug.h"	-
47	#include "qhashfunctions.h"	-
48	#include "qlist.h"	-
49	#include "qmap.h"	-
50	#include "qmutex.h"	-
51	#include "qstring.h"	-
52	#include "qstringlist.h"	-
53	#include "qstringmatcher.h"	-
54	#include "qvector.h"	-
55		-
56	#include <limits.h>	-
57	#include <algorithm>	-
58		-
59	QT_BEGIN_NAMESPACE	-
60		-
61	int qFindString(const QChar *haystack, int haystackLen, int from,	-
62	const QChar *needle, int needleLen, Qt::CaseSensitivity cs);	-
63		-
64	// error strings for the regexp parser	-
65	#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")	-
66	#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")	-
67	#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")	-
68	#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")	-
69	#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")	-
70	#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")	-
71	#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")	-
72	#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")	-
73	#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")	-
74	#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")	-
75	#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")	-
76	#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")	-
77		-
78	/*!	-
79	\class QRegExp	-
80	\inmodule QtCore	-
81	\reentrant	-
82	\brief The QRegExp class provides pattern matching using regular expressions.	-
83		-
84	\ingroup tools	-
85	\ingroup shared	-
86		-
87	\keyword regular expression	-
88		-
89	A regular expression, or "regexp", is a pattern for matching	-
90	substrings in a text. This is useful in many contexts, e.g.,	-
91		-
92	\table	-
93	\row \li Validation	-
94	\li A regexp can test whether a substring meets some criteria,	-
95	e.g. is an integer or contains no whitespace.	-
96	\row \li Searching	-
97	\li A regexp provides more powerful pattern matching than	-
98	simple substring matching, e.g., match one of the words	-
99	\e{mail}, \e{letter} or \e{correspondence}, but none of the	-
100	words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.	-
101	\row \li Search and Replace	-
102	\li A regexp can replace all occurrences of a substring with a	-
103	different substring, e.g., replace all occurrences of \e{&}	-
104	with \e{\&amp;} except where the \e{&} is already followed by	-
105	an \e{amp;}.	-
106	\row \li String Splitting	-
107	\li A regexp can be used to identify where a string should be	-
108	split apart, e.g. splitting tab-delimited strings.	-
109	\endtable	-
110		-
111	A brief introduction to regexps is presented, followed by a	-
112	description of Qt's regexp language, some examples, and the function	-
113	documentation itself. QRegExp is modeled on Perl's regexp	-
114	language. It fully supports Unicode. QRegExp can also be used in a	-
115	simpler, \e{wildcard mode} that is similar to the functionality	-
116	found in command shells. The syntax rules used by QRegExp can be	-
117	changed with setPatternSyntax(). In particular, the pattern syntax	-
118	can be set to QRegExp::FixedString, which means the pattern to be	-
119	matched is interpreted as a plain string, i.e., special characters	-
120	(e.g., backslash) are not escaped.	-
121		-
122	A good text on regexps is \e {Mastering Regular Expressions}	-
123	(Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.	-
124		-
125	\note In Qt 5, the new QRegularExpression class provides a Perl	-
126	compatible implementation of regular expressions and is recommended	-
127	in place of QRegExp.	-
128		-
129	\tableofcontents	-
130		-
131	\section1 Introduction	-
132		-
133	Regexps are built up from expressions, quantifiers, and	-
134	assertions. The simplest expression is a character, e.g. \b{x}	-
135	or \b{5}. An expression can also be a set of characters	-
136	enclosed in square brackets. \b{[ABCD]} will match an \b{A}	-
137	or a \b{B} or a \b{C} or a \b{D}. We can write this same	-
138	expression as \b{[A-D]}, and an expression to match any	-
139	capital letter in the English alphabet is written as	-
140	\b{[A-Z]}.	-
141		-
142	A quantifier specifies the number of occurrences of an expression	-
143	that must be matched. \b{x{1,1}} means match one and only one	-
144	\b{x}. \b{x{1,5}} means match a sequence of \b{x}	-
145	characters that contains at least one \b{x} but no more than	-
146	five.	-
147		-
148	Note that in general regexps cannot be used to check for balanced	-
149	brackets or tags. For example, a regexp can be written to match an	-
150	opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags	-
151	are not nested, but if the \c{<b>} tags are nested, that same	-
152	regexp will match an opening \c{<b>} tag with the wrong closing	-
153	\c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the	-
154	first \c{<b>} would be matched with the first \c{</b>}, which is	-
155	not correct. However, it is possible to write a regexp that will	-
156	match nested brackets or tags correctly, but only if the number of	-
157	nesting levels is fixed and known. If the number of nesting levels	-
158	is not fixed and known, it is impossible to write a regexp that	-
159	will not fail.	-
160		-
161	Suppose we want a regexp to match integers in the range 0 to 99.	-
162	At least one digit is required, so we start with the expression	-
163	\b{[0-9]{1,1}}, which matches a single digit exactly once. This	-
164	regexp matches integers in the range 0 to 9. To match integers up	-
165	to 99, increase the maximum number of occurrences to 2, so the	-
166	regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the	-
167	original requirement to match integers from 0 to 99, but it will	-
168	also match integers that occur in the middle of strings. If we	-
169	want the matched integer to be the whole string, we must use the	-
170	anchor assertions, \b{^} (caret) and \b{$} (dollar). When	-
171	\b{^} is the first character in a regexp, it means the regexp	-
172	must match from the beginning of the string. When \b{$} is the	-
173	last character of the regexp, it means the regexp must match to	-
174	the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.	-
175	Note that assertions, e.g. \b{^} and \b{$}, do not match	-
176	characters but locations in the string.	-
177		-
178	If you have seen regexps described elsewhere, they may have looked	-
179	different from the ones shown here. This is because some sets of	-
180	characters and some quantifiers are so common that they have been	-
181	given special symbols to represent them. \b{[0-9]} can be	-
182	replaced with the symbol \b{\\d}. The quantifier to match	-
183	exactly one occurrence, \b{{1,1}}, can be replaced with the	-
184	expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So	-
185	our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can	-
186	also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of	-
187	the string, match a digit, followed immediately by 0 or 1 digits}.	-
188	In practice, it would be written as \b{^\\d\\d?$}. The \b{?}	-
189	is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1	-
190	occurrences. \b{?} makes an expression optional. The regexp	-
191	\b{^\\d\\d?$} means \e{From the beginning of the string, match	-
192	one digit, followed immediately by 0 or 1 more digit, followed	-
193	immediately by end of string}.	-
194		-
195	To write a regexp that matches one of the words 'mail' \e or	-
196	'letter' \e or 'correspondence' but does not match words that	-
197	contain these words, e.g., 'email', 'mailman', 'mailer', and	-
198	'letterbox', start with a regexp that matches 'mail'. Expressed	-
199	fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because	-
200	a character expression is automatically quantified by	-
201	\b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an	-
202	'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now	-
203	we can use the vertical bar \b{\|}, which means \b{or}, to	-
204	include the other two words, so our regexp for matching any of the	-
205	three words becomes \b{mail\|letter\|correspondence}. Match	-
206	'mail' \b{or} 'letter' \b{or} 'correspondence'. While this	-
207	regexp will match one of the three words we want to match, it will	-
208	also match words we don't want to match, e.g., 'email'. To	-
209	prevent the regexp from matching unwanted words, we must tell it	-
210	to begin and end the match at word boundaries. First we enclose	-
211	our regexp in parentheses, \b{(mail\|letter\|correspondence)}.	-
212	Parentheses group expressions together, and they identify a part	-
213	of the regexp that we wish to \l{capturing text}{capture}.	-
214	Enclosing the expression in parentheses allows us to use it as a	-
215	component in more complex regexps. It also allows us to examine	-
216	which of the three words was actually matched. To force the match	-
217	to begin and end on word boundaries, we enclose the regexp in	-
218	\b{\\b} \e{word boundary} assertions:	-
219	\b{\\b(mail\|letter\|correspondence)\\b}. Now the regexp means:	-
220	\e{Match a word boundary, followed by the regexp in parentheses,	-
221	followed by a word boundary}. The \b{\\b} assertion matches a	-
222	\e position in the string, not a \e character. A word boundary is	-
223	any non-word character, e.g., a space, newline, or the beginning	-
224	or ending of a string.	-
225		-
226	If we want to replace ampersand characters with the HTML entity	-
227	\b{\&amp;}, the regexp to match is simply \b{\&}. But this	-
228	regexp will also match ampersands that have already been converted	-
229	to HTML entities. We want to replace only ampersands that are not	-
230	already followed by \b{amp;}. For this, we need the negative	-
231	lookahead assertion, \b{(?!}__\b{)}. The regexp can then be	-
232	written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}	-
233	\b{not} \e{followed by} \b{amp;}.	-
234		-
235	If we want to count all the occurrences of 'Eric' and 'Eirik' in a	-
236	string, two valid solutions are \b{\\b(Eric\|Eirik)\\b} and	-
237	\b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is	-
238	required to avoid matching words that contain either name,	-
239	e.g. 'Ericsson'. Note that the second regexp matches more	-
240	spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.	-
241		-
242	Some of the examples discussed above are implemented in the	-
243	\l{#code-examples}{code examples} section.	-
244		-
245	\target characters-and-abbreviations-for-sets-of-characters	-
246	\section1 Characters and Abbreviations for Sets of Characters	-
247		-
248	\table	-
249	\header \li Element \li Meaning	-
250	\row \li \b{c}	-
251	\li A character represents itself unless it has a special	-
252	regexp meaning. e.g. \b{c} matches the character \e c.	-
253	\row \li \b{\\c}	-
254	\li A character that follows a backslash matches the character	-
255	itself, except as specified below. e.g., To match a literal	-
256	caret at the beginning of a string, write \b{\\^}.	-
257	\row \li \b{\\a}	-
258	\li Matches the ASCII bell (BEL, 0x07).	-
259	\row \li \b{\\f}	-
260	\li Matches the ASCII form feed (FF, 0x0C).	-
261	\row \li \b{\\n}	-
262	\li Matches the ASCII line feed (LF, 0x0A, Unix newline).	-
263	\row \li \b{\\r}	-
264	\li Matches the ASCII carriage return (CR, 0x0D).	-
265	\row \li \b{\\t}	-
266	\li Matches the ASCII horizontal tab (HT, 0x09).	-
267	\row \li \b{\\v}	-
268	\li Matches the ASCII vertical tab (VT, 0x0B).	-
269	\row \li \b{\\x\e{hhhh}}	-
270	\li Matches the Unicode character corresponding to the	-
271	hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).	-
272	\row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})	-
273	\li Matches the ASCII/Latin1 character for the octal number	-
274	\e{ooo} (between 0 and 0377).	-
275	\row \li \b{. (dot)}	-
276	\li Matches any character (including newline).	-
277	\row \li \b{\\d}	-
278	\li Matches a digit (QChar::isDigit()).	-
279	\row \li \b{\\D}	-
280	\li Matches a non-digit.	-
281	\row \li \b{\\s}	-
282	\li Matches a whitespace character (QChar::isSpace()).	-
283	\row \li \b{\\S}	-
284	\li Matches a non-whitespace character.	-
285	\row \li \b{\\w}	-
286	\li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').	-
287	\row \li \b{\\W}	-
288	\li Matches a non-word character.	-
289	\row \li \b{\\\e{n}}	-
290	\li The \e{n}-th backreference, e.g. \\1, \\2, etc.	-
291	\endtable	-
292		-
293	\b{Note:} The C++ compiler transforms backslashes in strings.	-
294	To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.	-
295	To match the backslash character itself, enter it four times, i.e.	-
296	\c{\\\\}.	-
297		-
298	\target sets-of-characters	-
299	\section1 Sets of Characters	-
300		-
301	Square brackets mean match any character contained in the square	-
302	brackets. The character set abbreviations described above can	-
303	appear in a character set in square brackets. Except for the	-
304	character set abbreviations and the following two exceptions,	-
305	characters do not have special meanings in square brackets.	-
306		-
307	\table	-
308	\row \li \b{^}	-
309		-
310	\li The caret negates the character set if it occurs as the	-
311	first character (i.e. immediately after the opening square	-
312	bracket). \b{[abc]} matches 'a' or 'b' or 'c', but	-
313	\b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.	-
314		-
315	\row \li \b{-}	-
316		-
317	\li The dash indicates a range of characters. \b{[W-Z]}	-
318	matches 'W' or 'X' or 'Y' or 'Z'.	-
319		-
320	\endtable	-
321		-
322	Using the predefined character set abbreviations is more portable	-
323	than using character ranges across platforms and languages. For	-
324	example, \b{[0-9]} matches a digit in Western alphabets but	-
325	\b{\\d} matches a digit in \e any alphabet.	-
326		-
327	Note: In other regexp documentation, sets of characters are often	-
328	called "character classes".	-
329		-
330	\target quantifiers	-
331	\section1 Quantifiers	-
332		-
333	By default, an expression is automatically quantified by	-
334	\b{{1,1}}, i.e. it should occur exactly once. In the following	-
335	list, \b{\e {E}} stands for expression. An expression is a	-
336	character, or an abbreviation for a set of characters, or a set of	-
337	characters in square brackets, or an expression in parentheses.	-
338		-
339	\table	-
340	\row \li \b{\e {E}?}	-
341		-
342	\li Matches zero or one occurrence of \e E. This quantifier	-
343	means \e{The previous expression is optional}, because it	-
344	will match whether or not the expression is found. \b{\e	-
345	{E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}	-
346	matches 'dent' or 'dents'.	-
347		-
348	\row \li \b{\e {E}+}	-
349		-
350	\li Matches one or more occurrences of \e E. \b{\e {E}+} is	-
351	the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',	-
352	'00', '000', etc.	-
353		-
354	\row \li \b{\e {E}*}	-
355		-
356	\li Matches zero or more occurrences of \e E. It is the same	-
357	as \b{\e {E}{0,}}. The \b{*} quantifier is often used	-
358	in error where \b{+} should be used. For example, if	-
359	\b{\\s*$} is used in an expression to match strings that	-
360	end in whitespace, it will match every string because	-
361	\b{\\s*$} means \e{Match zero or more whitespaces followed	-
362	by end of string}. The correct regexp to match strings that	-
363	have at least one trailing whitespace character is	-
364	\b{\\s+$}.	-
365		-
366	\row \li \b{\e {E}{n}}	-
367		-
368	\li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}	-
369	is the same as repeating \e E \e n times. For example,	-
370	\b{x{5}} is the same as \b{xxxxx}. It is also the same	-
371	as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.	-
372		-
373	\row \li \b{\e {E}{n,}}	-
374	\li Matches at least \e n occurrences of \e E.	-
375		-
376	\row \li \b{\e {E}{,m}}	-
377	\li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}	-
378	is the same as \b{\e {E}{0,m}}.	-
379		-
380	\row \li \b{\e {E}{n,m}}	-
381	\li Matches at least \e n and at most \e m occurrences of \e E.	-
382	\endtable	-
383		-
384	To apply a quantifier to more than just the preceding character,	-
385	use parentheses to group characters together in an expression. For	-
386	example, \b{tag+} matches a 't' followed by an 'a' followed by	-
387	at least one 'g', whereas \b{(tag)+} matches at least one	-
388	occurrence of 'tag'.	-
389		-
390	Note: Quantifiers are normally "greedy". They always match as much	-
391	text as they can. For example, \b{0+} matches the first zero it	-
392	finds and all the consecutive zeros after the first zero. Applied	-
393	to '20005', it matches '2\underline{000}5'. Quantifiers can be made	-
394	non-greedy, see setMinimal().	-
395		-
396	\target capturing parentheses	-
397	\target backreferences	-
398	\section1 Capturing Text	-
399		-
400	Parentheses allow us to group elements together so that we can	-
401	quantify and capture them. For example if we have the expression	-
402	\b{mail\|letter\|correspondence} that matches a string we know	-
403	that \e one of the words matched but not which one. Using	-
404	parentheses allows us to "capture" whatever is matched within	-
405	their bounds, so if we used \b{(mail\|letter\|correspondence)}	-
406	and matched this regexp against the string "I sent you some email"	-
407	we can use the cap() or capturedTexts() functions to extract the	-
408	matched characters, in this case 'mail'.	-
409		-
410	We can use captured text within the regexp itself. To refer to the	-
411	captured text we use \e backreferences which are indexed from 1,	-
412	the same as for cap(). For example we could search for duplicate	-
413	words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a	-
414	word boundary followed by one or more word characters followed by	-
415	one or more non-word characters followed by the same text as the	-
416	first parenthesized expression followed by a word boundary.	-
417		-
418	If we want to use parentheses purely for grouping and not for	-
419	capturing we can use the non-capturing syntax, e.g.	-
420	\b{(?:green\|blue)}. Non-capturing parentheses begin with '(?:' and	-
421	end with ')'. In this example we match either 'green' or 'blue' but we	-
422	do not capture the match so we only know whether or not we matched	-
423	but not which color we actually found. Using non-capturing	-
424	parentheses is more efficient than using capturing parentheses	-
425	since the regexp engine has to do less book-keeping.	-
426		-
427	Both capturing and non-capturing parentheses may be nested.	-
428		-
429	\target greedy quantifiers	-
430		-
431	For historical reasons, quantifiers (e.g. \b{*}) that apply to	-
432	capturing parentheses are more "greedy" than other quantifiers.	-
433	For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".	-
434	This behavior is different from what other regexp engines do	-
435	(notably, Perl). To obtain a more intuitive capturing behavior,	-
436	specify QRegExp::RegExp2 to the QRegExp constructor or call	-
437	setPatternSyntax(QRegExp::RegExp2).	-
438		-
439	\target cap_in_a_loop	-
440		-
441	When the number of matches cannot be determined in advance, a	-
442	common idiom is to use cap() in a loop. For example:	-
443		-
444	\snippet code/src_corelib_tools_qregexp.cpp 0	-
445		-
446	\target assertions	-
447	\section1 Assertions	-
448		-
449	Assertions make some statement about the text at the point where	-
450	they occur in the regexp but they do not match any characters. In	-
451	the following list \b{\e {E}} stands for any expression.	-
452		-
453	\table	-
454	\row \li \b{^}	-
455	\li The caret signifies the beginning of the string. If you	-
456	wish to match a literal \c{^} you must escape it by	-
457	writing \c{\\^}. For example, \b{^#include} will only	-
458	match strings which \e begin with the characters '#include'.	-
459	(When the caret is the first character of a character set it	-
460	has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)	-
461		-
462	\row \li \b{$}	-
463	\li The dollar signifies the end of the string. For example	-
464	\b{\\d\\s*$} will match strings which end with a digit	-
465	optionally followed by whitespace. If you wish to match a	-
466	literal \c{$} you must escape it by writing	-
467	\c{\\$}.	-
468		-
469	\row \li \b{\\b}	-
470	\li A word boundary. For example the regexp	-
471	\b{\\bOK\\b} means match immediately after a word	-
472	boundary (e.g. start of string or whitespace) the letter 'O'	-
473	then the letter 'K' immediately before another word boundary	-
474	(e.g. end of string or whitespace). But note that the	-
475	assertion does not actually match any whitespace so if we	-
476	write \b{(\\bOK\\b)} and we have a match it will only	-
477	contain 'OK' even if the string is "It's \underline{OK} now".	-
478		-
479	\row \li \b{\\B}	-
480	\li A non-word boundary. This assertion is true wherever	-
481	\b{\\b} is false. For example if we searched for	-
482	\b{\\Bon\\B} in "Left on" the match would fail (space	-
483	and end of string aren't non-word boundaries), but it would	-
484	match in "t\underline{on}ne".	-
485		-
486	\row \li \b{(?=\e E)}	-
487	\li Positive lookahead. This assertion is true if the	-
488	expression matches at this point in the regexp. For example,	-
489	\b{const(?=\\s+char)} matches 'const' whenever it is	-
490	followed by 'char', as in 'static \underline{const} char *'.	-
491	(Compare with \b{const\\s+char}, which matches 'static	-
492	\underline{const char} *'.)	-
493		-
494	\row \li \b{(?!\e E)}	-
495	\li Negative lookahead. This assertion is true if the	-
496	expression does not match at this point in the regexp. For	-
497	example, \b{const(?!\\s+char)} matches 'const' \e except	-
498	when it is followed by 'char'.	-
499	\endtable	-
500		-

Line

Source

Count

/****************************************************************************

** Contact: https://www.qt.io/licensing/

** This file is part of the QtCore module of the Qt Toolkit.

** $QT_BEGIN_LICENSE:LGPL$

** Commercial License Usage

** Licensees holding valid commercial Qt licenses may use this file in

** accordance with the commercial license agreement provided with the

** Software or, alternatively, in accordance with the terms contained in

** a written agreement between you and The Qt Company. For licensing terms

** and conditions see https://www.qt.io/terms-conditions. For further

** information use the contact form at https://www.qt.io/contact-us.

** GNU Lesser General Public License Usage

** Alternatively, this file may be used under the terms of the GNU Lesser

** General Public License version 3 as published by the Free Software

** Foundation and appearing in the file LICENSE.LGPL3 included in the

** packaging of this file. Please review the following information to

** ensure the GNU Lesser General Public License version 3 requirements

** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.

** GNU General Public License Usage

** Alternatively, this file may be used under the terms of the GNU

** General Public License version 2.0 or (at your option) the GNU General

** Public license version 3 or any later version approved by the KDE Free

** Qt Foundation. The licenses are as published by the Free Software

** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3

** included in the packaging of this file. Please review the following

** information to ensure the GNU General Public License requirements will

** be met: https://www.gnu.org/licenses/gpl-2.0.html and

** https://www.gnu.org/licenses/gpl-3.0.html.

** $QT_END_LICENSE$

****************************************************************************/

#include "qregexp.h"

#include "qalgorithms.h"

#include "qbitarray.h"

#include "qcache.h"

#include "qdatastream.h"

#include "qdebug.h"

#include "qhashfunctions.h"

#include "qlist.h"

#include "qmap.h"

#include "qmutex.h"

#include "qstring.h"

#include "qstringlist.h"

#include "qstringmatcher.h"

#include "qvector.h"

#include <limits.h>

#include <algorithm>

QT_BEGIN_NAMESPACE

int qFindString(const QChar *haystack, int haystackLen, int from,

const QChar *needle, int needleLen, Qt::CaseSensitivity cs);

// Error strings for the regexp parser.
// Each RXERR_* macro wraps one English message in QT_TRANSLATE_NOOP using the
// "QRegExp" context.
// NOTE(review): QT_TRANSLATE_NOOP is expected to only mark the literal for
// translation extraction (no translation happens at the point of use) --
// confirm against the Qt documentation.

#define RXERR_OK QT_TRANSLATE_NOOP("QRegExp", "no error occurred")

#define RXERR_DISABLED QT_TRANSLATE_NOOP("QRegExp", "disabled feature used")

#define RXERR_CHARCLASS QT_TRANSLATE_NOOP("QRegExp", "bad char class syntax")

#define RXERR_LOOKAHEAD QT_TRANSLATE_NOOP("QRegExp", "bad lookahead syntax")

#define RXERR_LOOKBEHIND QT_TRANSLATE_NOOP("QRegExp", "lookbehinds not supported, see QTBUG-2371")

#define RXERR_REPETITION QT_TRANSLATE_NOOP("QRegExp", "bad repetition syntax")

#define RXERR_OCTAL QT_TRANSLATE_NOOP("QRegExp", "invalid octal value")

#define RXERR_LEFTDELIM QT_TRANSLATE_NOOP("QRegExp", "missing left delim")

#define RXERR_END QT_TRANSLATE_NOOP("QRegExp", "unexpected end")

#define RXERR_LIMIT QT_TRANSLATE_NOOP("QRegExp", "met internal limit")

#define RXERR_INTERVAL QT_TRANSLATE_NOOP("QRegExp", "invalid interval")

#define RXERR_CATEGORY QT_TRANSLATE_NOOP("QRegExp", "invalid category")

/*!

\class QRegExp

\inmodule QtCore

\reentrant

\brief The QRegExp class provides pattern matching using regular expressions.

\ingroup tools

\ingroup shared

\keyword regular expression

A regular expression, or "regexp", is a pattern for matching

substrings in a text. This is useful in many contexts, e.g.,

\table

\row \li Validation

\li A regexp can test whether a substring meets some criteria,

e.g. is an integer or contains no whitespace.

\row \li Searching

\li A regexp provides more powerful pattern matching than

simple substring matching, e.g., match one of the words

\e{mail}, \e{letter} or \e{correspondence}, but none of the

100

words \e{email}, \e{mailman}, \e{mailer}, \e{letterbox}, etc.

101

\row \li Search and Replace

102

\li A regexp can replace all occurrences of a substring with a

103

different substring, e.g., replace all occurrences of \e{&}

104

with \e{\&amp;} except where the \e{&} is already followed by

105

an \e{amp;}.

106

\row \li String Splitting

107

\li A regexp can be used to identify where a string should be

108

split apart, e.g. splitting tab-delimited strings.

109

\endtable

110

111

A brief introduction to regexps is presented, a description of

112

Qt's regexp language, some examples, and the function

113

documentation itself. QRegExp is modeled on Perl's regexp

114

language. It fully supports Unicode. QRegExp can also be used in a

115

simpler, \e{wildcard mode} that is similar to the functionality

116

found in command shells. The syntax rules used by QRegExp can be

117

changed with setPatternSyntax(). In particular, the pattern syntax

118

can be set to QRegExp::FixedString, which means the pattern to be

119

matched is interpreted as a plain string, i.e., special characters

120

(e.g., backslash) are not escaped.

121

122

A good text on regexps is \e {Mastering Regular Expressions}

123

(Third Edition) by Jeffrey E. F. Friedl, ISBN 0-596-52812-4.

124

125

\note In Qt 5, the new QRegularExpression class provides a Perl

126

compatible implementation of regular expressions and is recommended

in place of QRegExp.

\tableofcontents

\section1 Introduction

132

133

Regexps are built up from expressions, quantifiers, and

134

assertions. The simplest expression is a character, e.g. \b{x}

135

or \b{5}. An expression can also be a set of characters

136

enclosed in square brackets. \b{[ABCD]} will match an \b{A}

137

or a \b{B} or a \b{C} or a \b{D}. We can write this same

138

expression as \b{[A-D]}, and an expression to match any

139

capital letter in the English alphabet is written as

140

\b{[A-Z]}.

141

142

A quantifier specifies the number of occurrences of an expression

143

that must be matched. \b{x{1,1}} means match one and only one

144

\b{x}. \b{x{1,5}} means match a sequence of \b{x}

145

characters that contains at least one \b{x} but no more than

146

five.

147

148

Note that in general regexps cannot be used to check for balanced

149

brackets or tags. For example, a regexp can be written to match an

150

opening html \c{<b>} and its closing \c{</b>}, if the \c{<b>} tags
are not nested, but if the \c{<b>} tags are nested, that same
regexp will match an opening \c{<b>} tag with the wrong closing
\c{</b>}. For the fragment \c{<b>bold <b>bolder</b></b>}, the
first \c{<b>} would be matched with the first \c{</b>}, which is

155

not correct. However, it is possible to write a regexp that will

156

match nested brackets or tags correctly, but only if the number of

157

nesting levels is fixed and known. If the number of nesting levels

158

is not fixed and known, it is impossible to write a regexp that

159

will not fail.

160

161

Suppose we want a regexp to match integers in the range 0 to 99.

162

At least one digit is required, so we start with the expression

163

\b{[0-9]{1,1}}, which matches a single digit exactly once. This

164

regexp matches integers in the range 0 to 9. To match integers up

165

to 99, increase the maximum number of occurrences to 2, so the

166

regexp becomes \b{[0-9]{1,2}}. This regexp satisfies the

167

original requirement to match integers from 0 to 99, but it will

168

also match integers that occur in the middle of strings. If we

169

want the matched integer to be the whole string, we must use the

170

anchor assertions, \b{^} (caret) and \b{$} (dollar). When

171

\b{^} is the first character in a regexp, it means the regexp

172

must match from the beginning of the string. When \b{$} is the

173

last character of the regexp, it means the regexp must match to

174

the end of the string. The regexp becomes \b{^[0-9]{1,2}$}.

175

Note that assertions, e.g. \b{^} and \b{$}, do not match

176

characters but locations in the string.

177

178

If you have seen regexps described elsewhere, they may have looked

179

different from the ones shown here. This is because some sets of

180

characters and some quantifiers are so common that they have been

181

given special symbols to represent them. \b{[0-9]} can be

182

replaced with the symbol \b{\\d}. The quantifier to match

183

exactly one occurrence, \b{{1,1}}, can be replaced with the

184

expression itself, i.e. \b{x{1,1}} is the same as \b{x}. So

185

our 0 to 99 matcher could be written as \b{^\\d{1,2}$}. It can

186

also be written \b{^\\d\\d{0,1}$}, i.e. \e{From the start of

187

the string, match a digit, followed immediately by 0 or 1 digits}.

188

In practice, it would be written as \b{^\\d\\d?$}. The \b{?}

189

is shorthand for the quantifier \b{{0,1}}, i.e. 0 or 1

190

occurrences. \b{?} makes an expression optional. The regexp

191

\b{^\\d\\d?$} means \e{From the beginning of the string, match

192

one digit, followed immediately by 0 or 1 more digit, followed

193

immediately by end of string}.

194

195

To write a regexp that matches one of the words 'mail' \e or

196

'letter' \e or 'correspondence' but does not match words that

197

contain these words, e.g., 'email', 'mailman', 'mailer', and

198

'letterbox', start with a regexp that matches 'mail'. Expressed

199

fully, the regexp is \b{m{1,1}a{1,1}i{1,1}l{1,1}}, but because

200

a character expression is automatically quantified by

201

\b{{1,1}}, we can simplify the regexp to \b{mail}, i.e., an

202

'm' followed by an 'a' followed by an 'i' followed by an 'l'. Now

203

we can use the vertical bar \b{|}, which means \b{or}, to

204

include the other two words, so our regexp for matching any of the

205

three words becomes \b{mail|letter|correspondence}. Match

206

'mail' \b{or} 'letter' \b{or} 'correspondence'. While this

207

regexp will match one of the three words we want to match, it will

208

also match words we don't want to match, e.g., 'email'. To

209

prevent the regexp from matching unwanted words, we must tell it

210

to begin and end the match at word boundaries. First we enclose

211

our regexp in parentheses, \b{(mail|letter|correspondence)}.

212

Parentheses group expressions together, and they identify a part

213

of the regexp that we wish to \l{capturing text}{capture}.

214

Enclosing the expression in parentheses allows us to use it as a

215

component in more complex regexps. It also allows us to examine

216

which of the three words was actually matched. To force the match

217

to begin and end on word boundaries, we enclose the regexp in

218

\b{\\b} \e{word boundary} assertions:

219

\b{\\b(mail|letter|correspondence)\\b}. Now the regexp means:

220

\e{Match a word boundary, followed by the regexp in parentheses,

221

followed by a word boundary}. The \b{\\b} assertion matches a

222

\e position in the regexp, not a \e character. A word boundary is

223

any non-word character, e.g., a space, newline, or the beginning

224

or ending of a string.

225

226

If we want to replace ampersand characters with the HTML entity

227

\b{\&amp;}, the regexp to match is simply \b{\&}. But this

228

regexp will also match ampersands that have already been converted

229

to HTML entities. We want to replace only ampersands that are not

230

already followed by \b{amp;}. For this, we need the negative

231

lookahead assertion, \b{(?!}__\b{)}. The regexp can then be

232

written as \b{\&(?!amp;)}, i.e. \e{Match an ampersand that is}

233

\b{not} \e{followed by} \b{amp;}.

234

235

If we want to count all the occurrences of 'Eric' and 'Eirik' in a

236

string, two valid solutions are \b{\\b(Eric|Eirik)\\b} and

237

\b{\\bEi?ri[ck]\\b}. The word boundary assertion '\\b' is

238

required to avoid matching words that contain either name,

239

e.g. 'Ericsson'. Note that the second regexp matches more

240

spellings than we want: 'Eric', 'Erik', 'Eiric' and 'Eirik'.

241

242

Some of the examples discussed above are implemented in the

243

\l{#code-examples}{code examples} section.

244

245

\target characters-and-abbreviations-for-sets-of-characters

246

\section1 Characters and Abbreviations for Sets of Characters

247

248

\table

249

\header \li Element \li Meaning

250

\row \li \b{c}

251

\li A character represents itself unless it has a special

252

regexp meaning. e.g. \b{c} matches the character \e c.

253

\row \li \b{\\c}

254

\li A character that follows a backslash matches the character

255

itself, except as specified below. e.g., To match a literal

256

caret at the beginning of a string, write \b{\\^}.

257

\row \li \b{\\a}

258

\li Matches the ASCII bell (BEL, 0x07).

259

\row \li \b{\\f}

260

\li Matches the ASCII form feed (FF, 0x0C).

261

\row \li \b{\\n}

262

\li Matches the ASCII line feed (LF, 0x0A, Unix newline).

263

\row \li \b{\\r}

264

\li Matches the ASCII carriage return (CR, 0x0D).

265

\row \li \b{\\t}

266

\li Matches the ASCII horizontal tab (HT, 0x09).

267

\row \li \b{\\v}

268

\li Matches the ASCII vertical tab (VT, 0x0B).

269

\row \li \b{\\x\e{hhhh}}

270

\li Matches the Unicode character corresponding to the

271

hexadecimal number \e{hhhh} (between 0x0000 and 0xFFFF).

272

\row \li \b{\\0\e{ooo}} (i.e., \\zero \e{ooo})

273

\li matches the ASCII/Latin1 character for the octal number

274

\e{ooo} (between 0 and 0377).

275

\row \li \b{. (dot)}

276

\li Matches any character (including newline).

277

\row \li \b{\\d}

278

\li Matches a digit (QChar::isDigit()).

279

\row \li \b{\\D}

280

\li Matches a non-digit.

281

\row \li \b{\\s}

282

\li Matches a whitespace character (QChar::isSpace()).

283

\row \li \b{\\S}

284

\li Matches a non-whitespace character.

285

\row \li \b{\\w}

286

\li Matches a word character (QChar::isLetterOrNumber(), QChar::isMark(), or '_').

287

\row \li \b{\\W}

288

\li Matches a non-word character.

289

\row \li \b{\\\e{n}}

290

\li The \e{n}-th backreference, e.g. \\1, \\2, etc.

291

\endtable

292

293

\b{Note:} The C++ compiler transforms backslashes in strings.

294

To include a \b{\\} in a regexp, enter it twice, i.e. \c{\\}.

295

To match the backslash character itself, enter it four times, i.e.

296

\c{\\\\}.

297

298

\target sets-of-characters

299

\section1 Sets of Characters

300

301

Square brackets mean match any character contained in the square

302

brackets. The character set abbreviations described above can

303

appear in a character set in square brackets. Except for the

304

character set abbreviations and the following two exceptions,

305

characters do not have special meanings in square brackets.

\table

\row \li \b{^}

\li The caret negates the character set if it occurs as the

311

first character (i.e. immediately after the opening square

312

bracket). \b{[abc]} matches 'a' or 'b' or 'c', but

313

\b{[^abc]} matches anything \e but 'a' or 'b' or 'c'.

\row \li \b{-}

\li The dash indicates a range of characters. \b{[W-Z]}

318

matches 'W' or 'X' or 'Y' or 'Z'.

\endtable

Using the predefined character set abbreviations is more portable

323

than using character ranges across platforms and languages. For

324

example, \b{[0-9]} matches a digit in Western alphabets but

325

\b{\\d} matches a digit in \e any alphabet.

326

327

Note: In other regexp documentation, sets of characters are often

328

called "character classes".

329

330

\target quantifiers

331

\section1 Quantifiers

332

333

By default, an expression is automatically quantified by

334

\b{{1,1}}, i.e. it should occur exactly once. In the following

335

list, \b{\e {E}} stands for expression. An expression is a

336

character, or an abbreviation for a set of characters, or a set of

337

characters in square brackets, or an expression in parentheses.

\table

\row \li \b{\e {E}?}

\li Matches zero or one occurrences of \e E. This quantifier

343

means \e{The previous expression is optional}, because it

344

will match whether or not the expression is found. \b{\e

345

{E}?} is the same as \b{\e {E}{0,1}}. e.g., \b{dents?}

346

matches 'dent' or 'dents'.

\row \li \b{\e {E}+}

\li Matches one or more occurrences of \e E. \b{\e {E}+} is

351

the same as \b{\e {E}{1,}}. e.g., \b{0+} matches '0',

'00', '000', etc.

\row \li \b{\e {E}*}

\li Matches zero or more occurrences of \e E. It is the same

357

as \b{\e {E}{0,}}. The \b{*} quantifier is often used

358

in error where \b{+} should be used. For example, if

359

\b{\\s*$} is used in an expression to match strings that

360

end in whitespace, it will match every string because

361

\b{\\s*$} means \e{Match zero or more whitespaces followed

362

by end of string}. The correct regexp to match strings that

363

have at least one trailing whitespace character is

364

\b{\\s+$}.

365

366

\row \li \b{\e {E}{n}}

367

368

\li Matches exactly \e n occurrences of \e E. \b{\e {E}{n}}

369

is the same as repeating \e E \e n times. For example,

370

\b{x{5}} is the same as \b{xxxxx}. It is also the same

371

as \b{\e {E}{n,n}}, e.g. \b{x{5,5}}.

372

373

\row \li \b{\e {E}{n,}}

374

\li Matches at least \e n occurrences of \e E.

375

376

\row \li \b{\e {E}{,m}}

377

\li Matches at most \e m occurrences of \e E. \b{\e {E}{,m}}

378

is the same as \b{\e {E}{0,m}}.

379

380

\row \li \b{\e {E}{n,m}}

381

\li Matches at least \e n and at most \e m occurrences of \e E.

382

\endtable

383

384

To apply a quantifier to more than just the preceding character,

385

use parentheses to group characters together in an expression. For

386

example, \b{tag+} matches a 't' followed by an 'a' followed by

387

at least one 'g', whereas \b{(tag)+} matches at least one

388

occurrence of 'tag'.

389

390

Note: Quantifiers are normally "greedy". They always match as much

391

text as they can. For example, \b{0+} matches the first zero it

392

finds and all the consecutive zeros after the first zero. Applied

393

to '20005', it matches '2\underline{000}5'. Quantifiers can be made

394

non-greedy, see setMinimal().

395

396

\target capturing parentheses

397

\target backreferences

398

\section1 Capturing Text

399

400

Parentheses allow us to group elements together so that we can

401

quantify and capture them. For example if we have the expression

402

\b{mail|letter|correspondence} that matches a string we know

403

that \e one of the words matched but not which one. Using

404

parentheses allows us to "capture" whatever is matched within

405

their bounds, so if we used \b{(mail|letter|correspondence)}

406

and matched this regexp against the string "I sent you some email"

407

we can use the cap() or capturedTexts() functions to extract the

408

matched characters, in this case 'mail'.

409

410

We can use captured text within the regexp itself. To refer to the

411

captured text we use \e backreferences which are indexed from 1,

412

the same as for cap(). For example we could search for duplicate

413

words in a string using \b{\\b(\\w+)\\W+\\1\\b} which means match a

414

word boundary followed by one or more word characters followed by

415

one or more non-word characters followed by the same text as the

416

first parenthesized expression followed by a word boundary.

417

418

If we want to use parentheses purely for grouping and not for

419

capturing we can use the non-capturing syntax, e.g.

420

\b{(?:green|blue)}. Non-capturing parentheses begin '(?:' and

421

end ')'. In this example we match either 'green' or 'blue' but we

422

do not capture the match so we only know whether or not we matched

423

but not which color we actually found. Using non-capturing

424

parentheses is more efficient than using capturing parentheses

425

since the regexp engine has to do less book-keeping.

426

427

Both capturing and non-capturing parentheses may be nested.

428

429

\target greedy quantifiers

430

431

For historical reasons, quantifiers (e.g. \b{*}) that apply to

432

capturing parentheses are more "greedy" than other quantifiers.

433

For example, \b{a*(a*)} will match "aaa" with cap(1) == "aaa".

434

This behavior is different from what other regexp engines do

435

(notably, Perl). To obtain a more intuitive capturing behavior,

436

specify QRegExp::RegExp2 to the QRegExp constructor or call

437

setPatternSyntax(QRegExp::RegExp2).

438

439

\target cap_in_a_loop

440

441

When the number of matches cannot be determined in advance, a

442

common idiom is to use cap() in a loop. For example:

443

444

\snippet code/src_corelib_tools_qregexp.cpp 0

\target assertions

\section1 Assertions

Assertions make some statement about the text at the point where

450

they occur in the regexp but they do not match any characters. In

451

the following list \b{\e {E}} stands for any expression.

\table

\row \li \b{^}

\li The caret signifies the beginning of the string. If you

456

wish to match a literal \c{^} you must escape it by

457

writing \c{\\^}. For example, \b{^#include} will only

458

match strings which \e begin with the characters '#include'.

459

(When the caret is the first character of a character set it

460

has a special meaning, see \l{#sets-of-characters}{Sets of Characters}.)

461

462

\row \li \b{$}

463

\li The dollar signifies the end of the string. For example

464

\b{\\d\\s*$} will match strings which end with a digit

465

optionally followed by whitespace. If you wish to match a

466

literal \c{$} you must escape it by writing

\c{\\$}.

\row \li \b{\\b}

\li A word boundary. For example the regexp

471

\b{\\bOK\\b} means match immediately after a word

472

boundary (e.g. start of string or whitespace) the letter 'O'

473

then the letter 'K' immediately before another word boundary

474

(e.g. end of string or whitespace). But note that the

475

assertion does not actually match any whitespace so if we

476

write \b{(\\bOK\\b)} and we have a match it will only

477

contain 'OK' even if the string is "It's \underline{OK} now".

478

479

\row \li \b{\\B}

480

\li A non-word boundary. This assertion is true wherever

481

\b{\\b} is false. For example if we searched for

482

\b{\\Bon\\B} in "Left on" the match would fail (space

483

and end of string aren't non-word boundaries), but it would

484

match in "t\underline{on}ne".

485

486

\row \li \b{(?=\e E)}

487

\li Positive lookahead. This assertion is true if the

488

expression matches at this point in the regexp. For example,

489

\b{const(?=\\s+char)} matches 'const' whenever it is

490

followed by 'char', as in 'static \underline{const} char *'.

491

(Compare with \b{const\\s+char}, which matches 'static

492

\underline{const char} *'.)

493

494

\row \li \b{(?!\e E)}

495

\li Negative lookahead. This assertion is true if the

496

expression does not match at this point in the regexp. For

497

example, \b{const(?!\\s+char)} matches 'const' \e except

498

when it is followed by 'char'.

499

\endtable

500

501

\target QRegExp wildcard matching

502

\section1 Wildcard Matching

503

504

Most command shells such as \e bash or \e cmd.exe support "file

505

globbing", the ability to identify a group of files by using

506

wildcards. The setPatternSyntax() function is used to switch

507

between regexp and wildcard mode. Wildcard matching is much

508

simpler than full regexps and has only four features:

\table

\row \li \b{c}

\li Any character represents itself apart from those mentioned

513

below. Thus \b{c} matches the character \e c.

514

\row \li \b{?}

515

\li Matches any single character. It is the same as

516

\b{.} in full regexps.

517

\row \li \b{*}

518

\li Matches zero or more of any characters. It is the

519

same as \b{.*} in full regexps.

520

\row \li \b{[...]}

521

\li Sets of characters can be represented in square brackets,

522

similar to full regexps. Within the character class, like

523

outside, backslash has no special meaning.

524

\endtable

525

526

In the mode Wildcard, the wildcard characters cannot be

527

escaped. In the mode WildcardUnix, the character '\\' escapes the

528

wildcard.

529

530

For example if we are in wildcard mode and have strings which

531

contain filenames we could identify HTML files with \b{*.html}.

532

This will match zero or more characters followed by a dot followed

533

by 'h', 't', 'm' and 'l'.

534

535

To test a string against a wildcard expression, use exactMatch().

536

For example:

537

538

\snippet code/src_corelib_tools_qregexp.cpp 1

539

540

\target perl-users

541

\section1 Notes for Perl Users

542

543

Most of the character class abbreviations supported by Perl are

544

supported by QRegExp, see \l{#characters-and-abbreviations-for-sets-of-characters}

545

{characters and abbreviations for sets of characters}.

546

547

In QRegExp, apart from within character classes, \c{^} always

548

signifies the start of the string, so carets must always be

549

escaped unless used for that purpose. In Perl the meaning of caret

550

varies automagically depending on where it occurs so escaping it

551

is rarely necessary. The same applies to \c{$} which in

552

QRegExp always signifies the end of the string.

553

554

QRegExp's quantifiers are the same as Perl's greedy quantifiers

555

(but see the \l{greedy quantifiers}{note above}). Non-greedy

556

matching cannot be applied to individual quantifiers, but can be

557

applied to all the quantifiers in the pattern. For example, to

558

match the Perl regexp \b{ro+?m} requires:

559

560

\snippet code/src_corelib_tools_qregexp.cpp 2

561

562

The equivalent of Perl's \c{/i} option is

563

setCaseSensitivity(Qt::CaseInsensitive).

564

565

Perl's \c{/g} option can be emulated using a \l{#cap_in_a_loop}{loop}.

566

567

In QRegExp \b{.} matches any character, therefore all QRegExp

568

regexps have the equivalent of Perl's \c{/s} option. QRegExp

569

does not have an equivalent to Perl's \c{/m} option, but this

570

can be emulated in various ways for example by splitting the input

571

into lines or by looping with a regexp that searches for newlines.

572

573

Because QRegExp is string oriented, there are no \\A, \\Z, or \\z

574

assertions. The \\G assertion is not supported but can be emulated

575

in a loop.

576

577

Perl's $& is cap(0) or capturedTexts()[0]. There are no QRegExp

578

equivalents for $`, $' or $+. Perl's capturing variables, $1, $2,

579

... correspond to cap(1) or capturedTexts()[1], cap(2) or

580

capturedTexts()[2], etc.

581

582

To substitute a pattern use QString::replace().

583

584

Perl's extended \c{/x} syntax is not supported, nor are

585

directives, e.g. (?i), or regexp comments, e.g. (?#comment). On

586

the other hand, C++'s rules for literal strings can be used to

587

achieve the same:

588

589

\snippet code/src_corelib_tools_qregexp.cpp 3

590

591

Both zero-width positive and zero-width negative lookahead

592

assertions (?=pattern) and (?!pattern) are supported with the same

593

syntax as Perl. Perl's lookbehind assertions, "independent"

594

subexpressions and conditional expressions are not supported.

595

596

Non-capturing parentheses are also supported, with the same

597

(?:pattern) syntax.

598

599

See QString::split() and QStringList::join() for equivalents

600

to Perl's split and join functions.

601

602

Note: because C++ transforms \\'s they must be written \e twice in

603

code, e.g. \b{\\b} must be written \b{\\\\b}.

604

605

\target code-examples

606

\section1 Code Examples

607

608

\snippet code/src_corelib_tools_qregexp.cpp 4

609

610

The third string matches '\underline{6}'. This is a simple validation

611

regexp for integers in the range 0 to 99.

612

613

\snippet code/src_corelib_tools_qregexp.cpp 5

614

615

The second string matches '\underline{This_is-OK}'. We've used the

616

character set abbreviation '\\S' (non-whitespace) and the anchors

617

to match strings which contain no whitespace.

618

619

In the following example we match strings containing 'mail' or

620

'letter' or 'correspondence' but only match whole words i.e. not

621

'email'

622

623

\snippet code/src_corelib_tools_qregexp.cpp 6

624

625

The second string matches "Please write the \underline{letter}". The

626

word 'letter' is also captured (because of the parentheses). We

627

can see what text we've captured like this:

628

629

\snippet code/src_corelib_tools_qregexp.cpp 7

630

631

This will capture the text from the first set of capturing

632

parentheses (counting capturing left parentheses from left to

633

right). The parentheses are counted from 1 since cap(0) is the

634

whole matched regexp (equivalent to '&' in most regexp engines).

635

636

\snippet code/src_corelib_tools_qregexp.cpp 8

637

638

Here we've passed the QRegExp to QString's replace() function to

639

replace the matched text with new text.

640

641

\snippet code/src_corelib_tools_qregexp.cpp 9

642

643

We've used the indexIn() function to repeatedly match the regexp in

644

the string. Note that instead of moving forward by one character

645

at a time \c pos++ we could have written \c {pos +=

646

rx.matchedLength()} to skip over the already matched string. The

647

count will equal 3, matching 'One \underline{Eric} another

648

\underline{Eirik}, and an Ericsson. How many Eiriks, \underline{Eric}?'; it

649

doesn't match 'Ericsson' or 'Eiriks' because they are not bounded

650

by word boundaries.

651

652

One common use of regexps is to split lines of delimited data into

653

their component fields.

654

655

\snippet code/src_corelib_tools_qregexp.cpp 10

656

657

In this example our input lines have the format company name, web

658

address and country. Unfortunately the regexp is rather long and

659

not very versatile -- the code will break if we add any more

660

fields. A simpler and better solution is to look for the

661

separator, '\\t' in this case, and take the surrounding text. The

662

QString::split() function can take a separator string or regexp

663

as an argument and split a string accordingly.

664

665

\snippet code/src_corelib_tools_qregexp.cpp 11

666

667

Here field[0] is the company, field[1] the web address and so on.

668

669

To imitate the matching of a shell we can use wildcard mode.

670

671

\snippet code/src_corelib_tools_qregexp.cpp 12

672

673

Wildcard matching can be convenient because of its simplicity, but

674

any wildcard regexp can be defined using full regexps, e.g.

675

\b{.*\\.html$}. Notice that we can't match both \c .html and \c

676

.htm files with a wildcard unless we use \b{*.htm*} which will

677

also match 'test.html.bak'. A full regexp gives us the precision

678

we need, \b{.*\\.html?$}.

679

680

QRegExp can match case insensitively using setCaseSensitivity(),

681

and can use non-greedy matching, see setMinimal(). By

682

default QRegExp uses full regexps but this can be changed with

683

setPatternSyntax(). Searching can be done forward with indexIn() or backward

684

with lastIndexIn(). Captured text can be accessed using

685

capturedTexts() which returns a string list of all captured

686

strings, or using cap() which returns the captured string for the

687

given index. The pos() function takes a match index and returns

688

the position in the string where the match was made (or -1 if

689

there was no match).

690

691

\sa QString, QStringList, QRegExpValidator, QSortFilterProxyModel,
{tools/regexp}{Regular Expression Example}
*/

693

694

695

// On VxWorks, drop any pre-existing EOS macro so that it cannot clash with the
// EOS constant declared below.
#if defined(Q_OS_VXWORKS) && defined(EOS)
#  undef EOS
#endif

// BadChar() folds a character's UTF-16 code unit into the range
// [0, NumBadChars) by taking it modulo NumBadChars.
const int NumBadChars = 64;
#define BadChar(ch) ((ch).unicode() % NumBadChars)

// Sentinel values. The first three share INT_MAX.
// NOTE(review): the meanings suggested by the names (no occurrence found,
// empty capture, unbounded length, unbounded repetition, end of string) are
// not established by this part of the file -- confirm at the points of use.
const int NoOccurrence = INT_MAX;
const int EmptyCapture = INT_MAX;
const int InftyLen = INT_MAX;
const int InftyRep = 1025;
const int EOS = -1;

/*
  Returns true if ch is a letter or number, a mark, or the underscore
  character; returns false otherwise.
*/
static bool isWord(QChar ch)
{
    return ch.isLetterOrNumber() || ch.isMark() || ch == QLatin1Char('_');
}

Merges two vectors of ints and puts the result into the first

715

one.

716

717

static void mergeInto(QVector<int> *a, const QVector<int> &b)

718

{

719

int asize = a->size();

720

int bsize = b.size();

721

if (asize == 0) {

722

*a = b;

723

#ifndef QT_NO_REGEXP_OPTIM

724

} else if (bsize == 1 && a->at(asize - 1) < b.at(0)) {

725

a->resize(asize + 1);

726

(*a)[asize] = b.at(0);

727

#endif

728

} else if (bsize >= 1) {

729

int csize = asize + bsize;

730

QVector<int> c(csize);

731

int i = 0, j = 0, k = 0;

732

while (i < asize) {

733

if (j < bsize) {

734

if (a->at(i) == b.at(j)) {

735

++i;

736

--csize;

737

} else if (a->at(i) < b.at(j)) {

c[k++] = a->at(i++);

} else {

c[k++] = b.at(j++);

}

} else {

memcpy(c.data() + k, a->constData() + i, (asize - i) * sizeof(int));

break;

}

}

c.resize(csize);

if (j < bsize)

memcpy(c.data() + k, b.constData() + j, (bsize - j) * sizeof(int));

*a = c;

}

}

#ifndef QT_NO_REGEXP_WILDCARD

755

756

Translates a wildcard pattern to an equivalent regular expression

757

pattern (e.g., *.cpp to .*\.cpp).

758

759

If enableEscaping is true, it is possible to escape the wildcard

760

characters with \

761

762

static QString wc2rx(const QString &wc_str, const bool enableEscaping)

763

{

764

const int wclen = wc_str.length();

765

QString rx;

766

int i = 0;

767

bool isEscaping = false; // the previous character is '\'

768

const QChar *wc = wc_str.unicode();

769

770

while (i < wclen) {

771

const QChar c = wc[i++];

772

switch (c.unicode()) {

773

case '\\':

774

if (enableEscaping) {

775

if (isEscaping) {

776

rx += QLatin1String("\\\\");

777

} // we insert the \\ later if necessary

778

if (i == wclen) { // the end

779

rx += QLatin1String("\\\\");

780

}

781

} else {

782

rx += QLatin1String("\\\\");

}

isEscaping = true;

break;

case '*':

if (isEscaping) {

rx += QLatin1String("\\*");

789

isEscaping = false;

790

} else {

791

rx += QLatin1String(".*");

}

break;

case '?':

if (isEscaping) {

rx += QLatin1String("\\?");

797

isEscaping = false;

798

} else {

799

rx += QLatin1Char('.');

}

break;

case '$':

case '(':

case ')':

case '+':

case '.':

case '^':

case '{':

case '|':

case '}':

if (isEscaping) {

isEscaping = false;

rx += QLatin1String("\\\\");

815

}

816

rx += QLatin1Char('\\');

rx += c;

break;

case '[':

if (isEscaping) {

isEscaping = false;

rx += QLatin1String("\\[");

823

} else {

824

rx += c;

825

if (wc[i] == QLatin1Char('^'))

826

rx += wc[i++];

827

if (i < wclen) {

828

if (rx[i] == QLatin1Char(']'))

829

rx += wc[i++];

830

while (i < wclen && wc[i] != QLatin1Char(']')) {

831

if (wc[i] == QLatin1Char('\\'))

832

rx += QLatin1Char('\\');

rx += wc[i++];

}

}

}

break;

case ']':

if(isEscaping){

isEscaping = false;

rx += QLatin1String("\\");

}

rx += c;

break;

default:

if(isEscaping){

isEscaping = false;

rx += QLatin1String("\\\\");

}

rx += c;

}

}

return rx;

}

#endif

static int caretIndex(int offset, QRegExp::CaretMode caretMode)

860

{

861

if (caretMode == QRegExp::CaretAtZero) {

862

return 0;

863

} else if (caretMode == QRegExp::CaretAtOffset) {

864

return offset;

865

} else { // QRegExp::CaretWontMatch

return -1;

}

}

The QRegExpEngineKey struct uniquely identifies an engine.

872

873

struct QRegExpEngineKey

874

{

875

QString pattern;

876

QRegExp::PatternSyntax patternSyntax;

877

Qt::CaseSensitivity cs;

878

879

inline QRegExpEngineKey(const QString &pattern, QRegExp::PatternSyntax patternSyntax,

880

Qt::CaseSensitivity cs)

881

: pattern(pattern), patternSyntax(patternSyntax), cs(cs) {}

882

883

inline void clear() {

884

pattern.clear();

885

patternSyntax = QRegExp::RegExp;

886

cs = Qt::CaseSensitive;

}

};

static bool operator==(const QRegExpEngineKey &key1, const QRegExpEngineKey &key2)

891

{

892

return key1.pattern == key2.pattern && key1.patternSyntax == key2.patternSyntax

893

&& key1.cs == key2.cs;

894

}

895

896

// Folds pattern, syntax and case sensitivity (in that order) into seed.
static uint qHash(const QRegExpEngineKey &key, uint seed = 0) Q_DECL_NOTHROW
{
    QtPrivate::QHashCombine combine;
    const uint withPattern = combine(seed, key.pattern);
    const uint withSyntax = combine(withPattern, key.patternSyntax);
    return combine(withSyntax, key.cs);
}

class QRegExpEngine;

//Q_DECLARE_TYPEINFO(QVector<int>, Q_MOVABLE_TYPE);

908

909

910

This is the engine state during matching.

911

912

struct QRegExpMatchState

913

{

914

const QChar *in; // a pointer to the input string data

915

int pos; // the current position in the string

916

int caretPos;

917

int len; // the length of the input string

918

bool minimal; // minimal matching?

919

int *bigArray; // big array holding the data for the next pointers

920

int *inNextStack; // is state is nextStack?

921

int *curStack; // stack of current states

922

int *nextStack; // stack of next states

923

int *curCapBegin; // start of current states' captures

924

int *nextCapBegin; // start of next states' captures

925

int *curCapEnd; // end of current states' captures

926

int *nextCapEnd; // end of next states' captures

927

int *tempCapBegin; // start of temporary captures

928

int *tempCapEnd; // end of temporary captures

929

int *capBegin; // start of captures for a next state

930

int *capEnd; // end of captures for a next state

931

int *slideTab; // bump-along slide table for bad-character heuristic

932

int *captured; // what match() returned last

933

int slideTabSize; // size of slide table

934

int capturedSize;

935

#ifndef QT_NO_REGEXP_BACKREF

936

QList<QVector<int> > sleeping; // list of back-reference sleepers

937

#endif

938

int matchLen; // length of match

939

int oneTestMatchedLen; // length of partial match

940

941

const QRegExpEngine *eng;

942

943

inline QRegExpMatchState() : bigArray(0), captured(0) {}

944

inline ~QRegExpMatchState() { free(bigArray); }

945

946

void drain() { free(bigArray); bigArray = 0; captured = 0; } // to save memory

947

void prepareForMatch(QRegExpEngine *eng);

948

void match(const QChar *str, int len, int pos, bool minimal,

949

bool oneTest, int caretIndex);

950

bool matchHere();

951

bool testAnchor(int i, int a, const int *capBegin);

};

The struct QRegExpAutomatonState represents one state in a modified NFA. The

956

input characters matched are stored in the state instead of on

957

the transitions, something possible for an automaton

958

constructed from a regular expression.

959

960

struct QRegExpAutomatonState

961

{

962

#ifndef QT_NO_REGEXP_CAPTURE

963

int atom; // which atom does this state belong to?

964

#endif

965

int match; // what does it match? (see CharClassBit and BackRefBit)

966

QVector<int> outs; // out-transitions

967

QMap<int, int> reenter; // atoms reentered when transiting out

968

QMap<int, int> anchors; // anchors met when transiting out

969

970

inline QRegExpAutomatonState() { }

971

#ifndef QT_NO_REGEXP_CAPTURE

972

inline QRegExpAutomatonState(int a, int m)

973

: atom(a), match(m) { }

974

#else

975

inline QRegExpAutomatonState(int m)

: match(m) { }

#endif

};

Q_DECLARE_TYPEINFO(QRegExpAutomatonState, Q_MOVABLE_TYPE);

981

982

983

The struct QRegExpCharClassRange represents a range of characters (e.g.,

984

[0-9] denotes range 48 to 57).

985

986

// A run of characters given as a start value and a length; the inline
// values show the fields for [0-9].
struct QRegExpCharClassRange

{

ushort from; // 48

ushort len; // 10

};

Q_DECLARE_TYPEINFO(QRegExpCharClassRange, Q_PRIMITIVE_TYPE);

993

994

#ifndef QT_NO_REGEXP_CAPTURE

995

996

The struct QRegExpAtom represents one node in the hierarchy of regular

expression atoms.

// One node in the hierarchy of regular expression atoms.
struct QRegExpAtom
{
    // Negative values stored in capture for atoms without a capture index.
    enum {
        NoCapture = -1,
        OfficialCapture = -2,
        UnofficialCapture = -3
    };

    int parent;  // index of parent in array of atoms
    int capture; // index of capture, from 1 to ncap - 1
};

1006

1007

Q_DECLARE_TYPEINFO(QRegExpAtom, Q_PRIMITIVE_TYPE);

1008

#endif

1009

1010

struct QRegExpLookahead;

1011

1012

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1013

1014

The struct QRegExpAnchorAlternation represents a pair of anchors with

1015

OR semantics.

1016

1017

// A pair of anchors with OR semantics.
struct QRegExpAnchorAlternation
{
    int a; // this anchor...
    int b; // ...or this one
};

1022

1023

Q_DECLARE_TYPEINFO(QRegExpAnchorAlternation, Q_PRIMITIVE_TYPE);

1024

#endif

1025

1026

#ifndef QT_NO_REGEXP_CCLASS

1027

1028

#define FLAG(x) (1 << (x))

1029

1030

The class QRegExpCharClass represents a set of characters, such as can

1031

be found in regular expressions (e.g., [a-z] denotes the set

1032

{a, b, ..., z}).

1033

1034

class QRegExpCharClass

{

public:

QRegExpCharClass();

void clear();

bool negative() const { return n; }

1041

void setNegative(bool negative);

1042

void addCategories(uint cats);

1043

void addRange(ushort from, ushort to);

1044

void addSingleton(ushort ch) { addRange(ch, ch); }

1045

1046

bool in(QChar ch) const;

1047

#ifndef QT_NO_REGEXP_OPTIM

1048

const QVector<int> &firstOccurrence() const { return occ1; }

1049

#endif

1050

1051

#if defined(QT_DEBUG)

void dump() const;

#endif

private:

QVector<QRegExpCharClassRange> r; // character ranges

1057

#ifndef QT_NO_REGEXP_OPTIM

1058

QVector<int> occ1; // first-occurrence array

1059

#endif

1060

uint c; // character classes

bool n; // negative?

};

#else

struct QRegExpCharClass

{

int dummy;

#ifndef QT_NO_REGEXP_OPTIM

1069

QRegExpCharClass() { occ1.fill(0, NumBadChars); }

1070

1071

const QVector<int> &firstOccurrence() const { return occ1; }

QVector<int> occ1;

#endif

};

#endif

Q_DECLARE_TYPEINFO(QRegExpCharClass, Q_MOVABLE_TYPE);

1078

1079

1080

The QRegExpEngine class encapsulates a modified nondeterministic

1081

finite automaton (NFA).

class QRegExpEngine

{

public:

QRegExpEngine(Qt::CaseSensitivity cs, bool greedyQuantifiers)

1087

: cs(cs), greedyQuantifiers(greedyQuantifiers) { setup(); }

1088

1089

QRegExpEngine(const QRegExpEngineKey &key);

1090

~QRegExpEngine();

1091

1092

bool isValid() const { return valid; }

1093

const QString &errorString() const { return yyError; }

1094

int captureCount() const { return officialncap; }

1095

1096

int createState(QChar ch);

1097

int createState(const QRegExpCharClass &cc);

1098

#ifndef QT_NO_REGEXP_BACKREF

1099

int createState(int bref);

1100

#endif

1101

1102

void addCatTransitions(const QVector<int> &from, const QVector<int> &to);

1103

#ifndef QT_NO_REGEXP_CAPTURE

1104

void addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom);

1105

#endif

1106

1107

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1108

int anchorAlternation(int a, int b);

1109

int anchorConcatenation(int a, int b);

1110

#else

1111

int anchorAlternation(int a, int b) { return a & b; }

1112

int anchorConcatenation(int a, int b) { return a | b; }

1113

#endif

1114

void addAnchors(int from, int to, int a);

1115

1116

#ifndef QT_NO_REGEXP_OPTIM

1117

void heuristicallyChooseHeuristic();

1118

#endif

1119

1120

#if defined(QT_DEBUG)

void dump() const;

#endif

QAtomicInt ref;

private:

enum { CharClassBit = 0x10000, BackRefBit = 0x20000 };

1128

enum { InitialState = 0, FinalState = 1 };

1129

1130

void setup();

1131

int setupState(int match);

1132

1133

1134

Let's hope that 13 lookaheads and 14 back-references are

1135

enough.

1136

1137

enum { MaxLookaheads = 13, MaxBackRefs = 14 };

1138

enum { Anchor_Dollar = 0x00000001, Anchor_Caret = 0x00000002, Anchor_Word = 0x00000004,

1139

Anchor_NonWord = 0x00000008, Anchor_FirstLookahead = 0x00000010,

1140

Anchor_BackRef1Empty = Anchor_FirstLookahead << MaxLookaheads,

1141

Anchor_BackRef0Empty = Anchor_BackRef1Empty >> 1,

1142

Anchor_Alternation = unsigned(Anchor_BackRef1Empty) << MaxBackRefs,

1143

1144

Anchor_LookaheadMask = (Anchor_FirstLookahead - 1) ^

1145

((Anchor_FirstLookahead << MaxLookaheads) - 1) };

1146

#ifndef QT_NO_REGEXP_CAPTURE

1147

int startAtom(bool officialCapture);

1148

void finishAtom(int atom, bool needCapture);

1149

#endif

1150

1151

#ifndef QT_NO_REGEXP_LOOKAHEAD

1152

int addLookahead(QRegExpEngine *eng, bool negative);

1153

#endif

1154

1155

#ifndef QT_NO_REGEXP_OPTIM

1156

bool goodStringMatch(QRegExpMatchState &matchState) const;

1157

bool badCharMatch(QRegExpMatchState &matchState) const;

1158

#else

1159

bool bruteMatch(QRegExpMatchState &matchState) const;

1160

#endif

1161

1162

QVector<QRegExpAutomatonState> s; // array of states

1163

#ifndef QT_NO_REGEXP_CAPTURE

1164

QVector<QRegExpAtom> f; // atom hierarchy

1165

int nf; // number of atoms

1166

int cf; // current atom

1167

QVector<int> captureForOfficialCapture;

1168

#endif

1169

int officialncap; // number of captures, seen from the outside

1170

int ncap; // number of captures, seen from the inside

1171

#ifndef QT_NO_REGEXP_CCLASS

1172

QVector<QRegExpCharClass> cl; // array of character classes

1173

#endif

1174

#ifndef QT_NO_REGEXP_LOOKAHEAD

1175

QVector<QRegExpLookahead *> ahead; // array of lookaheads

1176

#endif

1177

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1178

QVector<QRegExpAnchorAlternation> aa; // array of (a, b) pairs of anchors

1179

#endif

1180

#ifndef QT_NO_REGEXP_OPTIM

1181

bool caretAnchored; // does the regexp start with ^?

1182

bool trivial; // is the good-string all that needs to match?

1183

#endif

1184

bool valid; // is the regular expression valid?

1185

Qt::CaseSensitivity cs; // case sensitive?

1186

bool greedyQuantifiers; // RegExp2?

1187

bool xmlSchemaExtensions;

1188

#ifndef QT_NO_REGEXP_BACKREF

1189

int nbrefs; // number of back-references

1190

#endif

1191

1192

#ifndef QT_NO_REGEXP_OPTIM

1193

bool useGoodStringHeuristic; // use goodStringMatch? otherwise badCharMatch

1194

1195

int goodEarlyStart; // the index where goodStr can first occur in a match

1196

int goodLateStart; // the index where goodStr can last occur in a match

1197

QString goodStr; // the string that any match has to contain

1198

1199

int minl; // the minimum length of a match

1200

QVector<int> occ1; // first-occurrence array

#endif

The class Box is an abstraction for a regular expression

1205

fragment. It can also be seen as one node in the syntax tree of

1206

a regular expression with synthetized attributes.

1207

1208

Its interface is ugly for performance reasons.

class Box

{

public:

Box(QRegExpEngine *engine);

1214

Box(const Box &b) { operator=(b); }

1215

1216

Box &operator=(const Box &b);

1217

1218

void clear() { operator=(Box(eng)); }

1219

void set(QChar ch);

1220

void set(const QRegExpCharClass &cc);

1221

#ifndef QT_NO_REGEXP_BACKREF

void set(int bref);

#endif

void cat(const Box &b);

1226

void orx(const Box &b);

1227

void plus(int atom);

1228

void opt();

1229

void catAnchor(int a);

1230

#ifndef QT_NO_REGEXP_OPTIM

1231

void setupHeuristics();

1232

#endif

1233

1234

#if defined(QT_DEBUG)

void dump() const;

#endif

private:

void addAnchorsToEngine(const Box &to) const;

1240

1241

QRegExpEngine *eng; // the automaton under construction

1242

QVector<int> ls; // the left states (firstpos)

1243

QVector<int> rs; // the right states (lastpos)

1244

QMap<int, int> lanchors; // the left anchors

1245

QMap<int, int> ranchors; // the right anchors

1246

int skipanchors; // the anchors to match if the box is skipped

1247

1248

#ifndef QT_NO_REGEXP_OPTIM

1249

int earlyStart; // the index where str can first occur

1250

int lateStart; // the index where str can last occur

1251

QString str; // a string that has to occur in any match

1252

QString leftStr; // a string occurring at the left of this box

1253

QString rightStr; // a string occurring at the right of this box

1254

int maxl; // the maximum length of this box (possibly InftyLen)

1255

#endif

1256

1257

int minl; // the minimum length of this box

1258

#ifndef QT_NO_REGEXP_OPTIM

1259

QVector<int> occ1; // first-occurrence array

#endif

};

friend class Box;

This is the lexical analyzer for regular expressions.

1267

1268

enum { Tok_Eos, Tok_Dollar, Tok_LeftParen, Tok_MagicLeftParen, Tok_PosLookahead,

1269

Tok_NegLookahead, Tok_RightParen, Tok_CharClass, Tok_Caret, Tok_Quantifier, Tok_Bar,

1270

Tok_Word, Tok_NonWord, Tok_Char = 0x10000, Tok_BackRef = 0x20000 };

1271

int getChar();

1272

int getEscape();

1273

#ifndef QT_NO_REGEXP_INTERVAL

1274

int getRep(int def);

1275

#endif

1276

#ifndef QT_NO_REGEXP_LOOKAHEAD

1277

void skipChars(int n);

1278

#endif

1279

void error(const char *msg);

1280

void startTokenizer(const QChar *rx, int len);

1281

int getToken();

1282

1283

const QChar *yyIn; // a pointer to the input regular expression pattern

1284

int yyPos0; // the position of yyTok in the input pattern

1285

int yyPos; // the position of the next character to read

1286

int yyLen; // the length of yyIn

1287

int yyCh; // the last character read

1288

QScopedPointer<QRegExpCharClass> yyCharClass; // attribute for Tok_CharClass tokens

1289

int yyMinRep; // attribute for Tok_Quantifier

1290

int yyMaxRep; // ditto

1291

QString yyError; // syntax error or overflow during parsing?

1292

1293

1294

This is the syntactic analyzer for regular expressions.

1295

1296

int parse(const QChar *rx, int len);

1297

void parseAtom(Box *box);

1298

void parseFactor(Box *box);

1299

void parseTerm(Box *box);

1300

void parseExpression(Box *box);

1301

1302

int yyTok; // the last token read

1303

bool yyMayCapture; // set this to false to disable capturing

1304

1305

friend struct QRegExpMatchState;

1306

};

1307

1308

#ifndef QT_NO_REGEXP_LOOKAHEAD

1309

1310

The struct QRegExpLookahead represents a lookahead a la Perl (e.g.,

1311

(?=foo) and (?!bar)).

1312

1313

struct QRegExpLookahead

1314

{

1315

QRegExpEngine *eng; // NFA representing the embedded regular expression

1316

bool neg; // negative lookahead?

1317

1318

inline QRegExpLookahead(QRegExpEngine *eng0, bool neg0)

1319

: eng(eng0), neg(neg0) { }

1320

inline ~QRegExpLookahead() { delete eng; }

};

#endif

/*!
    \internal
    Converts the pattern string to the RegExp syntax.

    This is also used by QScriptEngine::newRegExp to convert to a pattern
    that JavaScriptCore can understand.
*/

1329

1330

// Returns pattern rewritten for the given syntax: wildcard syntaxes go
// through wc2rx(), FixedString through QRegExp::escape(), and every other
// syntax is returned unchanged.
Q_CORE_EXPORT QString qt_regexp_toCanonical(const QString &pattern, QRegExp::PatternSyntax patternSyntax)
{
#ifndef QT_NO_REGEXP_WILDCARD
    if (patternSyntax == QRegExp::Wildcard)
        return wc2rx(pattern, false);
    if (patternSyntax == QRegExp::WildcardUnix)
        return wc2rx(pattern, true);
#endif
    if (patternSyntax == QRegExp::FixedString)
        return QRegExp::escape(pattern);
    // QRegExp::W3CXmlSchema11 and all remaining syntaxes
    return pattern;
}

// Builds an engine from a key: the pattern is first converted to canonical
// syntax and then parsed. The engine is valid only if the parser consumed
// the whole converted pattern.
QRegExpEngine::QRegExpEngine(const QRegExpEngineKey &key)
    : cs(key.cs),
      greedyQuantifiers(key.patternSyntax == QRegExp::RegExp2),
      xmlSchemaExtensions(key.patternSyntax == QRegExp::W3CXmlSchema11)
{
    setup();

    const QString canonical = qt_regexp_toCanonical(key.pattern, key.patternSyntax);
    const int parsedLength = parse(canonical.unicode(), canonical.length());

    valid = (parsedLength == canonical.length());
    if (valid)
        return;

#ifndef QT_NO_REGEXP_OPTIM
    trivial = false;
#endif
    error(RXERR_LEFTDELIM);
}

// Deletes every lookahead stored in ahead.
QRegExpEngine::~QRegExpEngine()
{
#ifndef QT_NO_REGEXP_LOOKAHEAD
    qDeleteAll(ahead);
#endif
}

/*
    Sizes bigArray for the given engine and carves it into the per-match
    tables. The layout, in ints, is: three arrays of one int per state,
    four arrays of ncap ints per state, four arrays of ncap ints, the
    slide table, and finally the captured array.
*/
void QRegExpMatchState::prepareForMatch(QRegExpEngine *eng)
{
    const int stateCount = eng->s.size(); // number of states
    const int ncap = eng->ncap;
#ifndef QT_NO_REGEXP_OPTIM
    const int newSlideTabSize = qMax(eng->minl + 1, 16);
#else
    const int newSlideTabSize = 0;
#endif
    const int numCaptures = eng->captureCount();
    const int newCapturedSize = 2 + 2 * numCaptures;
    bigArray = q_check_ptr((int *)realloc(bigArray, ((3 + 4 * ncap) * stateCount + 4 * ncap + newSlideTabSize + newCapturedSize)*sizeof(int)));

    // set all internal variables only _after_ bigArray is realloc'ed
    // to prevent a broken regexp in oom case

    slideTabSize = newSlideTabSize;
    capturedSize = newCapturedSize;

    int *cursor = bigArray;

    // one int per state
    inNextStack = cursor;
    cursor += stateCount;
    memset(inNextStack, -1, stateCount * sizeof(int));
    curStack = cursor;
    cursor += stateCount;
    nextStack = cursor;
    cursor += stateCount;

    // ncap ints per state
    const int capsForAllStates = ncap * stateCount;
    curCapBegin = cursor;
    cursor += capsForAllStates;
    nextCapBegin = cursor;
    cursor += capsForAllStates;
    curCapEnd = cursor;
    cursor += capsForAllStates;
    nextCapEnd = cursor;
    cursor += capsForAllStates;

    // ncap ints each
    tempCapBegin = cursor;
    cursor += ncap;
    tempCapEnd = cursor;
    cursor += ncap;
    capBegin = cursor;
    cursor += ncap;
    capEnd = cursor;
    cursor += ncap;

    slideTab = cursor;
    cursor += slideTabSize;

    captured = cursor;
    memset(captured, -1, capturedSize*sizeof(int));

    this->eng = eng;
}

Tries to match in str and returns an array of (begin, length) pairs

1416

for captured text. If there is no match, all pairs are (-1, -1).

1417

1418

void QRegExpMatchState::match(const QChar *str0, int len0, int pos0,

1419

bool minimal0, bool oneTest, int caretIndex)

1420

{

1421

bool matched = false;

1422

QChar char_null;

1423

1424

#ifndef QT_NO_REGEXP_OPTIM

1425

if (eng->trivial && !oneTest) {

1426

pos = qFindString(str0, len0, pos0, eng->goodStr.unicode(), eng->goodStr.length(), eng->cs);

1427

matchLen = eng->goodStr.length();

1428

matched = (pos != -1);

} else

#endif

{

in = str0;

if (in == 0)

in = &char_null;

pos = pos0;

caretPos = caretIndex;

len = len0;

minimal = minimal0;

matchLen = 0;

oneTestMatchedLen = 0;

1441

1442

if (eng->valid && pos >= 0 && pos <= len) {

1443

#ifndef QT_NO_REGEXP_OPTIM

1444

if (oneTest) {

1445

matched = matchHere();

1446

} else {

1447

if (pos <= len - eng->minl) {

1448

if (eng->caretAnchored) {

1449

matched = matchHere();

1450

} else if (eng->useGoodStringHeuristic) {

1451

matched = eng->goodStringMatch(*this);

1452

} else {

1453

matched = eng->badCharMatch(*this);

}

}

}

#else

matched = oneTest ? matchHere() : eng->bruteMatch(*this);

#endif

}

}

if (matched) {

int *c = captured;

*c++ = pos;

*c++ = matchLen;

int numCaptures = (capturedSize - 2) >> 1;

1469

#ifndef QT_NO_REGEXP_CAPTURE

1470

for (int i = 0; i < numCaptures; ++i) {

1471

int j = eng->captureForOfficialCapture.at(i);

1472

if (capBegin[j] != EmptyCapture) {

1473

int len = capEnd[j] - capBegin[j];

1474

*c++ = (len > 0) ? pos + capBegin[j] : 0;

*c++ = len;

} else {

*c++ = -1;

*c++ = -1;

}

}

#endif

} else {

// we rely on 2's complement here

1484

memset(captured, -1, capturedSize * sizeof(int));

}

}

The three following functions add one state to the automaton and

1490

return the number of the state.

1491

1492

1493

// Adds a state matching the single character ch; returns the state number.
int QRegExpEngine::createState(QChar ch)
{
    const ushort code = ch.unicode();
    return setupState(code);
}

1497

1498

int QRegExpEngine::createState(const QRegExpCharClass &cc)

1499

{

1500

#ifndef QT_NO_REGEXP_CCLASS

1501

int n = cl.size();

1502

cl += QRegExpCharClass(cc);

1503

return setupState(CharClassBit | n);

1504

#else

1505

Q_UNUSED(cc);

1506

return setupState(CharClassBit);

#endif

}

#ifndef QT_NO_REGEXP_BACKREF

1511

// Adds a state matching back-reference bref; returns the state number.
// nbrefs tracks the highest back-reference seen; exceeding MaxBackRefs
// reports RXERR_LIMIT and returns 0.
int QRegExpEngine::createState(int bref)
{
    const bool isNewMaximum = bref > nbrefs;
    if (isNewMaximum) {
        nbrefs = bref;
        if (nbrefs > MaxBackRefs) {
            error(RXERR_LIMIT);
            return 0;
        }
    }
    return setupState(BackRefBit | bref);
}

#endif

The two following functions add a transition between all pairs of

1526

states (i, j) where i is found in from, and j is found in to.

1527

1528

Cat-transitions are distinguished from plus-transitions for

capturing.

void QRegExpEngine::addCatTransitions(const QVector<int> &from, const QVector<int> &to)

1533

{

1534

for (int i = 0; i < from.size(); i++)

1535

mergeInto(&s[from.at(i)].outs, to);

1536

}

1537

1538

#ifndef QT_NO_REGEXP_CAPTURE

1539

void QRegExpEngine::addPlusTransitions(const QVector<int> &from, const QVector<int> &to, int atom)

1540

{

1541

for (int i = 0; i < from.size(); i++) {

1542

QRegExpAutomatonState &st = s[from.at(i)];

1543

const QVector<int> oldOuts = st.outs;

1544

mergeInto(&st.outs, to);

1545

if (f.at(atom).capture != QRegExpAtom::NoCapture) {

1546

for (int j = 0; j < to.size(); j++) {

1547

// ### st.reenter.contains(to.at(j)) check looks suspicious

1548

if (!st.reenter.contains(to.at(j)) &&

1549

!std::binary_search(oldOuts.constBegin(), oldOuts.constEnd(), to.at(j)))

1550

st.reenter.insert(to.at(j), atom);

}

}

}

}

#endif

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1558

1559

Returns an anchor that means a OR b.

1560

1561

int QRegExpEngine::anchorAlternation(int a, int b)

1562

{

1563

if (((a & b) == a || (a & b) == b) && ((a | b) & Anchor_Alternation) == 0)

return a & b;

int n = aa.size();

#ifndef QT_NO_REGEXP_OPTIM

1568

if (n > 0 && aa.at(n - 1).a == a && aa.at(n - 1).b == b)

1569

return Anchor_Alternation | (n - 1);

1570

#endif

1571

1572

QRegExpAnchorAlternation element = {a, b};

1573

aa.append(element);

1574

return Anchor_Alternation | n;

}

Returns an anchor that means a AND b.

1579

1580

int QRegExpEngine::anchorConcatenation(int a, int b)

1581

{

1582

if (((a | b) & Anchor_Alternation) == 0)

1583

return a | b;

1584

if ((b & Anchor_Alternation) != 0)

1585

qSwap(a, b);

1586

1587

int aprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).a, b);

1588

int bprime = anchorConcatenation(aa.at(a ^ Anchor_Alternation).b, b);

1589

return anchorAlternation(aprime, bprime);

}

#endif

Adds anchor a on a transition characterised by its from state and

1595

its to state.

1596

1597

void QRegExpEngine::addAnchors(int from, int to, int a)

1598

{

1599

QRegExpAutomatonState &st = s[from];

1600

if (st.anchors.contains(to))

1601

a = anchorAlternation(st.anchors.value(to), a);

1602

st.anchors.insert(to, a);

1603

}

1604

1605

#ifndef QT_NO_REGEXP_OPTIM

1606

1607

This function chooses between the good-string and the bad-character

1608

heuristics. It computes two scores and chooses the heuristic with

1609

the highest score.

1610

1611

Here are some common-sense constraints on the scores that should be

1612

respected if the formulas are ever modified: (1) If goodStr is

1613

empty, the good-string heuristic scores 0. (2) If the regular

1614

expression is trivial, the good-string heuristic should be used.

1615

(3) If the search is case insensitive, the good-string heuristic

1616

should be used, unless it scores 0. (Case insensitivity turns all

1617

entries of occ1 to 0.) (4) If (goodLateStart - goodEarlyStart) is

1618

big, the good-string heuristic should score less.

1619

1620

void QRegExpEngine::heuristicallyChooseHeuristic()

1621

{

1622

if (minl == 0) {

1623

useGoodStringHeuristic = false;

1624

} else if (trivial) {

1625

useGoodStringHeuristic = true;

1626

} else {

1627

1628

Magic formula: The good string has to constitute a good

1629

proportion of the minimum-length string, and appear at a

1630

more-or-less known index.

1631

1632

int goodStringScore = (64 * goodStr.length() / minl) -

1633

(goodLateStart - goodEarlyStart);

1634

1635

Less magic formula: We pick some characters at random, and

1636

check whether they are good or bad.

1637

1638

int badCharScore = 0;

1639

int step = qMax(1, NumBadChars / 32);

1640

for (int i = 1; i < NumBadChars; i += step) {

1641

if (occ1.at(i) == NoOccurrence)

1642

badCharScore += minl;

1643

else

1644

badCharScore += occ1.at(i);

1645

}

1646

badCharScore /= minl;

1647

useGoodStringHeuristic = (goodStringScore > badCharScore);

}

}

#endif

#if defined(QT_DEBUG)

1653

void QRegExpEngine::dump() const

1654

{

1655

int i, j;

1656

qDebug("Case %ssensitive engine", cs ? "" : "in");

1657

qDebug(" States");

1658

for (i = 0; i < s.size(); i++) {

1659

qDebug(" %d%s", i, i == InitialState ? " (initial)" : i == FinalState ? " (final)" : "");

1660

#ifndef QT_NO_REGEXP_CAPTURE

1661

if (nf > 0)

1662

qDebug(" in atom %d", s[i].atom);

1663

#endif

1664

int m = s[i].match;

1665

if ((m & CharClassBit) != 0) {

1666

qDebug(" match character class %d", m ^ CharClassBit);

1667

#ifndef QT_NO_REGEXP_CCLASS

1668

cl[m ^ CharClassBit].dump();

1669

#else

1670

qDebug(" negative character class");

1671

#endif

1672

} else if ((m & BackRefBit) != 0) {

1673

qDebug(" match back-reference %d", m ^ BackRefBit);

1674

} else if (m >= 0x20 && m <= 0x7e) {

1675

qDebug(" match 0x%.4x (%c)", m, m);

1676

} else {

1677

qDebug(" match 0x%.4x", m);

1678

}

1679

for (j = 0; j < s[i].outs.size(); j++) {

1680

int next = s[i].outs[j];

1681

qDebug(" -> %d", next);

1682

if (s[i].reenter.contains(next))

1683

qDebug(" [reenter %d]", s[i].reenter[next]);

1684

if (s[i].anchors.value(next) != 0)

1685

qDebug(" [anchors 0x%.8x]", s[i].anchors[next]);

1686

}

1687

}

1688

#ifndef QT_NO_REGEXP_CAPTURE

1689

if (nf > 0) {

1690

qDebug(" Atom Parent Capture");

1691

for (i = 0; i < nf; i++) {

1692

if (f[i].capture == QRegExpAtom::NoCapture) {

1693

qDebug(" %6d %6d nil", i, f[i].parent);

1694

} else {

1695

int cap = f[i].capture;

1696

bool official = captureForOfficialCapture.contains(cap);

1697

qDebug(" %6d %6d %6d %s", i, f[i].parent, f[i].capture,

1698

official ? "official" : "");

}

}

}

#endif

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1704

for (i = 0; i < aa.size(); i++)

1705

qDebug(" Anchor alternation 0x%.8x: 0x%.8x 0x%.9x", i, aa[i].a, aa[i].b);

#endif

}

#endif

void QRegExpEngine::setup()

1711

{

1712

ref.store(1);

1713

#ifndef QT_NO_REGEXP_CAPTURE

f.resize(32);

nf = 0;

cf = -1;

#endif

officialncap = 0;

ncap = 0;

#ifndef QT_NO_REGEXP_OPTIM

1721

caretAnchored = true;

trivial = true;

#endif

valid = false;

#ifndef QT_NO_REGEXP_BACKREF

1726

nbrefs = 0;

1727

#endif

1728

#ifndef QT_NO_REGEXP_OPTIM

1729

useGoodStringHeuristic = true;

1730

minl = 0;

1731

occ1.fill(0, NumBadChars);

#endif

}

int QRegExpEngine::setupState(int match)

1736

{

1737

#ifndef QT_NO_REGEXP_CAPTURE

1738

s += QRegExpAutomatonState(cf, match);

1739

#else

1740

s += QRegExpAutomatonState(match);

#endif

return s.size() - 1;

}

#ifndef QT_NO_REGEXP_CAPTURE

1746

1747

Functions startAtom() and finishAtom() should be called to delimit

1748

atoms. When a state is created, it is assigned to the current atom.

1749

The information is later used for capturing.

1750

1751

int QRegExpEngine::startAtom(bool officialCapture)

1752

{

1753

if ((nf & (nf + 1)) == 0 && nf + 1 >= f.size())

1754

f.resize((nf + 1) << 1);

1755

f[nf].parent = cf;

1756

cf = nf++;

1757

f[cf].capture = officialCapture ? QRegExpAtom::OfficialCapture : QRegExpAtom::NoCapture;

return cf;

}

void QRegExpEngine::finishAtom(int atom, bool needCapture)

1762

{

1763

if (greedyQuantifiers && needCapture && f[atom].capture == QRegExpAtom::NoCapture)

1764

f[atom].capture = QRegExpAtom::UnofficialCapture;

1765

cf = f.at(atom).parent;

}

#endif

#ifndef QT_NO_REGEXP_LOOKAHEAD

1770

1771

Creates a lookahead anchor.

1772

1773

/*
    Creates a lookahead anchor for eng and returns its anchor bit.

    On success eng is stored in a QRegExpLookahead, whose destructor
    deletes it. When MaxLookaheads lookaheads already exist, RXERR_LIMIT is
    reported, 0 is returned, and eng is deleted here so that it does not
    leak.

    NOTE(review): deleting on the error path assumes the caller hands over
    ownership unconditionally and does not use eng afterwards - confirm
    against the call site.
*/
int QRegExpEngine::addLookahead(QRegExpEngine *eng, bool negative)
{
    int n = ahead.size();
    if (n == MaxLookaheads) {
        delete eng;
        error(RXERR_LIMIT);
        return 0;
    }
    ahead += new QRegExpLookahead(eng, negative);
    return Anchor_FirstLookahead << n;
}

#endif

#ifndef QT_NO_REGEXP_CAPTURE

1786

1787

We want the longest leftmost captures.

1788

1789

/*
    We want the longest leftmost captures.

    Compares capture set 1 with capture set 2, capture by capture. At the
    first capture where they differ, set 1 is better if it begins earlier,
    or begins at the same place and ends later. Returns false when all
    ncap captures are identical.
*/
static bool isBetterCapture(int ncap, const int *begin1, const int *end1, const int *begin2,
                            const int *end2)
{
    for (int k = 0; k < ncap; ++k) {
        const int startDelta = begin2[k] - begin1[k]; // it has to start early...
        if (startDelta != 0)
            return startDelta > 0;

        const int endDelta = end1[k] - end2[k]; // ...and end late
        if (endDelta != 0)
            return endDelta > 0;
    }
    return false;
}

#endif

Returns \c true if anchor a matches at position pos + i in the input

1806

string, otherwise false.

1807

1808

bool QRegExpMatchState::testAnchor(int i, int a, const int *capBegin)

{

int j;

#ifndef QT_NO_REGEXP_ANCHOR_ALT

1813

if ((a & QRegExpEngine::Anchor_Alternation) != 0)

1814

return testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).a, capBegin)

1815

|| testAnchor(i, eng->aa.at(a ^ QRegExpEngine::Anchor_Alternation).b, capBegin);

1816

#endif

1817

1818

if ((a & QRegExpEngine::Anchor_Caret) != 0) {

1819

if (pos + i != caretPos)

1820

return false;

1821

}

1822

if ((a & QRegExpEngine::Anchor_Dollar) != 0) {

if (pos + i != len)

return false;

}

#ifndef QT_NO_REGEXP_ESCAPE

1827

if ((a & (QRegExpEngine::Anchor_Word | QRegExpEngine::Anchor_NonWord)) != 0) {

bool before = false;

bool after = false;

if (pos + i != 0)

before = isWord(in[pos + i - 1]);

1832

if (pos + i != len)

1833

after = isWord(in[pos + i]);

1834

if ((a & QRegExpEngine::Anchor_Word) != 0 && (before == after))

1835

return false;

1836

if ((a & QRegExpEngine::Anchor_NonWord) != 0 && (before != after))

return false;

}

#endif

#ifndef QT_NO_REGEXP_LOOKAHEAD

1841

if ((a & QRegExpEngine::Anchor_LookaheadMask) != 0) {

1842

const QVector<QRegExpLookahead *> &ahead = eng->ahead;

1843

for (j = 0; j < ahead.size(); j++) {

1844

if ((a & (QRegExpEngine::Anchor_FirstLookahead << j)) != 0) {

1845

QRegExpMatchState matchState;

1846

matchState.prepareForMatch(ahead[j]->eng);

1847

matchState.match(in + pos + i, len - pos - i, 0,

1848

true, true, caretPos - pos - i);

1849

if ((matchState.captured[0] == 0) == ahead[j]->neg)

return false;

}

}

}

#endif

#ifndef QT_NO_REGEXP_CAPTURE

1856

#ifndef QT_NO_REGEXP_BACKREF

1857

for (j = 0; j < eng->nbrefs; j++) {

1858

if ((a & (QRegExpEngine::Anchor_BackRef1Empty << j)) != 0) {

1859

int i = eng->captureForOfficialCapture.at(j);

1860

if (capBegin[i] != EmptyCapture)

return false;

}

}

#endif

#endif

return true;

}

#ifndef QT_NO_REGEXP_OPTIM

1870

1871

The three following functions are what Jeffrey Friedl would call

1872

transmissions (or bump-alongs). Using one or the other should make

1873

no difference except in performance.

1874

1875

1876

/*
    Bump-along driven by the good string: finds each occurrence of goodStr
    in the input and calls matchHere() at every start position from which
    that occurrence could belong to a match.
*/
bool QRegExpEngine::goodStringMatch(QRegExpMatchState &matchState) const
{
    QStringMatcher matcher(goodStr.unicode(), goodStr.length(), cs);
    int hit = matchState.pos + goodEarlyStart;

    for (;;) {
        hit = matcher.indexIn(matchState.in, matchState.len, hit);
        if (hit == -1)
            break;

        const int firstStart = hit - goodLateStart;
        const int lastStart = hit - goodEarlyStart;
        if (firstStart > matchState.pos)
            matchState.pos = firstStart;

        for (; matchState.pos <= lastStart; ++matchState.pos) {
            if (matchState.matchHere())
                return true;
        }
        ++hit;
    }
    return false;
}

/*
  Transmission based on the bad-character heuristic. matchState.slideTab
  is used as a circular table indexed relative to the current start
  position: a positive entry means the position can be skipped, and
  matchHere() is only tried when the entry at the head is not positive.
  Returns true as soon as matchHere() succeeds, false once the start
  position has reached len - minl without a match.
*/
bool QRegExpEngine::badCharMatch(QRegExpMatchState &matchState) const
{
    const int lastPos = matchState.len - minl;
    int head = 0;
    int next = 0;

    memset(matchState.slideTab, 0, matchState.slideTabSize * sizeof(int));

    /*
      Seed the slide table from the table of first occurrences (occ1),
      looking at the first minl characters from the start position.
    */
    for (int i = 0; i < minl; ++i) {
        int shift = occ1[BadChar(matchState.in[matchState.pos + i])];
        if (shift == NoOccurrence)
            shift = i + 1;
        if (shift <= 0)
            continue;

        int slot = i + 1 - shift;
        if (slot < 0) {
            shift = i + 1;
            slot = 0;
        }
        if (matchState.slideTab[slot] < shift)
            matchState.slideTab[slot] = shift;
    }

    if (matchState.pos > lastPos)
        return false;

    for (;;) {
        ++next;
        if (next >= matchState.slideTabSize)
            next = 0;

        const int pending = matchState.slideTab[head];
        if (pending > 0) {
            // Still sliding: hand the remaining distance to the next slot.
            if (matchState.slideTab[next] < pending - 1)
                matchState.slideTab[next] = pending - 1;
            matchState.slideTab[head] = 0;
        } else if (matchState.matchHere()) {
            return true;
        }

        if (matchState.pos == lastPos)
            break;

        /*
          Account for the character that enters the window when the
          start position advances by one; same rules as the seeding
          loop above.
        */
        const int shift = occ1[BadChar(matchState.in[matchState.pos + minl])];
        if (shift == NoOccurrence) {
            matchState.slideTab[next] = minl;
        } else if (shift > 0) {
            int slot = next + minl - shift;
            if (slot >= matchState.slideTabSize)
                slot -= matchState.slideTabSize;
            if (matchState.slideTab[slot] < shift)
                matchState.slideTab[slot] = shift;
        }

        head = next;
        ++matchState.pos;
    }
    return false;
}

#else

/*
  Transmission without any heuristic (used when QT_NO_REGEXP_OPTIM is
  defined): tries matchHere() at every start position from the current
  one up to and including len.
*/
bool QRegExpEngine::bruteMatch(QRegExpMatchState &matchState) const
{
    for (; matchState.pos <= matchState.len; ++matchState.pos) {
        if (matchState.matchHere())
            return true;
    }
    return false;
}

#endif

/*
  Here's the core of the engine. It tries to do a match here and now.
*/

1974

1975

bool QRegExpMatchState::matchHere()

1976

{

1977

int ncur = 1, nnext = 0;

int i = 0, j, k, m;

bool stop = false;

matchLen = -1;

oneTestMatchedLen = -1;

1983

curStack[0] = QRegExpEngine::InitialState;

1984

1985

int ncap = eng->ncap;

1986

#ifndef QT_NO_REGEXP_CAPTURE

1987

if (ncap > 0) {

1988

for (j = 0; j < ncap; j++) {

1989

curCapBegin[j] = EmptyCapture;

1990

curCapEnd[j] = EmptyCapture;

}

}

#endif

#ifndef QT_NO_REGEXP_BACKREF

1996

while ((ncur > 0 || !sleeping.isEmpty()) && i <= len - pos && !stop)

1997

#else

1998

while (ncur > 0 && i <= len - pos && !stop)

1999

#endif

2000

{

2001

int ch = (i < len - pos) ? in[pos + i].unicode() : 0;

2002

for (j = 0; j < ncur; j++) {

2003

int cur = curStack[j];

2004

const QRegExpAutomatonState &scur = eng->s.at(cur);

2005

const QVector<int> &outs = scur.outs;

2006

for (k = 0; k < outs.size(); k++) {

2007

int next = outs.at(k);

2008

const QRegExpAutomatonState &snext = eng->s.at(next);

2009

bool inside = true;

2010

#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)

2011

int needSomeSleep = 0;

#endif

First, check if the anchors are anchored properly.

2016

2017

int a = scur.anchors.value(next);

2018

if (a != 0 && !testAnchor(i, a, curCapBegin + j * ncap))

inside = false;

If indeed they are, check if the input character is

2023

correct for this transition.

if (inside) {

m = snext.match;

if ((m & (QRegExpEngine::CharClassBit | QRegExpEngine::BackRefBit)) == 0) {

if (eng->cs)

inside = (m == ch);

else

inside = (QChar(m).toLower() == QChar(ch).toLower());

2032

} else if (next == QRegExpEngine::FinalState) {

matchLen = i;

stop = minimal;

inside = true;

} else if ((m & QRegExpEngine::CharClassBit) != 0) {

2037

#ifndef QT_NO_REGEXP_CCLASS

2038

const QRegExpCharClass &cc = eng->cl.at(m ^ QRegExpEngine::CharClassBit);

2039

if (eng->cs)

2040

inside = cc.in(ch);

2041

else if (cc.negative())

2042

inside = cc.in(QChar(ch).toLower()) &&

2043

cc.in(QChar(ch).toUpper());

2044

else

2045

inside = cc.in(QChar(ch).toLower()) ||

2046

cc.in(QChar(ch).toUpper());

2047

#endif

2048

#if !defined(QT_NO_REGEXP_BACKREF) && !defined(QT_NO_REGEXP_CAPTURE)

2049

} else { /* ((m & QRegExpEngine::BackRefBit) != 0) */

2050

int bref = m ^ QRegExpEngine::BackRefBit;

2051

int ell = j * ncap + eng->captureForOfficialCapture.at(bref - 1);

2052

2053

inside = bref <= ncap && curCapBegin[ell] != EmptyCapture;

2054

if (inside) {

2055

if (eng->cs)

2056

inside = (in[pos + curCapBegin[ell]] == QChar(ch));

2057

else

2058

inside = (in[pos + curCapBegin[ell]].toLower()

2059

== QChar(ch).toLower());

}

if (inside) {

int delta;

if (curCapEnd[ell] == EmptyCapture)

2065

delta = i - curCapBegin[ell];

2066

else

2067

delta = curCapEnd[ell] - curCapBegin[ell];

2068

2069

inside = (delta <= len - (pos + i));

2070

if (inside && delta > 1) {

int n = 1;

if (eng->cs) {

while (n < delta) {

if (in[pos + curCapBegin[ell] + n]

!= in[pos + i + n])

break;

++n;

}

} else {

while (n < delta) {

QChar a = in[pos + curCapBegin[ell] + n];

2082

QChar b = in[pos + i + n];

2083

if (a.toLower() != b.toLower())

break;

++n;

}

}

inside = (n == delta);

2089

if (inside)

2090

needSomeSleep = delta - 1;

}

}

#endif

}

}

We must now update our data structures.

2099

2100

if (inside) {

2101

#ifndef QT_NO_REGEXP_CAPTURE

2102

int *capBegin, *capEnd;

2103

#endif

2104

2105

If the next state was not encountered yet, all

2106

is fine.

2107

2108

if ((m = inNextStack[next]) == -1) {

2109

m = nnext++;

2110

nextStack[m] = next;

2111

inNextStack[next] = m;

2112

#ifndef QT_NO_REGEXP_CAPTURE

2113

capBegin = nextCapBegin + m * ncap;

2114

capEnd = nextCapEnd + m * ncap;

2115

2116

2117

Otherwise, we'll first maintain captures in

2118

temporary arrays, and decide at the end whether

2119

it's best to keep the previous capture zones or

the new ones.

} else {

capBegin = tempCapBegin;

capEnd = tempCapEnd;

#endif

}

#ifndef QT_NO_REGEXP_CAPTURE

2129

2130

Updating the capture zones is much of a task.

2131

2132

if (ncap > 0) {

2133

memcpy(capBegin, curCapBegin + j * ncap, ncap * sizeof(int));

2134

memcpy(capEnd, curCapEnd + j * ncap, ncap * sizeof(int));

2135

int c = scur.atom, n = snext.atom;

int p = -1, q = -1;

int cap;

Lemma 1. For any x in the range [0..nf), we

2141

have f[x].parent < x.

2142

2143

Proof. By looking at startAtom(), it is

2144

clear that cf < nf holds all the time, and

2145

thus that f[nf].parent < nf.

If we are reentering an atom, we empty all

2150

capture zones inside it.

2151

2152

if ((q = scur.reenter.value(next)) != 0) {

2153

QBitArray b(eng->nf, false);

2154

b.setBit(q, true);

2155

for (int ell = q + 1; ell < eng->nf; ell++) {

2156

if (b.testBit(eng->f.at(ell).parent)) {

2157

b.setBit(ell, true);

2158

cap = eng->f.at(ell).capture;

2159

if (cap >= 0) {

2160

capBegin[cap] = EmptyCapture;

2161

capEnd[cap] = EmptyCapture;

}

}

}

p = eng->f.at(q).parent;

2166

2167

2168

Otherwise, close the capture zones we are

2169

leaving. We are leaving f[c].capture,

2170

f[f[c].parent].capture,

2171

f[f[f[c].parent].parent].capture, ...,

2172

until f[x].capture, with x such that

2173

f[x].parent is the youngest common ancestor

2174

for c and n.

2175

2176

We go up along c's and n's ancestry until

we find x.

} else {

p = c;

q = n;

while (p != q) {

if (p > q) {

cap = eng->f.at(p).capture;

2185

if (cap >= 0) {

2186

if (capBegin[cap] == i) {

2187

capBegin[cap] = EmptyCapture;

2188

capEnd[cap] = EmptyCapture;

} else {

capEnd[cap] = i;

}

}

p = eng->f.at(p).parent;

2194

} else {

2195

q = eng->f.at(q).parent;

}

}

}

In any case, we now open the capture zones

2202

we are entering. We work upwards from n

2203

until we reach p (the parent of the atom we

2204

reenter or the youngest common ancestor).

2205

2206

while (n > p) {

2207

cap = eng->f.at(n).capture;

2208

if (cap >= 0) {

2209

capBegin[cap] = i;

2210

capEnd[cap] = EmptyCapture;

2211

}

2212

n = eng->f.at(n).parent;

2213

}

2214

2215

If the next state was already in

2216

nextStack, we must choose carefully which

2217

capture zones we want to keep.

2218

2219

if (capBegin == tempCapBegin &&

2220

isBetterCapture(ncap, capBegin, capEnd, nextCapBegin + m * ncap,

2221

nextCapEnd + m * ncap)) {

2222

memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));

2223

memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));

2224

}

2225

}

2226

#ifndef QT_NO_REGEXP_BACKREF

2227

2228

We are done with updating the capture zones.

2229

It's now time to put the next state to sleep,

2230

if it needs to, and to remove it from

2231

nextStack.

2232

2233

if (needSomeSleep > 0) {

2234

QVector<int> zzZ(2 + 2 * ncap);

2235

zzZ[0] = i + needSomeSleep;

2236

zzZ[1] = next;

2237

if (ncap > 0) {

2238

memcpy(zzZ.data() + 2, capBegin, ncap * sizeof(int));

2239

memcpy(zzZ.data() + 2 + ncap, capEnd, ncap * sizeof(int));

2240

}

2241

inNextStack[nextStack[--nnext]] = -1;

2242

sleeping.append(zzZ);

}

#endif

#endif

}

}

}

#ifndef QT_NO_REGEXP_CAPTURE

2250

2251

If we reached the final state, hurray! Copy the captured

2252

zone.

2253

2254

if (ncap > 0 && (m = inNextStack[QRegExpEngine::FinalState]) != -1) {

2255

memcpy(capBegin, nextCapBegin + m * ncap, ncap * sizeof(int));

2256

memcpy(capEnd, nextCapEnd + m * ncap, ncap * sizeof(int));

2257

}

2258

#ifndef QT_NO_REGEXP_BACKREF

2259

2260

It's time to wake up the sleepers.

2261

2262

j = 0;

2263

while (j < sleeping.count()) {

2264

if (sleeping.at(j)[0] == i) {

2265

const QVector<int> &zzZ = sleeping.at(j);

2266

int next = zzZ[1];

2267

const int *capBegin = zzZ.data() + 2;

2268

const int *capEnd = zzZ.data() + 2 + ncap;

2269

bool copyOver = true;

2270

2271

if ((m = inNextStack[next]) == -1) {

2272

m = nnext++;

2273

nextStack[m] = next;

2274

inNextStack[next] = m;

2275

} else {

2276

copyOver = isBetterCapture(ncap, nextCapBegin + m * ncap, nextCapEnd + m * ncap,

capBegin, capEnd);

}

if (copyOver) {

memcpy(nextCapBegin + m * ncap, capBegin, ncap * sizeof(int));

2281

memcpy(nextCapEnd + m * ncap, capEnd, ncap * sizeof(int));

2282

}

2283

2284

sleeping.removeAt(j);

} else {

++j;

}

}

#endif

#endif

for (j = 0; j < nnext; j++)

2292

inNextStack[nextStack[j]] = -1;

2293

2294

// avoid needless iteration that confuses oneTestMatchedLen

2295

if (nnext == 1 && nextStack[0] == QRegExpEngine::FinalState

2296

#ifndef QT_NO_REGEXP_BACKREF

2297

&& sleeping.isEmpty()

#endif

)

stop = true;

qSwap(curStack, nextStack);

2303

#ifndef QT_NO_REGEXP_CAPTURE

2304

qSwap(curCapBegin, nextCapBegin);

2305

qSwap(curCapEnd, nextCapEnd);

#endif

ncur = nnext;

nnext = 0;

++i;

}

#ifndef QT_NO_REGEXP_BACKREF

2313

2314

If minimal matching is enabled, we might have some sleepers

2315

left.

2316

2317

if (!sleeping.isEmpty())

sleeping.clear();

#endif

oneTestMatchedLen = i - 1;

2322

return (matchLen >= 0);

2323

}

2324

2325

#ifndef QT_NO_REGEXP_CCLASS

2326

2327

/*
  Constructs an empty, non-negated character class: no categories (c)
  and negation flag (n) cleared. With the optimizer enabled, every entry
  of the first-occurrence table is set to NoOccurrence.
*/
QRegExpCharClass::QRegExpCharClass()
    : c(0),
      n(false)
{
#ifndef QT_NO_REGEXP_OPTIM
    occ1.fill(NoOccurrence, NumBadChars);
#endif
}

/*
  Resets the character class to its empty, non-negated state: no
  categories, no ranges, negation flag cleared.
*/
void QRegExpCharClass::clear()
{
    c = 0;
    r.resize(0);    // drop all ranges
    n = false;
}

executed 34392 times by 116 tests: end of block

Executed by:

tst_Collections
tst_Lancelot
tst_ModelTest
tst_NetworkSelfTest
tst_QAbstractFileEngine
tst_QAbstractItemModel
tst_QAbstractItemView
tst_QAbstractNetworkCache
tst_QAccessibility
tst_QApplication
tst_QCalendarWidget
tst_QComboBox
tst_QCompleter
tst_QDBusInterface
tst_QDataStream
tst_QDate
tst_QDateTime
tst_QDateTimeEdit
tst_QDir
tst_QDirIterator
tst_QDirModel
tst_QFactoryLoader
tst_QFile
tst_QFileDialog2
tst_QFileIconProvider
...

34392

2341

2342

void QRegExpCharClass::setNegative(bool negative)

2343

{

2344

n = negative;

2345

#ifndef QT_NO_REGEXP_OPTIM

2346

occ1.fill(0, NumBadChars);

#endif

}

/*
  Adds the character categories whose flags are set in cats to the
  class. Only the flags of the categories listed below are kept; any
  other bit of cats is ignored. With the optimizer enabled, the whole
  first-occurrence table is reset to 0.
*/
void QRegExpCharClass::addCategories(uint cats)
{
    static const int knownCategories =
            // marks
            FLAG(QChar::Mark_NonSpacing) |
            FLAG(QChar::Mark_SpacingCombining) |
            FLAG(QChar::Mark_Enclosing) |
            // numbers
            FLAG(QChar::Number_DecimalDigit) |
            FLAG(QChar::Number_Letter) |
            FLAG(QChar::Number_Other) |
            // separators
            FLAG(QChar::Separator_Space) |
            FLAG(QChar::Separator_Line) |
            FLAG(QChar::Separator_Paragraph) |
            // others
            FLAG(QChar::Other_Control) |
            FLAG(QChar::Other_Format) |
            FLAG(QChar::Other_Surrogate) |
            FLAG(QChar::Other_PrivateUse) |
            FLAG(QChar::Other_NotAssigned) |
            // letters
            FLAG(QChar::Letter_Uppercase) |
            FLAG(QChar::Letter_Lowercase) |
            FLAG(QChar::Letter_Titlecase) |
            FLAG(QChar::Letter_Modifier) |
            FLAG(QChar::Letter_Other) |
            // punctuation
            FLAG(QChar::Punctuation_Connector) |
            FLAG(QChar::Punctuation_Dash) |
            FLAG(QChar::Punctuation_Open) |
            FLAG(QChar::Punctuation_Close) |
            FLAG(QChar::Punctuation_InitialQuote) |
            FLAG(QChar::Punctuation_FinalQuote) |
            FLAG(QChar::Punctuation_Other) |
            // symbols
            FLAG(QChar::Symbol_Math) |
            FLAG(QChar::Symbol_Currency) |
            FLAG(QChar::Symbol_Modifier) |
            FLAG(QChar::Symbol_Other);

    c |= (knownCategories & cats);
#ifndef QT_NO_REGEXP_OPTIM
    occ1.fill(0, NumBadChars);
#endif
}

/*
  Adds the inclusive range [from, to] to the class; the bounds are
  swapped first if they are given in reverse order. With the optimizer
  enabled, the first-occurrence entries of all characters of the range
  (taken modulo NumBadChars) are set to 0.
*/
void QRegExpCharClass::addRange(ushort from, ushort to)
{
    if (from > to)
        qSwap(from, to);

    const int slot = r.size();
    r.resize(slot + 1);
    r[slot].from = from;
    r[slot].len = to - from + 1;

#ifndef QT_NO_REGEXP_OPTIM
    if (to - from >= NumBadChars) {
        // The range covers every slot of the table.
        occ1.fill(0, NumBadChars);
    } else {
        const int first = from % NumBadChars;
        const int last = to % NumBadChars;
        if (first <= last) {
            for (int i = first; i <= last; ++i)
                occ1[i] = 0;
        } else {
            // The range wraps around the end of the table.
            for (int i = 0; i <= last; ++i)
                occ1[i] = 0;
            for (int i = first; i < NumBadChars; ++i)
                occ1[i] = 0;
        }
    }
#endif
}

/*
  Returns whether ch belongs to the class. A character matching one of
  the categories or one of the ranges yields !n; any other character
  yields n, where n is the negation flag. With the optimizer enabled, a
  NoOccurrence entry in the first-occurrence table short-circuits the
  test and yields n.
*/
bool QRegExpCharClass::in(QChar ch) const
{
#ifndef QT_NO_REGEXP_OPTIM
    if (occ1.at(BadChar(ch)) == NoOccurrence)
        return n;
#endif

    if (c != 0 && (c & FLAG(ch.category())) != 0)
        return !n;

    const int uc = ch.unicode();
    for (int i = 0, count = r.size(); i < count; ++i) {
        const QRegExpCharClassRange &range = r.at(i);
        // A single unsigned comparison checks both bounds of the range.
        if (uint(uc - range.from) < uint(range.len))
            return !n;
    }
    return n;
}

#if defined(QT_DEBUG)

2438

void QRegExpCharClass::dump() const

2439

{

2440

int i;

2441

qDebug(" %stive character class", n ? "nega" : "posi");

2442

#ifndef QT_NO_REGEXP_CCLASS

2443

if (c != 0)

2444

qDebug(" categories 0x%.8x", c);

2445

#endif

2446

for (i = 0; i < r.size(); i++)

2447

qDebug(" 0x%.4x through 0x%.4x", r[i].from, r[i].from + r[i].len - 1);

}

#endif

#endif

/*
  Constructs an empty box attached to engine: no skip anchors, a minimum
  length of 0 and, with the optimizer enabled, zeroed start offsets and
  maximum length plus a first-occurrence table filled with NoOccurrence.
*/
QRegExpEngine::Box::Box(QRegExpEngine *engine)
    : eng(engine),
      skipanchors(0)
#ifndef QT_NO_REGEXP_OPTIM
      , earlyStart(0),
      lateStart(0),
      maxl(0)
#endif
{
    minl = 0;
#ifndef QT_NO_REGEXP_OPTIM
    occ1.fill(NoOccurrence, NumBadChars);
#endif
}

/*
  Member-wise assignment: copies every member of b into this box and
  returns a reference to this box.
*/
QRegExpEngine::Box &QRegExpEngine::Box::operator=(const Box &b)
{
    eng = b.eng;

    // Left and right states.
    ls = b.ls;
    rs = b.rs;

    // Anchors.
    lanchors = b.lanchors;
    ranchors = b.ranchors;
    skipanchors = b.skipanchors;

#ifndef QT_NO_REGEXP_OPTIM
    // Data gathered for the heuristics.
    earlyStart = b.earlyStart;
    lateStart = b.lateStart;
    str = b.str;
    leftStr = b.leftStr;
    rightStr = b.rightStr;
    maxl = b.maxl;
    occ1 = b.occ1;
#endif

    minl = b.minl;
    return *this;
}

/*
  Makes this box match the single character ch: one new engine state
  serves as both left and right state, and the minimum length is 1.
  With the optimizer enabled, ch also becomes the box's strings, the
  maximum length is 1 and the first occurrence of ch is at offset 0.
*/
void QRegExpEngine::Box::set(QChar ch)
{
    ls.resize(1);
    ls[0] = eng->createState(ch);
    rs = ls;
    minl = 1;
#ifndef QT_NO_REGEXP_OPTIM
    maxl = 1;
    str = ch;
    leftStr = ch;
    rightStr = ch;
    occ1[BadChar(ch)] = 0;
#endif
}

void QRegExpEngine::Box::set(const QRegExpCharClass &cc)

2501

{

2502

ls.resize(1);

2503

ls[0] = eng->createState(cc);

2504

rs = ls;

2505

#ifndef QT_NO_REGEXP_OPTIM

2506

maxl = 1;

2507

occ1 = cc.firstOccurrence();

#endif

minl = 1;

}

#ifndef QT_NO_REGEXP_BACKREF

2513

void QRegExpEngine::Box::set(int bref)

2514

{

2515

ls.resize(1);

2516

ls[0] = eng->createState(bref);

2517

rs = ls;

2518

if (bref >= 1 && bref <= MaxBackRefs)

2519

skipanchors = Anchor_BackRef0Empty << bref;

2520

#ifndef QT_NO_REGEXP_OPTIM

maxl = InftyLen;

#endif

minl = 0;

}

#endif

void QRegExpEngine::Box::cat(const Box &b)

2528

{

2529

eng->addCatTransitions(rs, b.ls);

2530

addAnchorsToEngine(b);

2531

if (minl == 0) {

2532

lanchors.unite(b.lanchors);

2533

if (skipanchors != 0) {

2534

for (int i = 0; i < b.ls.size(); i++) {

2535

int a = eng->anchorConcatenation(lanchors.value(b.ls.at(i), 0), skipanchors);

2536

lanchors.insert(b.ls.at(i), a);

2537

}

2538

}

2539

mergeInto(&ls, b.ls);

2540

}

2541

if (b.minl == 0) {

2542

ranchors.unite(b.ranchors);

2543

if (b.skipanchors != 0) {

2544

for (int i = 0; i < rs.size(); i++) {

2545

int a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), b.skipanchors);

2546

ranchors.insert(rs.at(i), a);

2547

}

2548

}

2549

mergeInto(&rs, b.rs);

2550

} else {

2551

ranchors = b.ranchors;

rs = b.rs;

}

#ifndef QT_NO_REGEXP_OPTIM

2556

if (maxl != InftyLen) {

2557

if (rightStr.length() + b.leftStr.length() >

2558

qMax(str.length(), b.str.length())) {

2559

earlyStart = minl - rightStr.length();

2560

lateStart = maxl - rightStr.length();

2561

str = rightStr + b.leftStr;

2562

} else if (b.str.length() > str.length()) {

2563

earlyStart = minl + b.earlyStart;

2564

lateStart = maxl + b.lateStart;

str = b.str;

}

}

if (leftStr.length() == maxl)

2570

leftStr += b.leftStr;

2571

2572

if (b.rightStr.length() == b.maxl) {

2573

rightStr += b.rightStr;

2574

} else {

2575

rightStr = b.rightStr;

2576

}

2577

2578

if (maxl == InftyLen || b.maxl == InftyLen) {

maxl = InftyLen;

} else {

maxl += b.maxl;

}

for (int i = 0; i < NumBadChars; i++) {

2585

if (b.occ1.at(i) != NoOccurrence && minl + b.occ1.at(i) < occ1.at(i))

2586

occ1[i] = minl + b.occ1.at(i);

}

#endif

minl += b.minl;

if (minl == 0)

skipanchors = eng->anchorConcatenation(skipanchors, b.skipanchors);

else

skipanchors = 0;

}

void QRegExpEngine::Box::orx(const Box &b)

2598

{

2599

mergeInto(&ls, b.ls);

2600

lanchors.unite(b.lanchors);

2601

mergeInto(&rs, b.rs);

2602

ranchors.unite(b.ranchors);

if (b.minl == 0) {

if (minl == 0)

skipanchors = eng->anchorAlternation(skipanchors, b.skipanchors);

2607

else

2608

skipanchors = b.skipanchors;

2609

}

2610

2611

#ifndef QT_NO_REGEXP_OPTIM

2612

for (int i = 0; i < NumBadChars; i++) {

2613

if (occ1.at(i) > b.occ1.at(i))

2614

occ1[i] = b.occ1.at(i);

}

earlyStart = 0;

lateStart = 0;

str = QString();

leftStr = QString();

rightStr = QString();

if (b.maxl > maxl)

maxl = b.maxl;

#endif

if (b.minl < minl)

minl = b.minl;

}

void QRegExpEngine::Box::plus(int atom)

2629

{

2630

#ifndef QT_NO_REGEXP_CAPTURE

2631

eng->addPlusTransitions(rs, ls, atom);

2632

#else

2633

Q_UNUSED(atom);

2634

eng->addCatTransitions(rs, ls);

2635

#endif

2636

addAnchorsToEngine(*this);

2637

#ifndef QT_NO_REGEXP_OPTIM

maxl = InftyLen;

#endif

}

void QRegExpEngine::Box::opt()

2643

{

2644

#ifndef QT_NO_REGEXP_OPTIM

earlyStart = 0;

lateStart = 0;

str = QString();

leftStr = QString();

rightStr = QString();

#endif

skipanchors = 0;

minl = 0;

}

void QRegExpEngine::Box::catAnchor(int a)

2656

{

2657

if (a != 0) {

2658

for (int i = 0; i < rs.size(); i++) {

2659

a = eng->anchorConcatenation(ranchors.value(rs.at(i), 0), a);

2660

ranchors.insert(rs.at(i), a);

2661

}

2662

if (minl == 0)

2663

skipanchors = eng->anchorConcatenation(skipanchors, a);

}

}

#ifndef QT_NO_REGEXP_OPTIM

2668

void QRegExpEngine::Box::setupHeuristics()

2669

{

2670

eng->goodEarlyStart = earlyStart;

2671

eng->goodLateStart = lateStart;

2672

eng->goodStr = eng->cs ? str : str.toLower();

eng->minl = minl;

if (eng->cs) {

A regular expression such as 112|1 has occ1['2'] = 2 and minl =

2678

1 at this point. An entry of occ1 has to be at most minl or

2679

infinity for the rest of the algorithm to go well.

2680

2681

We waited until here before normalizing these cases (instead of

2682

doing it in Box::orx()) because sometimes things improve by

2683

themselves. Consider for example (112|1)34.

2684

2685

for (int i = 0; i < NumBadChars; i++) {

2686

if (occ1.at(i) != NoOccurrence && occ1.at(i) >= minl)

occ1[i] = minl;

}

eng->occ1 = occ1;

} else {

eng->occ1.fill(0, NumBadChars);

2692

}

2693

2694

eng->heuristicallyChooseHeuristic();

}

#endif

#if defined(QT_DEBUG)

2699

void QRegExpEngine::Box::dump() const

2700

{

2701

int i;

2702

qDebug("Box of at least %d character%s", minl, minl == 1 ? "" : "s");

2703

qDebug(" Left states:");

2704

for (i = 0; i < ls.size(); i++) {

2705

if (lanchors.value(ls[i], 0) == 0)

2706

qDebug(" %d", ls[i]);

2707

else

2708

qDebug(" %d [anchors 0x%.8x]", ls[i], lanchors[ls[i]]);

2709

}

2710

qDebug(" Right states:");

2711

for (i = 0; i < rs.size(); i++) {

2712

if (ranchors.value(rs[i], 0) == 0)

2713

qDebug(" %d", rs[i]);

2714

else

2715

qDebug(" %d [anchors 0x%.8x]", rs[i], ranchors[rs[i]]);

2716

}

2717

qDebug(" Skip anchors: 0x%.8x", skipanchors);

}

#endif

void QRegExpEngine::Box::addAnchorsToEngine(const Box &to) const

2722

{

2723

for (int i = 0; i < to.ls.size(); i++) {

2724

for (int j = 0; j < rs.size(); j++) {

2725

int a = eng->anchorConcatenation(ranchors.value(rs.at(j), 0),

2726

to.lanchors.value(to.ls.at(i), 0));

2727

eng->addAnchors(rs[j], to.ls[i], a);

}

}

}

#ifndef QT_NO_REGEXP_CCLASS

2733

// fast lookup hash for xml schema extensions

2734

// sorted by name for b-search

2735

static const struct CategoriesRangeMapEntry {

2736

const char name[40];

2737

uint first, second;

2738

} categoriesRangeMap[] = {

2739

{ "AegeanNumbers", 0x10100, 0x1013F },

2740

{ "AlphabeticPresentationForms", 0xFB00, 0xFB4F },

2741

{ "AncientGreekMusicalNotation", 0x1D200, 0x1D24F },

2742

{ "AncientGreekNumbers", 0x10140, 0x1018F },

2743

{ "Arabic", 0x0600, 0x06FF },

2744

{ "ArabicPresentationForms-A", 0xFB50, 0xFDFF },

2745

{ "ArabicPresentationForms-B", 0xFE70, 0xFEFF },

2746

{ "ArabicSupplement", 0x0750, 0x077F },

2747

{ "Armenian", 0x0530, 0x058F },

2748

{ "Arrows", 0x2190, 0x21FF },

2749

{ "BasicLatin", 0x0000, 0x007F },

2750

{ "Bengali", 0x0980, 0x09FF },

2751

{ "BlockElements", 0x2580, 0x259F },

2752

{ "Bopomofo", 0x3100, 0x312F },

2753

{ "BopomofoExtended", 0x31A0, 0x31BF },

2754

{ "BoxDrawing", 0x2500, 0x257F },

2755

{ "BraillePatterns", 0x2800, 0x28FF },

2756

{ "Buginese", 0x1A00, 0x1A1F },

2757

{ "Buhid", 0x1740, 0x175F },

2758

{ "ByzantineMusicalSymbols", 0x1D000, 0x1D0FF },

2759

{ "CJKCompatibility", 0x3300, 0x33FF },

2760

{ "CJKCompatibilityForms", 0xFE30, 0xFE4F },

2761

{ "CJKCompatibilityIdeographs", 0xF900, 0xFAFF },

2762

{ "CJKCompatibilityIdeographsSupplement", 0x2F800, 0x2FA1F },

2763

{ "CJKRadicalsSupplement", 0x2E80, 0x2EFF },

2764

{ "CJKStrokes", 0x31C0, 0x31EF },

2765

{ "CJKSymbolsandPunctuation", 0x3000, 0x303F },

2766

{ "CJKUnifiedIdeographs", 0x4E00, 0x9FFF },

2767

{ "CJKUnifiedIdeographsExtensionA", 0x3400, 0x4DB5 },

2768

{ "CJKUnifiedIdeographsExtensionB", 0x20000, 0x2A6DF },

2769

{ "Cherokee", 0x13A0, 0x13FF },

2770

{ "CombiningDiacriticalMarks", 0x0300, 0x036F },

2771

{ "CombiningDiacriticalMarksSupplement", 0x1DC0, 0x1DFF },

2772

{ "CombiningHalfMarks", 0xFE20, 0xFE2F },

2773

{ "CombiningMarksforSymbols", 0x20D0, 0x20FF },

2774

{ "ControlPictures", 0x2400, 0x243F },

2775

{ "Coptic", 0x2C80, 0x2CFF },

2776

{ "CurrencySymbols", 0x20A0, 0x20CF },

2777

{ "CypriotSyllabary", 0x10800, 0x1083F },

2778

{ "Cyrillic", 0x0400, 0x04FF },

2779

{ "CyrillicSupplement", 0x0500, 0x052F },

2780

{ "Deseret", 0x10400, 0x1044F },

2781

{ "Devanagari", 0x0900, 0x097F },

2782

{ "Dingbats", 0x2700, 0x27BF },

2783

{ "EnclosedAlphanumerics", 0x2460, 0x24FF },

2784

{ "EnclosedCJKLettersandMonths", 0x3200, 0x32FF },

2785

{ "Ethiopic", 0x1200, 0x137F },

2786

{ "EthiopicExtended", 0x2D80, 0x2DDF },

2787

{ "EthiopicSupplement", 0x1380, 0x139F },

2788

{ "GeneralPunctuation", 0x2000, 0x206F },

2789

{ "GeometricShapes", 0x25A0, 0x25FF },

2790

{ "Georgian", 0x10A0, 0x10FF },

2791

{ "GeorgianSupplement", 0x2D00, 0x2D2F },

2792

{ "Glagolitic", 0x2C00, 0x2C5F },

2793

{ "Gothic", 0x10330, 0x1034F },

2794

{ "Greek", 0x0370, 0x03FF },

2795

{ "GreekExtended", 0x1F00, 0x1FFF },

2796

{ "Gujarati", 0x0A80, 0x0AFF },

2797

{ "Gurmukhi", 0x0A00, 0x0A7F },

2798

{ "HalfwidthandFullwidthForms", 0xFF00, 0xFFEF },

2799

{ "HangulCompatibilityJamo", 0x3130, 0x318F },

2800

{ "HangulJamo", 0x1100, 0x11FF },

2801

{ "HangulSyllables", 0xAC00, 0xD7A3 },

2802

{ "Hanunoo", 0x1720, 0x173F },

2803

{ "Hebrew", 0x0590, 0x05FF },

2804

{ "Hiragana", 0x3040, 0x309F },

2805

{ "IPAExtensions", 0x0250, 0x02AF },

2806

{ "IdeographicDescriptionCharacters", 0x2FF0, 0x2FFF },

2807

{ "Kanbun", 0x3190, 0x319F },

2808

{ "KangxiRadicals", 0x2F00, 0x2FDF },

2809

{ "Kannada", 0x0C80, 0x0CFF },

2810

{ "Katakana", 0x30A0, 0x30FF },

2811

{ "KatakanaPhoneticExtensions", 0x31F0, 0x31FF },

2812

{ "Kharoshthi", 0x10A00, 0x10A5F },

2813

{ "Khmer", 0x1780, 0x17FF },

2814

{ "KhmerSymbols", 0x19E0, 0x19FF },

2815

{ "Lao", 0x0E80, 0x0EFF },

2816

{ "Latin-1Supplement", 0x0080, 0x00FF },

2817

{ "LatinExtended-A", 0x0100, 0x017F },

2818

{ "LatinExtended-B", 0x0180, 0x024F },

2819

{ "LatinExtendedAdditional", 0x1E00, 0x1EFF },

2820

{ "LetterlikeSymbols", 0x2100, 0x214F },

2821

{ "Limbu", 0x1900, 0x194F },

2822

{ "LinearBIdeograms", 0x10080, 0x100FF },

2823

{ "LinearBSyllabary", 0x10000, 0x1007F },

2824

{ "Malayalam", 0x0D00, 0x0D7F },

2825

{ "MathematicalAlphanumericSymbols", 0x1D400, 0x1D7FF },

2826

{ "MathematicalOperators", 0x2200, 0x22FF },

2827

{ "MiscellaneousMathematicalSymbols-A", 0x27C0, 0x27EF },

2828

{ "MiscellaneousMathematicalSymbols-B", 0x2980, 0x29FF },

2829

{ "MiscellaneousSymbols", 0x2600, 0x26FF },

2830

{ "MiscellaneousSymbolsandArrows", 0x2B00, 0x2BFF },

2831

{ "MiscellaneousTechnical", 0x2300, 0x23FF },

2832

{ "ModifierToneLetters", 0xA700, 0xA71F },

2833

{ "Mongolian", 0x1800, 0x18AF },

2834

{ "MusicalSymbols", 0x1D100, 0x1D1FF },

2835

{ "Myanmar", 0x1000, 0x109F },

2836

{ "NewTaiLue", 0x1980, 0x19DF },

2837

{ "NumberForms", 0x2150, 0x218F },

2838

{ "Ogham", 0x1680, 0x169F },

2839

{ "OldItalic", 0x10300, 0x1032F },

2840

{ "OldPersian", 0x103A0, 0x103DF },

2841

{ "OpticalCharacterRecognition", 0x2440, 0x245F },

2842

{ "Oriya", 0x0B00, 0x0B7F },

2843

{ "Osmanya", 0x10480, 0x104AF },

2844

{ "PhoneticExtensions", 0x1D00, 0x1D7F },

2845

{ "PhoneticExtensionsSupplement", 0x1D80, 0x1DBF },

2846

{ "PrivateUse", 0xE000, 0xF8FF },

2847

{ "Runic", 0x16A0, 0x16FF },

2848

{ "Shavian", 0x10450, 0x1047F },

2849

{ "Sinhala", 0x0D80, 0x0DFF },

2850

{ "SmallFormVariants", 0xFE50, 0xFE6F },

2851

{ "SpacingModifierLetters", 0x02B0, 0x02FF },

2852

{ "Specials", 0xFFF0, 0xFFFF },

2853

{ "SuperscriptsandSubscripts", 0x2070, 0x209F },

2854

{ "SupplementalArrows-A", 0x27F0, 0x27FF },

2855

{ "SupplementalArrows-B", 0x2900, 0x297F },

2856

{ "SupplementalMathematicalOperators", 0x2A00, 0x2AFF },

2857

{ "SupplementalPunctuation", 0x2E00, 0x2E7F },

2858

{ "SupplementaryPrivateUseArea-A", 0xF0000, 0xFFFFF },

2859

{ "SupplementaryPrivateUseArea-B", 0x100000, 0x10FFFF },

2860

{ "SylotiNagri", 0xA800, 0xA82F },

2861

{ "Syriac", 0x0700, 0x074F },

2862

{ "Tagalog", 0x1700, 0x171F },

2863

{ "Tagbanwa", 0x1760, 0x177F },

2864

{ "Tags", 0xE0000, 0xE007F },

2865

{ "TaiLe", 0x1950, 0x197F },

2866

{ "TaiXuanJingSymbols", 0x1D300, 0x1D35F },

2867

{ "Tamil", 0x0B80, 0x0BFF },

2868

{ "Telugu", 0x0C00, 0x0C7F },

2869

{ "Thaana", 0x0780, 0x07BF },

2870

{ "Thai", 0x0E00, 0x0E7F },

2871

{ "Tibetan", 0x0F00, 0x0FFF },

2872

{ "Tifinagh", 0x2D30, 0x2D7F },

2873

{ "Ugaritic", 0x10380, 0x1039F },

2874

{ "UnifiedCanadianAboriginalSyllabics", 0x1400, 0x167F },

2875

{ "VariationSelectors", 0xFE00, 0xFE0F },

2876

{ "VariationSelectorsSupplement", 0xE0100, 0xE01EF },

2877

{ "VerticalForms", 0xFE10, 0xFE1F },

2878

{ "YiRadicals", 0xA490, 0xA4CF },

2879

{ "YiSyllables", 0xA000, 0xA48F },

2880

{ "YijingHexagramSymbols", 0x4DC0, 0x4DFF }

2881

};

2882

2883

inline bool operator<(const CategoriesRangeMapEntry &entry1, const CategoriesRangeMapEntry &entry2)

2884

{ return qstrcmp(entry1.name, entry2.name) < 0; }

2885

inline bool operator<(const char *name, const CategoriesRangeMapEntry &entry)

2886

{ return qstrcmp(name, entry.name) < 0; }

2887

inline bool operator<(const CategoriesRangeMapEntry &entry, const char *name)

2888

{ return qstrcmp(entry.name, name) < 0; }

2889

#endif // QT_NO_REGEXP_CCLASS

2890

2891

int QRegExpEngine::getChar()

2892

{

2893

return (yyPos == yyLen) ? EOS : yyIn[yyPos++].unicode();

2894

}

2895

2896

int QRegExpEngine::getEscape()

2897

{

2898

#ifndef QT_NO_REGEXP_ESCAPE

2899

const char tab[] = "afnrtv"; // no b, as \b means word boundary

2900

const char backTab[] = "\a\f\n\r\t\v";

ushort low;

int i;

#endif

ushort val;

int prevCh = yyCh;

if (prevCh == EOS) {

error(RXERR_END);

return Tok_Char | '\\';

2910

}

2911

yyCh = getChar();

2912

#ifndef QT_NO_REGEXP_ESCAPE

2913

if ((prevCh & ~0xff) == 0) {

2914

const char *p = strchr(tab, prevCh);

2915

if (p != 0)

2916

return Tok_Char | backTab[p - tab];

}

#endif

switch (prevCh) {

#ifndef QT_NO_REGEXP_ESCAPE

2922

case '0':

2923

val = 0;

2924

for (i = 0; i < 3; i++) {

2925

if (yyCh >= '0' && yyCh <= '7')

2926

val = (val << 3) | (yyCh - '0');

else

break;

yyCh = getChar();

}

if ((val & ~0377) != 0)

2932

error(RXERR_OCTAL);

2933

return Tok_Char | val;

2934

#endif

2935

#ifndef QT_NO_REGEXP_ESCAPE

case 'B':

return Tok_NonWord;

#endif

#ifndef QT_NO_REGEXP_CCLASS

2940

case 'D':

2941

// see QChar::isDigit()

2942

yyCharClass->addCategories(uint(-1) ^ FLAG(QChar::Number_DecimalDigit));

2943

return Tok_CharClass;

2944

case 'S':

2945

// see QChar::isSpace()

2946

yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Separator_Space) |

2947

FLAG(QChar::Separator_Line) |

2948

FLAG(QChar::Separator_Paragraph) |

2949

FLAG(QChar::Other_Control)));

2950

yyCharClass->addRange(0x0000, 0x0008);

2951

yyCharClass->addRange(0x000e, 0x001f);

2952

yyCharClass->addRange(0x007f, 0x0084);

2953

yyCharClass->addRange(0x0086, 0x009f);

2954

return Tok_CharClass;

2955

case 'W':

2956

// see QChar::isLetterOrNumber() and QChar::isMark()

2957

yyCharClass->addCategories(uint(-1) ^ (FLAG(QChar::Mark_NonSpacing) |

2958

FLAG(QChar::Mark_SpacingCombining) |

2959

FLAG(QChar::Mark_Enclosing) |

2960

FLAG(QChar::Number_DecimalDigit) |

2961

FLAG(QChar::Number_Letter) |

2962

FLAG(QChar::Number_Other) |

2963

FLAG(QChar::Letter_Uppercase) |

2964

FLAG(QChar::Letter_Lowercase) |

2965

FLAG(QChar::Letter_Titlecase) |

2966

FLAG(QChar::Letter_Modifier) |

2967

FLAG(QChar::Letter_Other) |

2968

FLAG(QChar::Punctuation_Connector)));

2969

yyCharClass->addRange(0x203f, 0x2040);

2970

yyCharClass->addSingleton(0x2040);

2971

yyCharClass->addSingleton(0x2054);

2972

yyCharClass->addSingleton(0x30fb);

2973

yyCharClass->addRange(0xfe33, 0xfe34);

2974

yyCharClass->addRange(0xfe4d, 0xfe4f);

2975

yyCharClass->addSingleton(0xff3f);

2976

yyCharClass->addSingleton(0xff65);

2977

return Tok_CharClass;

2978

#endif

2979

#ifndef QT_NO_REGEXP_ESCAPE

case 'b':

return Tok_Word;

#endif

#ifndef QT_NO_REGEXP_CCLASS

2984

case 'd':

2985

// see QChar::isDigit()

2986

yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit));

2987

return Tok_CharClass;

2988

case 's':

2989

// see QChar::isSpace()

2990

yyCharClass->addCategories(FLAG(QChar::Separator_Space) |

2991

FLAG(QChar::Separator_Line) |

2992

FLAG(QChar::Separator_Paragraph));

2993

yyCharClass->addRange(0x0009, 0x000d);

2994

yyCharClass->addSingleton(0x0085);

2995

return Tok_CharClass;

2996

case 'w':

2997

// see QChar::isLetterOrNumber() and QChar::isMark()

2998

yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |

2999

FLAG(QChar::Mark_SpacingCombining) |

3000

FLAG(QChar::Mark_Enclosing) |

3001

FLAG(QChar::Number_DecimalDigit) |

3002

FLAG(QChar::Number_Letter) |

3003

FLAG(QChar::Number_Other) |

3004

FLAG(QChar::Letter_Uppercase) |

3005

FLAG(QChar::Letter_Lowercase) |

3006

FLAG(QChar::Letter_Titlecase) |

3007

FLAG(QChar::Letter_Modifier) |

3008

FLAG(QChar::Letter_Other));

3009

yyCharClass->addSingleton(0x005f); // '_'

3010

return Tok_CharClass;

3011

case 'I':

3012

if (xmlSchemaExtensions) {

3013

yyCharClass->setNegative(!yyCharClass->negative());

// fall through

} else {

break;

}

case 'i':

if (xmlSchemaExtensions) {

3020

yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |

3021

FLAG(QChar::Mark_SpacingCombining) |

3022

FLAG(QChar::Mark_Enclosing) |

3023

FLAG(QChar::Number_DecimalDigit) |

3024

FLAG(QChar::Number_Letter) |

3025

FLAG(QChar::Number_Other) |

3026

FLAG(QChar::Letter_Uppercase) |

3027

FLAG(QChar::Letter_Lowercase) |

3028

FLAG(QChar::Letter_Titlecase) |

3029

FLAG(QChar::Letter_Modifier) |

3030

FLAG(QChar::Letter_Other));

3031

yyCharClass->addSingleton(0x003a); // ':'

3032

yyCharClass->addSingleton(0x005f); // '_'

3033

yyCharClass->addRange(0x0041, 0x005a); // [A-Z]

3034

yyCharClass->addRange(0x0061, 0x007a); // [a-z]

3035

yyCharClass->addRange(0xc0, 0xd6);

3036

yyCharClass->addRange(0xd8, 0xf6);

3037

yyCharClass->addRange(0xf8, 0x2ff);

3038

yyCharClass->addRange(0x370, 0x37d);

3039

yyCharClass->addRange(0x37f, 0x1fff);

3040

yyCharClass->addRange(0x200c, 0x200d);

3041

yyCharClass->addRange(0x2070, 0x218f);

3042

yyCharClass->addRange(0x2c00, 0x2fef);

3043

yyCharClass->addRange(0x3001, 0xd7ff);

3044

yyCharClass->addRange(0xf900, 0xfdcf);

3045

yyCharClass->addRange(0xfdf0, 0xfffd);

3046

yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);

3047

return Tok_CharClass;

} else {

break;

}

case 'C':

if (xmlSchemaExtensions) {

3053

yyCharClass->setNegative(!yyCharClass->negative());

// fall through

} else {

break;

}

case 'c':

if (xmlSchemaExtensions) {

3060

yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |

3061

FLAG(QChar::Mark_SpacingCombining) |

3062

FLAG(QChar::Mark_Enclosing) |

3063

FLAG(QChar::Number_DecimalDigit) |

3064

FLAG(QChar::Number_Letter) |

3065

FLAG(QChar::Number_Other) |

3066

FLAG(QChar::Letter_Uppercase) |

3067

FLAG(QChar::Letter_Lowercase) |

3068

FLAG(QChar::Letter_Titlecase) |

3069

FLAG(QChar::Letter_Modifier) |

3070

FLAG(QChar::Letter_Other));

3071

yyCharClass->addSingleton(0x002d); // '-'

3072

yyCharClass->addSingleton(0x002e); // '.'

3073

yyCharClass->addSingleton(0x003a); // ':'

3074

yyCharClass->addSingleton(0x005f); // '_'

3075

yyCharClass->addSingleton(0xb7);

3076

yyCharClass->addRange(0x0030, 0x0039); // [0-9]

3077

yyCharClass->addRange(0x0041, 0x005a); // [A-Z]

3078

yyCharClass->addRange(0x0061, 0x007a); // [a-z]

3079

yyCharClass->addRange(0xc0, 0xd6);

3080

yyCharClass->addRange(0xd8, 0xf6);

3081

yyCharClass->addRange(0xf8, 0x2ff);

3082

yyCharClass->addRange(0x370, 0x37d);

3083

yyCharClass->addRange(0x37f, 0x1fff);

3084

yyCharClass->addRange(0x200c, 0x200d);

3085

yyCharClass->addRange(0x2070, 0x218f);

3086

yyCharClass->addRange(0x2c00, 0x2fef);

3087

yyCharClass->addRange(0x3001, 0xd7ff);

3088

yyCharClass->addRange(0xf900, 0xfdcf);

3089

yyCharClass->addRange(0xfdf0, 0xfffd);

3090

yyCharClass->addRange((ushort)0x10000, (ushort)0xeffff);

3091

yyCharClass->addRange(0x0300, 0x036f);

3092

yyCharClass->addRange(0x203f, 0x2040);

3093

return Tok_CharClass;

} else {

break;

}

case 'P':

if (xmlSchemaExtensions) {

3099

yyCharClass->setNegative(!yyCharClass->negative());

// fall through

} else {

break;

}

case 'p':

if (xmlSchemaExtensions) {

3106

if (yyCh != '{') {

3107

error(RXERR_CHARCLASS);

3108

return Tok_CharClass;

}

QByteArray category;

yyCh = getChar();

while (yyCh != '}') {

3114

if (yyCh == EOS) {

3115

error(RXERR_END);

3116

return Tok_CharClass;

3117

}

3118

category.append(yyCh);

3119

yyCh = getChar();

3120

}

3121

yyCh = getChar(); // skip closing '}'

3122

3123

int catlen = category.length();

3124

if (catlen == 1 || catlen == 2) {

3125

switch (category.at(0)) {

3126

case 'M':

3127

if (catlen == 1) {

3128

yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing) |

3129

FLAG(QChar::Mark_SpacingCombining) |

3130

FLAG(QChar::Mark_Enclosing));

3131

} else {

3132

switch (category.at(1)) {

3133

case 'n': yyCharClass->addCategories(FLAG(QChar::Mark_NonSpacing)); break; // Mn

3134

case 'c': yyCharClass->addCategories(FLAG(QChar::Mark_SpacingCombining)); break; // Mc

3135

case 'e': yyCharClass->addCategories(FLAG(QChar::Mark_Enclosing)); break; // Me

3136

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'N':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit) |

3143

FLAG(QChar::Number_Letter) |

3144

FLAG(QChar::Number_Other));

3145

} else {

3146

switch (category.at(1)) {

3147

case 'd': yyCharClass->addCategories(FLAG(QChar::Number_DecimalDigit)); break; // Nd

3148

case 'l': yyCharClass->addCategories(FLAG(QChar::Number_Letter)); break; // Hl

3149

case 'o': yyCharClass->addCategories(FLAG(QChar::Number_Other)); break; // No

3150

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'Z':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Separator_Space) |

3157

FLAG(QChar::Separator_Line) |

3158

FLAG(QChar::Separator_Paragraph));

3159

} else {

3160

switch (category.at(1)) {

3161

case 's': yyCharClass->addCategories(FLAG(QChar::Separator_Space)); break; // Zs

3162

case 'l': yyCharClass->addCategories(FLAG(QChar::Separator_Line)); break; // Zl

3163

case 'p': yyCharClass->addCategories(FLAG(QChar::Separator_Paragraph)); break; // Zp

3164

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'C':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Other_Control) |

3171

FLAG(QChar::Other_Format) |

3172

FLAG(QChar::Other_Surrogate) |

3173

FLAG(QChar::Other_PrivateUse) |

3174

FLAG(QChar::Other_NotAssigned));

3175

} else {

3176

switch (category.at(1)) {

3177

case 'c': yyCharClass->addCategories(FLAG(QChar::Other_Control)); break; // Cc

3178

case 'f': yyCharClass->addCategories(FLAG(QChar::Other_Format)); break; // Cf

3179

case 's': yyCharClass->addCategories(FLAG(QChar::Other_Surrogate)); break; // Cs

3180

case 'o': yyCharClass->addCategories(FLAG(QChar::Other_PrivateUse)); break; // Co

3181

case 'n': yyCharClass->addCategories(FLAG(QChar::Other_NotAssigned)); break; // Cn

3182

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'L':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase) |

3189

FLAG(QChar::Letter_Lowercase) |

3190

FLAG(QChar::Letter_Titlecase) |

3191

FLAG(QChar::Letter_Modifier) |

3192

FLAG(QChar::Letter_Other));

3193

} else {

3194

switch (category.at(1)) {

3195

case 'u': yyCharClass->addCategories(FLAG(QChar::Letter_Uppercase)); break; // Lu

3196

case 'l': yyCharClass->addCategories(FLAG(QChar::Letter_Lowercase)); break; // Ll

3197

case 't': yyCharClass->addCategories(FLAG(QChar::Letter_Titlecase)); break; // Lt

3198

case 'm': yyCharClass->addCategories(FLAG(QChar::Letter_Modifier)); break; // Lm

3199

case 'o': yyCharClass->addCategories(FLAG(QChar::Letter_Other)); break; // Lo

3200

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'P':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector) |

3207

FLAG(QChar::Punctuation_Dash) |

3208

FLAG(QChar::Punctuation_Open) |

3209

FLAG(QChar::Punctuation_Close) |

3210

FLAG(QChar::Punctuation_InitialQuote) |

3211

FLAG(QChar::Punctuation_FinalQuote) |

3212

FLAG(QChar::Punctuation_Other));

3213

} else {

3214

switch (category.at(1)) {

3215

case 'c': yyCharClass->addCategories(FLAG(QChar::Punctuation_Connector)); break; // Pc

3216

case 'd': yyCharClass->addCategories(FLAG(QChar::Punctuation_Dash)); break; // Pd

3217

case 's': yyCharClass->addCategories(FLAG(QChar::Punctuation_Open)); break; // Ps

3218

case 'e': yyCharClass->addCategories(FLAG(QChar::Punctuation_Close)); break; // Pe

3219

case 'i': yyCharClass->addCategories(FLAG(QChar::Punctuation_InitialQuote)); break; // Pi

3220

case 'f': yyCharClass->addCategories(FLAG(QChar::Punctuation_FinalQuote)); break; // Pf

3221

case 'o': yyCharClass->addCategories(FLAG(QChar::Punctuation_Other)); break; // Po

3222

default: error(RXERR_CATEGORY); break;

}

}

break;

case 'S':

if (catlen == 1) {

yyCharClass->addCategories(FLAG(QChar::Symbol_Math) |

3229

FLAG(QChar::Symbol_Currency) |

3230

FLAG(QChar::Symbol_Modifier) |

3231

FLAG(QChar::Symbol_Other));

3232

} else {

3233

switch (category.at(1)) {

3234

case 'm': yyCharClass->addCategories(FLAG(QChar::Symbol_Math)); break; // Sm

3235

case 'c': yyCharClass->addCategories(FLAG(QChar::Symbol_Currency)); break; // Sc

3236

case 'k': yyCharClass->addCategories(FLAG(QChar::Symbol_Modifier)); break; // Sk

3237

case 'o': yyCharClass->addCategories(FLAG(QChar::Symbol_Other)); break; // So

3238

default: error(RXERR_CATEGORY); break;

}

}

break;

default:

error(RXERR_CATEGORY);

3244

break;

3245

}

3246

} else if (catlen > 2 && category.at(0) == 'I' && category.at(1) == 's') {

3247

static const int N = sizeof(categoriesRangeMap) / sizeof(categoriesRangeMap[0]);

3248

const char * const categoryFamily = category.constData() + 2;

3249

const CategoriesRangeMapEntry *r = std::lower_bound(categoriesRangeMap, categoriesRangeMap + N, categoryFamily);

3250

if (r != categoriesRangeMap + N && qstrcmp(r->name, categoryFamily) == 0)

3251

yyCharClass->addRange(r->first, r->second);

3252

else

3253

error(RXERR_CATEGORY);

3254

} else {

3255

error(RXERR_CATEGORY);

3256

}

3257

return Tok_CharClass;

} else {

break;

}

#endif

#ifndef QT_NO_REGEXP_ESCAPE

3263

case 'x':

3264

val = 0;

3265

for (i = 0; i < 4; i++) {

3266

low = QChar(yyCh).toLower().unicode();

3267

if (low >= '0' && low <= '9')

3268

val = (val << 4) | (low - '0');

3269

else if (low >= 'a' && low <= 'f')

3270

val = (val << 4) | (low - 'a' + 10);

else

break;

yyCh = getChar();

}

return Tok_Char | val;

#endif

default:

break;

}

if (prevCh >= '1' && prevCh <= '9') {

3281

#ifndef QT_NO_REGEXP_BACKREF

3282

val = prevCh - '0';

3283

while (yyCh >= '0' && yyCh <= '9') {

3284

val = (val * 10) + (yyCh - '0');

3285

yyCh = getChar();

3286

}

3287

return Tok_BackRef | val;

3288

#else

3289

error(RXERR_DISABLED);

3290

#endif

3291

}

3292

return Tok_Char | prevCh;

3293

}

3294

3295

#ifndef QT_NO_REGEXP_INTERVAL

3296

int QRegExpEngine::getRep(int def)

3297

{

3298

if (yyCh >= '0' && yyCh <= '9') {

3299

int rep = 0;

3300

do {

3301

rep = 10 * rep + yyCh - '0';

3302

if (rep >= InftyRep) {

3303

error(RXERR_REPETITION);

rep = def;

}

yyCh = getChar();

} while (yyCh >= '0' && yyCh <= '9');

return rep;

} else {

return def;

}

}

#endif

#ifndef QT_NO_REGEXP_LOOKAHEAD

3316

/*
    Advances the tokenizer by n characters. Non-positive n is a no-op.
*/
void QRegExpEngine::skipChars(int n)
{
    if (n <= 0)
        return;

    // Move n - 1 positions directly; the getChar() call supplies the last
    // step and refreshes yyCh.
    yyPos += n - 1;
    yyCh = getChar();
}

#endif

void QRegExpEngine::error(const char *msg)

3326

{

3327

if (yyError.isEmpty())

3328

yyError = QLatin1String(msg);

3329

}

3330

3331

/*
    Resets the tokenizer so that it scans the len characters starting at rx:
    positions are rewound, the first character is loaded into yyCh, a fresh
    character class is allocated, and repetition bounds and the error string
    are cleared.
*/
void QRegExpEngine::startTokenizer(const QChar *rx, int len)
{
    // Input buffer and cursor must be in place before the first getChar().
    yyIn = rx;
    yyLen = len;
    yyPos = 0;
    yyPos0 = 0;
    yyCh = getChar();

    // Per-token scratch state.
    yyCharClass.reset(new QRegExpCharClass);
    yyMinRep = yyMaxRep = 0;
    yyError = QString();
}

int QRegExpEngine::getToken()

3345

{

3346

#ifndef QT_NO_REGEXP_CCLASS

3347

ushort pendingCh = 0;

bool charPending;

bool rangePending;

int tok;

#endif

int prevCh = yyCh;

yyPos0 = yyPos - 1;

#ifndef QT_NO_REGEXP_CCLASS

3356

yyCharClass->clear();

#endif

yyMinRep = 0;

yyMaxRep = 0;

yyCh = getChar();

switch (prevCh) {

case EOS:

yyPos0 = yyPos;

return Tok_Eos;

case '$':

return Tok_Dollar;

case '(':

if (yyCh == '?') {

prevCh = getChar();

yyCh = getChar();

switch (prevCh) {

#ifndef QT_NO_REGEXP_LOOKAHEAD

3374

case '!':

3375

return Tok_NegLookahead;

3376

case '=':

3377

return Tok_PosLookahead;

3378

#endif

3379

case ':':

3380

return Tok_MagicLeftParen;

3381

case '<':

3382

error(RXERR_LOOKBEHIND);

3383

return Tok_MagicLeftParen;

3384

default:

3385

error(RXERR_LOOKAHEAD);

3386

return Tok_MagicLeftParen;

3387

}

3388

} else {

3389

return Tok_LeftParen;

3390

}

3391

case ')':

3392

return Tok_RightParen;

case '*':

yyMinRep = 0;

yyMaxRep = InftyRep;

return Tok_Quantifier;

case '+':

yyMinRep = 1;

yyMaxRep = InftyRep;

return Tok_Quantifier;

3401

case '.':

3402

#ifndef QT_NO_REGEXP_CCLASS

3403

yyCharClass->setNegative(true);

3404

#endif

3405

return Tok_CharClass;

case '?':

yyMinRep = 0;

yyMaxRep = 1;

return Tok_Quantifier;

3410

case '[':

3411

#ifndef QT_NO_REGEXP_CCLASS

3412

if (yyCh == '^') {

3413

yyCharClass->setNegative(true);

yyCh = getChar();

}

charPending = false;

rangePending = false;

3418

do {

3419

if (yyCh == '-' && charPending && !rangePending) {

rangePending = true;

yyCh = getChar();

} else {

if (charPending && !rangePending) {

3424

yyCharClass->addSingleton(pendingCh);

charPending = false;

}

if (yyCh == '\\') {

yyCh = getChar();

tok = getEscape();

if (tok == Tok_Word)

tok = '\b';

} else {

tok = Tok_Char | yyCh;

3434

yyCh = getChar();

3435

}

3436

if (tok == Tok_CharClass) {

3437

if (rangePending) {

3438

yyCharClass->addSingleton('-');

3439

yyCharClass->addSingleton(pendingCh);

3440

charPending = false;

3441

rangePending = false;

3442

}

3443

} else if ((tok & Tok_Char) != 0) {

3444

if (rangePending) {

3445

yyCharClass->addRange(pendingCh, tok ^ Tok_Char);

3446

charPending = false;

3447

rangePending = false;

3448

} else {

3449

pendingCh = tok ^ Tok_Char;

charPending = true;

}

} else {

error(RXERR_CHARCLASS);

3454

}

3455

}

3456

} while (yyCh != ']' && yyCh != EOS);

3457

if (rangePending)

3458

yyCharClass->addSingleton('-');

3459

if (charPending)

3460

yyCharClass->addSingleton(pendingCh);

if (yyCh == EOS)

error(RXERR_END);

else

yyCh = getChar();

return Tok_CharClass;

3466

#else

3467

error(RXERR_END);

3468

return Tok_Char | '[';

#endif

case '\\':

return getEscape();

case ']':

error(RXERR_LEFTDELIM);

3474

return Tok_Char | ']';

case '^':

return Tok_Caret;

case '{':

#ifndef QT_NO_REGEXP_INTERVAL

3479

yyMinRep = getRep(0);

yyMaxRep = yyMinRep;

if (yyCh == ',') {

yyCh = getChar();

yyMaxRep = getRep(InftyRep);

3484

}

3485

if (yyMaxRep < yyMinRep)

3486

error(RXERR_INTERVAL);

3487

if (yyCh != '}')

3488

error(RXERR_REPETITION);

3489

yyCh = getChar();

3490

return Tok_Quantifier;

3491

#else

3492

error(RXERR_DISABLED);

3493

return Tok_Char | '{';

#endif

case '|':

return Tok_Bar;

case '}':

error(RXERR_LEFTDELIM);

3499

return Tok_Char | '}';

3500

default:

3501

return Tok_Char | prevCh;

}

}

/*
    Parses the len characters at pattern and builds the automaton for them.

    The tokenizer is restarted on the pattern, the whole expression is parsed
    into a middle box, and that box is concatenated between an initial and a
    final box. Afterwards the capture slots of the atoms are numbered, extra
    slots are appended for back-references without a matching capture, the
    caretAnchored flag is computed, and anchors whose value is 0 are removed
    from every state.

    Returns -1 if an error was recorded in yyError; otherwise returns yyPos0.
*/
int QRegExpEngine::parse(const QChar *pattern, int len)

3506

{

3507

valid = true;

3508

startTokenizer(pattern, len);

3509

yyTok = getToken();

3510

#ifndef QT_NO_REGEXP_CAPTURE

3511

yyMayCapture = true;

3512

#else

3513

yyMayCapture = false;

3514

#endif

3515

3516

#ifndef QT_NO_REGEXP_CAPTURE

3517

// Atom wrapping the whole pattern; it is closed again after parseExpression().
int atom = startAtom(false);

3518

#endif

3519

QRegExpCharClass anything;

3520

Box box(this); // create InitialState

3521

box.set(anything);

3522

Box rightBox(this); // create FinalState

3523

rightBox.set(anything);

3524

3525

Box middleBox(this);

3526

parseExpression(&middleBox);

3527

#ifndef QT_NO_REGEXP_CAPTURE

3528

finishAtom(atom, false);

3529

#endif

3530

#ifndef QT_NO_REGEXP_OPTIM

3531

middleBox.setupHeuristics();

#endif

// Stitch the pieces together: initial box, parsed expression, final box.
box.cat(middleBox);

box.cat(rightBox);

// The tokenizer's scratch character class is no longer needed.
yyCharClass.reset(0);

3536

3537

#ifndef QT_NO_REGEXP_CAPTURE

3538

// Replace the capture kind stored in each atom by its capture slot number.
for (int i = 0; i < nf; ++i) {

3539

switch (f[i].capture) {

3540

case QRegExpAtom::NoCapture:

3541

break;

3542

case QRegExpAtom::OfficialCapture:

3543

f[i].capture = ncap;

3544

captureForOfficialCapture.append(ncap);

++ncap;

++officialncap;

break;

case QRegExpAtom::UnofficialCapture:

3549

// Unofficial captures only get a slot when greedyQuantifiers is set.
f[i].capture = greedyQuantifiers ? ncap++ : QRegExpAtom::NoCapture;

}

}

#ifndef QT_NO_REGEXP_BACKREF

3554

#ifndef QT_NO_REGEXP_OPTIM

3555

// Without official captures and back-references the atom table is dropped.
if (officialncap == 0 && nbrefs == 0) {

ncap = nf = 0;

f.clear();

}

#endif

// handle the case where there's a \5 with no corresponding capture

3561

// (captureForOfficialCapture.size() != officialncap)

3562

for (int i = 0; i < nbrefs - officialncap; ++i) {

3563

captureForOfficialCapture.append(ncap);

++ncap;

}

#endif

#endif

// Any error recorded while tokenizing or parsing makes the parse fail.
if (!yyError.isEmpty())

3570

return -1;

3571

3572

#ifndef QT_NO_REGEXP_OPTIM

3573

const QRegExpAutomatonState &sinit = s.at(InitialState);

3574

// caretAnchored stays true only if the initial state has anchors and every
// one of them contains Anchor_Caret (and, when anchor alternation is
// compiled in, none contains Anchor_Alternation).
caretAnchored = !sinit.anchors.isEmpty();

3575

if (caretAnchored) {

3576

const QMap<int, int> &anchors = sinit.anchors;

3577

QMap<int, int>::const_iterator a;

3578

for (a = anchors.constBegin(); a != anchors.constEnd(); ++a) {

3579

if (

3580

#ifndef QT_NO_REGEXP_ANCHOR_ALT

3581

(*a & Anchor_Alternation) != 0 ||

3582

#endif

3583

(*a & Anchor_Caret) == 0)

3584

{

3585

caretAnchored = false;

break;

}

}

}

#endif

// cleanup anchors

int numStates = s.count();

3594

for (int i = 0; i < numStates; ++i) {

3595

QRegExpAutomatonState &state = s[i];

3596

if (!state.anchors.isEmpty()) {

3597

QMap<int, int>::iterator a = state.anchors.begin();

3598

while (a != state.anchors.end()) {

3599

// erase() returns the iterator following the removed entry.
if (a.value() == 0)

3600

a = state.anchors.erase(a);

else

++a;

}

}

}

return yyPos0;

}

/*
    Parses a single atom described by the current token yyTok into box and
    then fetches the next token.

    A literal character is stored directly. Every other token clears the
    trivial flag (when optimizations are compiled in) and is handled by kind:
    anchors are appended to the box, parenthesized groups recurse into
    parseExpression(), lookaheads are parsed by a nested engine, character
    classes and back-references are stored in the box, and a quantifier with
    nothing to repeat is an error.
*/
void QRegExpEngine::parseAtom(Box *box)
{
    // Literal character: nothing else to decide.
    if ((yyTok & Tok_Char) != 0) {
        box->set(QChar(yyTok ^ Tok_Char));
        yyTok = getToken();
        return;
    }

#ifndef QT_NO_REGEXP_OPTIM
    trivial = false;
#endif

    switch (yyTok) {
    case Tok_CharClass:
        box->set(*yyCharClass);
        break;
    case Tok_Caret:
        box->catAnchor(Anchor_Caret);
        break;
    case Tok_Dollar:
        box->catAnchor(Anchor_Dollar);
        break;
#ifndef QT_NO_REGEXP_ESCAPE
    case Tok_NonWord:
        box->catAnchor(Anchor_NonWord);
        break;
    case Tok_Word:
        box->catAnchor(Anchor_Word);
        break;
#endif
    case Tok_LeftParen:
    case Tok_MagicLeftParen:
        yyTok = getToken();
        parseExpression(box);
        if (yyTok != Tok_RightParen)
            error(RXERR_END);
        break;
#ifndef QT_NO_REGEXP_LOOKAHEAD
    case Tok_PosLookahead:
    case Tok_NegLookahead: {
        const bool negated = (yyTok == Tok_NegLookahead);
        // The lookahead body is parsed by its own engine, starting at the
        // tokenizer's current character.
        QRegExpEngine *sub = new QRegExpEngine(cs, greedyQuantifiers);
        const int consumed = sub->parse(yyIn + yyPos - 1, yyLen - yyPos + 1);
        if (consumed < 0)
            error(RXERR_LOOKAHEAD);
        else
            skipChars(consumed);
        box->catAnchor(addLookahead(sub, negated));
        yyTok = getToken();
        if (yyTok != Tok_RightParen)
            error(RXERR_LOOKAHEAD);
        break;
    }
#endif
    case Tok_Quantifier:
        error(RXERR_REPETITION);
        break;
    default:
#ifndef QT_NO_REGEXP_BACKREF
        if ((yyTok & Tok_BackRef) != 0) {
            box->set(yyTok ^ Tok_BackRef);
            break;
        }
#endif
        error(RXERR_DISABLED);
        break;
    }

    yyTok = getToken();
}

/*
    Parses one factor, i.e. an atom (see parseAtom()) optionally followed by
    a quantifier token, and leaves the result in box.

    When interval support is compiled in, the tokenizer state is saved before
    the atom is parsed. YYREDO() restores that state so that the same atom
    can be parsed again, once for every additional repetition a bounded
    quantifier asks for.
*/
void QRegExpEngine::parseFactor(Box *box)

3681

{

3682

#ifndef QT_NO_REGEXP_CAPTURE

3683

// outerAtom is only opened when greedyQuantifiers is set; it is closed at
// the very end of this function.
int outerAtom = greedyQuantifiers ? startAtom(false) : -1;

3684

int innerAtom = startAtom(yyMayCapture && yyTok == Tok_LeftParen);

3685

bool magicLeftParen = (yyTok == Tok_MagicLeftParen);

3686

#else

3687

const int innerAtom = -1;

3688

#endif

3689

3690

#ifndef QT_NO_REGEXP_INTERVAL

3691

// YYREDO() rewinds the tokenizer to the snapshot taken in the locals below
// (in, pos0, pos, len, ch, charClass, tok) and resets the repetition bounds.
#define YYREDO() \

3692

yyIn = in, yyPos0 = pos0, yyPos = pos, yyLen = len, yyCh = ch, \

3693

*yyCharClass = charClass, yyMinRep = 0, yyMaxRep = 0, yyTok = tok

3694

3695

const QChar *in = yyIn;

int pos0 = yyPos0;

int pos = yyPos;

int len = yyLen;

int ch = yyCh;

QRegExpCharClass charClass;

3701

// The character class is only copied when the current token is one.
if (yyTok == Tok_CharClass)

3702

charClass = *yyCharClass;

3703

int tok = yyTok;

3704

bool mayCapture = yyMayCapture;

#endif

parseAtom(box);

#ifndef QT_NO_REGEXP_CAPTURE

3709

finishAtom(innerAtom, magicLeftParen);

3710

#endif

3711

3712

bool hasQuantifier = (yyTok == Tok_Quantifier);

3713

if (hasQuantifier) {

3714

#ifndef QT_NO_REGEXP_OPTIM

3715

trivial = false;

3716

#endif

3717

// An unbounded maximum loops the box; a maximum of 0 empties it.
if (yyMaxRep == InftyRep) {

3718

box->plus(innerAtom);

3719

#ifndef QT_NO_REGEXP_INTERVAL

3720

} else if (yyMaxRep == 0) {

box->clear();

#endif

}

if (yyMinRep == 0)

box->opt();

#ifndef QT_NO_REGEXP_INTERVAL

3728

// The re-parsed copies below are parsed with yyMayCapture switched off.
yyMayCapture = false;

3729

// alpha: extra mandatory copies of the atom (minimum - 1, or 0).
// beta: extra optional copies (0 when the maximum is InftyRep).
int alpha = (yyMinRep == 0) ? 0 : yyMinRep - 1;

3730

int beta = (yyMaxRep == InftyRep) ? 0 : yyMaxRep - (alpha + 1);

Box rightBox(this);

int i;

// Optional copies: each one is followed by the chain built so far and then
// made optional as a whole.
for (i = 0; i < beta; i++) {

YYREDO();

Box leftBox(this);

parseAtom(&leftBox);

leftBox.cat(rightBox);

leftBox.opt();

rightBox = leftBox;

}

// Mandatory copies are placed in front of the optional chain.
for (i = 0; i < alpha; i++) {

YYREDO();

Box leftBox(this);

parseAtom(&leftBox);

leftBox.cat(rightBox);

rightBox = leftBox;

}

// The originally parsed atom goes last.
rightBox.cat(*box);

*box = rightBox;

#endif

yyTok = getToken();

#ifndef QT_NO_REGEXP_INTERVAL

3755

yyMayCapture = mayCapture;

#endif

}

#undef YYREDO

#ifndef QT_NO_REGEXP_CAPTURE

3760

if (greedyQuantifiers)

3761

finishAtom(outerAtom, hasQuantifier);

#endif

}

/*
    Parses a term: a possibly empty sequence of factors, ended by the end of
    the pattern, a closing parenthesis or an alternation bar. The factors are
    concatenated into box.
*/
void QRegExpEngine::parseTerm(Box *box)
{
#ifndef QT_NO_REGEXP_OPTIM
    // With optimizations compiled in, the first factor is parsed straight
    // into box instead of into a temporary.
    if (!(yyTok == Tok_Eos || yyTok == Tok_RightParen || yyTok == Tok_Bar))
        parseFactor(box);
#endif
    for (;;) {
        if (yyTok == Tok_Eos || yyTok == Tok_RightParen || yyTok == Tok_Bar)
            break;
        Box factor(this);
        parseFactor(&factor);
        box->cat(factor);
    }
}

/*
    Parses an expression: one term followed by any number of '|' term
    alternatives, each of which is merged into box with orx().
*/
void QRegExpEngine::parseExpression(Box *box)
{
    parseTerm(box);
    for (;;) {
        if (yyTok != Tok_Bar)
            return;
#ifndef QT_NO_REGEXP_OPTIM
        trivial = false;
#endif
        Box alternative(this);
        yyTok = getToken(); // step over the '|'
        parseTerm(&alternative);
        box->orx(alternative);
    }
}

/*
  The struct QRegExpPrivate contains the private data of a regular
  expression other than the automaton. It makes it possible for many
  QRegExp objects to use the same QRegExpEngine object with different
  QRegExpPrivate objects.
*/

3797

3798

struct QRegExpPrivate

3799

{

3800

QRegExpEngine *eng;

3801

QRegExpEngineKey engineKey;

3802

bool minimal;

3803

#ifndef QT_NO_REGEXP_CAPTURE

3804

QString t; // last string passed to QRegExp::indexIn() or lastIndexIn()

3805

QStringList capturedCache; // what QRegExp::capturedTexts() returned last

3806

#endif

3807

QRegExpMatchState matchState;

3808

3809

inline QRegExpPrivate()

3810

: eng(0), engineKey(QString(), QRegExp::RegExp, Qt::CaseSensitive), minimal(false) { }

3811

inline QRegExpPrivate(const QRegExpEngineKey &key)

3812

: eng(0), engineKey(key), minimal(false) {}

3813

};

3814

3815

// Cache of engines that are no longer referenced, keyed by QRegExpEngineKey.
// derefEngine() inserts an engine here when its reference count drops to
// zero, and prepareEngine_helper() takes a matching engine back out instead
// of building a new one. Both lock globalEngineCacheMutex while touching it.
#if !defined(QT_NO_REGEXP_OPTIM)

3816

typedef QCache<QRegExpEngineKey, QRegExpEngine> EngineCache;

3817

Q_GLOBAL_STATIC(EngineCache, globalEngineCache)

3818

static QBasicMutex globalEngineCacheMutex;

3819

#endif // QT_NO_REGEXP_OPTIM

3820

3821

/*
    Drops one reference to eng. When that was the last reference the engine
    is either handed to the global engine cache under key (optimized builds
    with a live cache) or deleted.
*/
static void derefEngine(QRegExpEngine *eng, const QRegExpEngineKey &key)
{
    // Somebody else still uses the engine.
    if (eng->ref.deref())
        return;

#if !defined(QT_NO_REGEXP_OPTIM)
    if (!globalEngineCache()) {
        delete eng;
        return;
    }

    QMutexLocker locker(&globalEngineCacheMutex);
    QT_TRY {
        // The cost passed to the cache grows with the pattern length.
        globalEngineCache()->insert(key, eng, 4 + key.pattern.length() / 4);
    } QT_CATCH(const std::bad_alloc &) {
        // in case of an exception (e.g. oom), just delete the engine
        delete eng;
    }
#else
    Q_UNUSED(key);
    delete eng;
#endif
}

/*
    Makes sure priv has an engine. An engine matching priv->engineKey is
    taken from the global cache when one is available (optimized builds);
    otherwise a new engine is built from the key. If priv had no engine on
    entry, its match state is prepared for the engine it ends up with.
*/
static void prepareEngine_helper(QRegExpPrivate *priv)
{
    const bool hadNoEngine = !priv->eng;

#if !defined(QT_NO_REGEXP_OPTIM)
    if (hadNoEngine && globalEngineCache()) {
        QMutexLocker locker(&globalEngineCacheMutex);
        priv->eng = globalEngineCache()->take(priv->engineKey);
        // An engine taken from the cache gains priv as a new owner.
        if (priv->eng != 0)
            priv->eng->ref.ref();
    }
#endif // QT_NO_REGEXP_OPTIM

    if (!priv->eng)
        priv->eng = new QRegExpEngine(priv->engineKey);

    if (hadNoEngine)
        priv->matchState.prepareForMatch(priv->eng);
}

3861

3862

// Fast path in front of prepareEngine_helper(): does nothing when priv
// already has an engine.
inline static void prepareEngine(QRegExpPrivate *priv)
{
    if (!priv->eng)
        prepareEngine_helper(priv);
}

3868

3869

/*
    Gets priv ready for matching against str: ensures an engine exists,
    prepares the match state for it and, when captures are compiled in,
    remembers str and discards the cached captured texts.
*/
static void prepareEngineForMatch(QRegExpPrivate *priv, const QString &str)
{
    prepareEngine(priv);
    priv->matchState.prepareForMatch(priv->eng);
#ifdef QT_NO_REGEXP_CAPTURE
    Q_UNUSED(str);
#else
    priv->capturedCache.clear();
    priv->t = str;
#endif
}

/*
    Detaches priv from its engine, if it has one: the reference is released
    through derefEngine(), the engine pointer is cleared and the match state
    is drained.
*/
static void invalidateEngine(QRegExpPrivate *priv)
{
    if (priv->eng == 0)
        return;

    derefEngine(priv->eng, priv->engineKey);
    priv->eng = 0;
    priv->matchState.drain();
}

/*!
    \enum QRegExp::CaretMode

    The CaretMode enum defines the different meanings of the caret
    (\b{^}) in a regular expression. The possible values are:

    \value CaretAtZero
           The caret corresponds to index 0 in the searched string.

    \value CaretAtOffset
           The caret corresponds to the start offset of the search.

    \value CaretWontMatch
           The caret never matches.
*/

/*!
    \enum QRegExp::PatternSyntax

    The syntax used to interpret the meaning of the pattern.

    \value RegExp A rich Perl-like pattern matching syntax. This is
    the default.

    \value RegExp2 Like RegExp, but with \l{greedy quantifiers}.
    (Introduced in Qt 4.2.)

    \value Wildcard This provides a simple pattern matching syntax
    similar to that used by shells (command interpreters) for "file
    globbing". See \l{QRegExp wildcard matching}.

    \value WildcardUnix This is similar to Wildcard but with the
    behavior of a Unix shell. The wildcard characters can be escaped
    with the character "\\".

    \value FixedString The pattern is a fixed string. This is
    equivalent to using the RegExp pattern on a string in
    which all metacharacters are escaped using escape().

    \value W3CXmlSchema11 The pattern is a regular expression as
    defined by the W3C XML Schema 1.1 specification.

    \sa setPatternSyntax()
*/

/*!
    Constructs an empty regexp.

    \sa isValid(), errorString()
*/

QRegExp::QRegExp()
    : priv(new QRegExpPrivate)
{
    // Attach an engine right away, even for the empty pattern.
    prepareEngine(priv);
}

/*!
    Constructs a regular expression object for the given \a pattern
    string. The pattern must be given using wildcard notation if \a
    syntax is \l Wildcard; the default is \l RegExp. The pattern is
    case sensitive, unless \a cs is Qt::CaseInsensitive. Matching is
    greedy (maximal), but can be changed by calling
    setMinimal().

    \sa setPattern(), setCaseSensitivity(), setPatternSyntax()
*/

3955

3956

QRegExp::QRegExp(const QString &pattern, Qt::CaseSensitivity cs, PatternSyntax syntax)
    : priv(new QRegExpPrivate(QRegExpEngineKey(pattern, syntax, cs)))
{
    // Look up or build the engine for this key immediately.
    prepareEngine(priv);
}

/*!
    Constructs a regular expression as a copy of \a rx.

    \sa operator=()
*/

QRegExp::QRegExp(const QRegExp &rx)
    : priv(new QRegExpPrivate)
{
    // Start from a default private object and let assignment do the copy.
    *this = rx;
}

/*!
    Destroys the regular expression and cleans up its internal data.
*/

QRegExp::~QRegExp()
{
    QRegExpPrivate *const d = priv;

    // Give up this object's engine reference before freeing the private data.
    invalidateEngine(d);
    delete d;
}

/*!
    Copies the regular expression \a rx and returns a reference to the
    copy. The case sensitivity, wildcard, and minimal matching options
    are also copied.
*/

3986

3987

QRegExp &QRegExp::operator=(const QRegExp &rx)
{
    QRegExpPrivate *const src = rx.priv;

    prepareEngine(src); // to allow sharing
    QRegExpEngine *const shared = src->eng;
    // Take the new reference before releasing the old engine, so that
    // assigning an object to itself cannot drop the engine in between.
    if (shared != 0)
        shared->ref.ref();

    invalidateEngine(priv);
    priv->eng = shared;
    priv->minimal = src->minimal;
    priv->engineKey = src->engineKey;
#ifndef QT_NO_REGEXP_CAPTURE
    priv->capturedCache = src->capturedCache;
    priv->t = src->t;
#endif
    if (shared != 0)
        priv->matchState.prepareForMatch(shared);
    priv->matchState.captured = src->matchState.captured;
    return *this;
}

/*!
    \fn QRegExp &QRegExp::operator=(QRegExp &&other)

    Move-assigns \a other to this QRegExp instance.

    \since 5.2
*/

/*!
    \fn void QRegExp::swap(QRegExp &other)
    \since 4.8

    Swaps regular expression \a other with this regular
    expression. This operation is very fast and never fails.
*/

/*!
    Returns \c true if this regular expression is equal to \a rx;
    otherwise returns \c false.

    Two QRegExp objects are equal if they have the same pattern
    strings and the same settings for case sensitivity, wildcard and
    minimal matching.
*/

4030

4031

bool QRegExp::operator==(const QRegExp &rx) const

4032

{

4033

return priv->engineKey == rx.priv->engineKey && priv->minimal == rx.priv->minimal;

}

/*!
    \since 5.6
    \relates QRegExp

    Returns the hash value for \a key, using
    \a seed to seed the calculation.
*/

4042

4043

uint qHash(const QRegExp &key, uint seed) Q_DECL_NOTHROW
{
    // Fold in the same two fields that operator==() compares.
    QtPrivate::QHashCombine combine;
    const uint withKey = combine(seed, key.priv->engineKey);
    return combine(withKey, key.priv->minimal);
}

/*!
    \fn bool QRegExp::operator!=(const QRegExp &rx) const

    Returns \c true if this regular expression is not equal to \a rx;
    otherwise returns \c false.

    \sa operator==()
*/

/*!
    Returns \c true if the pattern string is empty; otherwise returns
    \c false.

    If you call exactMatch() with an empty pattern on an empty string
    it will return \c true; otherwise it returns \c false since it operates
    over the whole string. If you call indexIn() with an empty pattern
    on \e any string it will return the start offset (0 by default)
    because the empty pattern matches the 'emptiness' at the start of
    the string. In this case the length of the match returned by
    matchedLength() will be 0.

    See QString::isEmpty().
*/

4073

4074

4075

bool QRegExp::isEmpty() const

4076

{

4077

return priv->engineKey.pattern.isEmpty();

}

/*!
    Returns \c true if the regular expression is valid; otherwise returns
    \c false. An invalid regular expression never matches.

    The pattern \b{[a-z} is an example of an invalid pattern, since
    it lacks a closing square bracket.

    Note that the validity of a regexp may also depend on the setting
    of the wildcard flag, for example \b{*.html} is a valid
    wildcard regexp but an invalid full regexp.

    \sa errorString()
*/

bool QRegExp::isValid() const

4094

{

4095

if (priv->engineKey.pattern.isEmpty()) {

return true;

} else {

prepareEngine(priv);

return priv->eng->isValid();

}

}

/*!
    Returns the pattern string of the regular expression. The pattern
    has either regular expression syntax or wildcard syntax, depending
    on patternSyntax().

    \sa patternSyntax(), caseSensitivity()
*/

4109

4110

QString QRegExp::pattern() const
{
    // The pattern text lives in the engine key.
    const QString &text = priv->engineKey.pattern;
    return text;
}

/*!
    Sets the pattern string to \a pattern. The case sensitivity,
    wildcard, and minimal matching options are not changed.

    \sa setPatternSyntax(), setCaseSensitivity()
*/

4120

4121

void QRegExp::setPattern(const QString &pattern)

4122

{

4123

if (priv->engineKey.pattern != pattern) {

4124

invalidateEngine(priv);

4125

priv->engineKey.pattern = pattern;

}

}

/*!

Returns Qt::CaseSensitive if the regexp is matched case

4131

sensitively; otherwise returns Qt::CaseInsensitive.

4132

4133

\sa patternSyntax(), pattern(), isMinimal()

4134

4135

Qt::CaseSensitivity QRegExp::caseSensitivity() const

4136

{

4137

return priv->engineKey.cs;

}

/*!

Sets case sensitive matching to \a cs.

4142

4143

If \a cs is Qt::CaseSensitive, \b{\\.txt$} matches

4144

\c{readme.txt} but not \c{README.TXT}.

4145

4146

\sa setPatternSyntax(), setPattern(), setMinimal()

4147

4148

void QRegExp::setCaseSensitivity(Qt::CaseSensitivity cs)

4149

{

4150

if ((bool)cs != (bool)priv->engineKey.cs) {

4151

invalidateEngine(priv);

4152

priv->engineKey.cs = cs;

}

}

/*!

Returns the syntax used by the regular expression. The default is

4158

QRegExp::RegExp.

4159

4160

\sa pattern(), caseSensitivity()

4161

4162

QRegExp::PatternSyntax QRegExp::patternSyntax() const

4163

{

4164

return priv->engineKey.patternSyntax;

}

/*!

Sets the syntax mode for the regular expression. The default is

4169

QRegExp::RegExp.

4170

4171

Setting \a syntax to QRegExp::Wildcard enables simple shell-like

4172

\l{QRegExp wildcard matching}. For example, \b{r*.txt} matches the

4173

string \c{readme.txt} in wildcard mode, but does not match

4174

\c{readme}.

4175

4176

Setting \a syntax to QRegExp::FixedString means that the pattern

4177

is interpreted as a plain string. Special characters (e.g.,

4178

backslash) don't need to be escaped then.

4179

4180

\sa setPattern(), setCaseSensitivity(), escape()

4181

4182

void QRegExp::setPatternSyntax(PatternSyntax syntax)

4183

{

4184

if (syntax != priv->engineKey.patternSyntax) {

4185

invalidateEngine(priv);

4186

priv->engineKey.patternSyntax = syntax;

}

}

/*!

Returns \c true if minimal (non-greedy) matching is enabled;

4192

otherwise returns \c false.

4193

4194

\sa caseSensitivity(), setMinimal()

4195

4196

bool QRegExp::isMinimal() const

4197

{

4198

return priv->minimal;

}

/*!

Enables or disables minimal matching. If \a minimal is false,

4203

matching is greedy (maximal) which is the default.

4204

4205

For example, suppose we have the input string "We must be

4206

bold, very bold!" and the pattern

4207

\b{.*}. With the default greedy (maximal) matching,

4208

the match is "We must be \underline{bold, very

4209

bold}!". But with minimal (non-greedy) matching, the

4210

first match is: "We must be \underline{bold}, very

4211

bold!" and the second match is "We must be bold,

4212

very \underline{bold}!". In practice we might use the pattern

4213

\b{[^<]*\} instead, although this will still fail for

4214

nested tags.

4215

4216

\sa setCaseSensitivity()

4217

4218

void QRegExp::setMinimal(bool minimal)

4219

{

4220

priv->minimal = minimal;

4221

}

4222

4223

// ### Qt 5: make non-const

4224

/*!

4225

Returns \c true if \a str is matched exactly by this regular

4226

expression; otherwise returns \c false. You can determine how much of

4227

the string was matched by calling matchedLength().

4228

4229

For a given regexp string R, exactMatch("R") is the equivalent of

4230

indexIn("^R$") since exactMatch() effectively encloses the regexp

4231

in the start of string and end of string anchors, except that it

4232

sets matchedLength() differently.

4233

4234

For example, if the regular expression is \b{blue}, then

4235

exactMatch() returns \c true only for input \c blue. For inputs \c

4236

bluebell, \c blutak and \c lightblue, exactMatch() returns \c false

4237

and matchedLength() will return 4, 3 and 0 respectively.

4238

4239

Although const, this function sets matchedLength(),

4240

capturedTexts(), and pos().

4241

4242

\sa indexIn(), lastIndexIn()

4243

4244

bool QRegExp::exactMatch(const QString &str) const

4245

{

4246

prepareEngineForMatch(priv, str);

4247

priv->matchState.match(str.unicode(), str.length(), 0, priv->minimal, true, 0);

4248

if (priv->matchState.captured[1] == str.length()) {

4249

return true;

4250

} else {

4251

priv->matchState.captured[0] = 0;

4252

priv->matchState.captured[1] = priv->matchState.oneTestMatchedLen;

return false;

}

}

// ### Qt 5: make non-const

4258

/*!

4259

Attempts to find a match in \a str from position \a offset (0 by

4260

default). If \a offset is -1, the search starts at the last

4261

character; if -2, at the next to last character; etc.

4262

4263

Returns the position of the first match, or -1 if there was no

4264

match.

4265

4266

The \a caretMode parameter can be used to instruct whether \b{^}

4267

should match at index 0 or at \a offset.

4268

4269

You might prefer to use QString::indexOf(), QString::contains(),

4270

or even QStringList::filter(). To replace matches use

QString::replace().

Example:

\snippet code/src_corelib_tools_qregexp.cpp 13

4275

4276

Although const, this function sets matchedLength(),

4277

capturedTexts() and pos().

4278

4279

If the QRegExp is a wildcard expression (see setPatternSyntax())

4280

and want to test a string against the whole wildcard expression,

4281

use exactMatch() instead of this function.

4282

4283

\sa lastIndexIn(), exactMatch()

4284

4285

4286

int QRegExp::indexIn(const QString &str, int offset, CaretMode caretMode) const

4287

{

4288

prepareEngineForMatch(priv, str);

4289

if (offset < 0)

4290

offset += str.length();

4291

priv->matchState.match(str.unicode(), str.length(), offset,

4292

priv->minimal, false, caretIndex(offset, caretMode));

4293

return priv->matchState.captured[0];

4294

}

4295

4296

// ### Qt 5: make non-const

4297

/*!

4298

Attempts to find a match backwards in \a str from position \a

4299

offset. If \a offset is -1 (the default), the search starts at the

4300

last character; if -2, at the next to last character; etc.

4301

4302

Returns the position of the first match, or -1 if there was no

4303

match.

4304

4305

The \a caretMode parameter can be used to instruct whether \b{^}

4306

should match at index 0 or at \a offset.

4307

4308

Although const, this function sets matchedLength(),

4309

capturedTexts() and pos().

4310

4311

\warning Searching backwards is much slower than searching

4312

forwards.

4313

4314

\sa indexIn(), exactMatch()

4315

4316

4317

int QRegExp::lastIndexIn(const QString &str, int offset, CaretMode caretMode) const

4318

{

4319

prepareEngineForMatch(priv, str);

4320

if (offset < 0)

4321

offset += str.length();

4322

if (offset < 0 || offset > str.length()) {

4323

memset(priv->matchState.captured, -1, priv->matchState.capturedSize*sizeof(int));

return -1;

}

while (offset >= 0) {

4328

priv->matchState.match(str.unicode(), str.length(), offset,

4329

priv->minimal, true, caretIndex(offset, caretMode));

4330

if (priv->matchState.captured[0] == offset)

return offset;

--offset;

}

return -1;

}

/*!

Returns the length of the last matched string, or -1 if there was

4339

no match.

4340

4341

\sa exactMatch(), indexIn(), lastIndexIn()

4342

4343

int QRegExp::matchedLength() const

4344

{

4345

return priv->matchState.captured[1];

4346

}

4347

4348

#ifndef QT_NO_REGEXP_CAPTURE

/*!

\since 4.6

Returns the number of captures contained in the regular expression.

4353

4354

int QRegExp::captureCount() const

4355

{

4356

prepareEngine(priv);

4357

return priv->eng->captureCount();

}

/*!

Returns a list of the captured text strings.

4362

4363

The first string in the list is the entire matched string. Each

4364

subsequent list element contains a string that matched a

4365

(capturing) subexpression of the regexp.

4366

4367

For example:

4368

\snippet code/src_corelib_tools_qregexp.cpp 14

4369

4370

The above example also captures elements that may be present but

4371

which we have no interest in. This problem can be solved by using

4372

non-capturing parentheses:

4373

4374

\snippet code/src_corelib_tools_qregexp.cpp 15

4375

4376

Note that if you want to iterate over the list, you should iterate

4377

over a copy, e.g.

4378

\snippet code/src_corelib_tools_qregexp.cpp 16

4379

4380

Some regexps can match an indeterminate number of times. For

4381

example if the input string is "Offsets: 12 14 99 231 7" and the

4382

regexp, \c{rx}, is \b{(\\d+)+}, we would hope to get a list of

4383

all the numbers matched. However, after calling

4384

\c{rx.indexIn(str)}, capturedTexts() will return the list ("12",

4385

"12"), i.e. the entire match was "12" and the first subexpression

4386

matched was "12". The correct approach is to use cap() in a

4387

\l{QRegExp#cap_in_a_loop}{loop}.

4388

4389

The order of elements in the string list is as follows. The first

4390

element is the entire matching string. Each subsequent element

4391

corresponds to the next capturing open left parentheses. Thus

4392

capturedTexts()[1] is the text of the first capturing parentheses,

4393

capturedTexts()[2] is the text of the second and so on

4394

(corresponding to $1, $2, etc., in some other regexp languages).

\sa cap(), pos()

QStringList QRegExp::capturedTexts() const

4399

{

4400

if (priv->capturedCache.isEmpty()) {

4401

prepareEngine(priv);

4402

const int *captured = priv->matchState.captured;

4403

int n = priv->matchState.capturedSize;

4404

4405

for (int i = 0; i < n; i += 2) {

4406

QString m;

4407

if (captured[i + 1] == 0)

4408

m = QLatin1String(""); // ### Qt 5: don't distinguish between null and empty

4409

else if (captured[i] >= 0)

4410

m = priv->t.mid(captured[i], captured[i + 1]);

4411

priv->capturedCache.append(m);

}

priv->t.clear();

}

return priv->capturedCache;

}

/*!

\internal

QStringList QRegExp::capturedTexts()

4422

{

4423

return const_cast<const QRegExp *>(this)->capturedTexts();

}

/*!

Returns the text captured by the \a nth subexpression. The entire

4428

match has index 0 and the parenthesized subexpressions have

4429

indexes starting from 1 (excluding non-capturing parentheses).

4430

4431

\snippet code/src_corelib_tools_qregexp.cpp 17

4432

4433

The order of elements matched by cap() is as follows. The first

4434

element, cap(0), is the entire matching string. Each subsequent

4435

element corresponds to the next capturing open left parentheses.

4436

Thus cap(1) is the text of the first capturing parentheses, cap(2)

4437

is the text of the second, and so on.

4438

4439

\sa capturedTexts(), pos()

4440

4441

QString QRegExp::cap(int nth) const

4442

{

4443

return capturedTexts().value(nth);

}

/*!

\internal

QString QRegExp::cap(int nth)

4450

{

4451

return const_cast<const QRegExp *>(this)->cap(nth);

}

/*!

Returns the position of the \a nth captured text in the searched

4456

string. If \a nth is 0 (the default), pos() returns the position

of the whole match.

Example:

\snippet code/src_corelib_tools_qregexp.cpp 18

4461

4462

For zero-length matches, pos() always returns -1. (For example, if

4463

cap(4) would return an empty string, pos(4) returns -1.) This is

4464

a feature of the implementation.

4465

4466

\sa cap(), capturedTexts()

4467

4468

int QRegExp::pos(int nth) const

4469

{

4470

if (nth < 0 || nth >= priv->matchState.capturedSize / 2)

4471

return -1;

4472

else

4473

return priv->matchState.captured[2 * nth];

}

/*!

\internal

int QRegExp::pos(int nth)

4480

{

4481

return const_cast<const QRegExp *>(this)->pos(nth);

}

/*!

Returns a text string that explains why a regexp pattern is

4486

invalid the case being; otherwise returns "no error occurred".

\sa isValid()

QString QRegExp::errorString() const

4491

{

4492

if (isValid()) {

4493

return QString::fromLatin1(RXERR_OK);

4494

} else {

4495

return priv->eng->errorString();

}

}

/*!

\internal

QString QRegExp::errorString()

4503

{

4504

return const_cast<const QRegExp *>(this)->errorString();

}

#endif

/*!

Returns the string \a str with every regexp special character

4510

escaped with a backslash. The special characters are $, (,), *, +,

4511

., ?, [, \,], ^, {, | and }.

Example:

\snippet code/src_corelib_tools_qregexp.cpp 19

4516

4517

This function is useful to construct regexp patterns dynamically:

4518

4519

\snippet code/src_corelib_tools_qregexp.cpp 20

4520

4521

\sa setPatternSyntax()

4522

4523

QString QRegExp::escape(const QString &str)

4524

{

4525

QString quoted;

4526

const int count = str.count();

4527

quoted.reserve(count * 2);

4528

const QLatin1Char backslash('\\');

4529

for (int i = 0; i < count; i++) {

4530

switch (str.at(i).toLatin1()) {

case '$':

case '(':

case ')':

case '*':

case '+':

case '.':

case '?':

case '[':

case '\\':

case ']':

case '^':

case '{':

case '|':

case '}':

quoted.append(backslash);

4546

}

4547

quoted.append(str.at(i));

}

return quoted;

}

#ifndef QT_NO_DATASTREAM

/*!

\relates QRegExp

Writes the regular expression \a regExp to stream \a out.

4558

4559

\sa {Serializing Qt Data Types}

4560

4561

QDataStream &operator<<(QDataStream &out, const QRegExp &regExp)

4562

{

4563

return out << regExp.pattern() << (quint8)regExp.caseSensitivity()

4564

<< (quint8)regExp.patternSyntax()

4565

<< (quint8)!!regExp.isMinimal();

}

/*!

\relates QRegExp

Reads a regular expression from stream \a in into \a regExp.

4572

4573

\sa {Serializing Qt Data Types}

4574

4575

QDataStream &operator>>(QDataStream &in, QRegExp &regExp)

{

QString pattern;

quint8 cs;

quint8 patternSyntax;

4580

quint8 isMinimal;

4581

4582

in >> pattern >> cs >> patternSyntax >> isMinimal;

4583

4584

QRegExp newRegExp(pattern, Qt::CaseSensitivity(cs),

4585

QRegExp::PatternSyntax(patternSyntax));

4586

4587

newRegExp.setMinimal(isMinimal);

regExp = newRegExp;

return in;

}

#endif // QT_NO_DATASTREAM

4592

4593

#ifndef QT_NO_DEBUG_STREAM

4594

// Streams a QRegExp to QDebug as QRegExp(patternSyntax=<syntax>, pattern='<pattern>').
QDebug operator<<(QDebug dbg, const QRegExp &r)
{
    // Scoped saver for the debug stream's state, which nospace() changes below.
    QDebugStateSaver saver(dbg);

    dbg.nospace();
    dbg << "QRegExp(patternSyntax=" << r.patternSyntax();
    dbg << ", pattern='" << r.pattern() << "')";
    return dbg;
}

#endif

QT_END_NAMESPACE

Generated by Squish Coco Non-Commercial 4.3.0-BETA-master-30-08-2018-4cb69e9