1 /*
2 * Copyright (c) 1999, 2013, Oracle and/or its affiliates. All rights reserved.
3 * ORACLE PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
4 *
5 *
6 *
7 *
8 *
9 *
10 *
11 *
12 *
13 *
14 *
15 *
16 *
17 *
18 *
19 *
20 *
21 *
22 *
23 *
24 */
25
26 package java.util.regex;
27
28 import java.util.Objects;
29
30 /**
31 * An engine that performs match operations on a {@linkplain java.lang.CharSequence
32 * character sequence} by interpreting a {@link Pattern}.
33 *
34 * <p> A matcher is created from a pattern by invoking the pattern's {@link
35 * Pattern#matcher matcher} method. Once created, a matcher can be used to
36 * perform three different kinds of match operations:
37 *
38 * <ul>
39 *
40 * <li><p> The {@link #matches matches} method attempts to match the entire
41 * input sequence against the pattern. </p></li>
42 *
43 * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
44 * input sequence, starting at the beginning, against the pattern. </p></li>
45 *
46 * <li><p> The {@link #find find} method scans the input sequence looking for
47 * the next subsequence that matches the pattern. </p></li>
48 *
49 * </ul>
50 *
51 * <p> Each of these methods returns a boolean indicating success or failure.
52 * More information about a successful match can be obtained by querying the
53 * state of the matcher.
54 *
55 * <p> A matcher finds matches in a subset of its input called the
56 * <i>region</i>. By default, the region contains all of the matcher's input.
 * The region can be modified via the {@link #region region} method and queried
58 * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
59 * methods. The way that the region boundaries interact with some pattern
60 * constructs can be changed. See {@link #useAnchoringBounds
61 * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
62 * for more details.
63 *
64 * <p> This class also defines methods for replacing matched subsequences with
65 * new strings whose contents can, if desired, be computed from the match
66 * result. The {@link #appendReplacement appendReplacement} and {@link
67 * #appendTail appendTail} methods can be used in tandem in order to collect
68 * the result into an existing string buffer, or the more convenient {@link
69 * #replaceAll replaceAll} method can be used to create a string in which every
70 * matching subsequence in the input sequence is replaced.
71 *
72 * <p> The explicit state of a matcher includes the start and end indices of
73 * the most recent successful match. It also includes the start and end
74 * indices of the input subsequence captured by each <a
75 * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
76 * count of such subsequences. As a convenience, methods are also provided for
77 * returning these captured subsequences in string form.
78 *
79 * <p> The explicit state of a matcher is initially undefined; attempting to
80 * query any part of it before a successful match will cause an {@link
81 * IllegalStateException} to be thrown. The explicit state of a matcher is
82 * recomputed by every match operation.
83 *
84 * <p> The implicit state of a matcher includes the input character sequence as
85 * well as the <i>append position</i>, which is initially zero and is updated
86 * by the {@link #appendReplacement appendReplacement} method.
87 *
88 * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
89 * method or, if a new input sequence is desired, its {@link
90 * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
91 * matcher discards its explicit state information and sets the append position
92 * to zero.
93 *
94 * <p> Instances of this class are not safe for use by multiple concurrent
95 * threads. </p>
96 *
97 *
98 * @author Mike McCloskey
99 * @author Mark Reinhold
100 * @author JSR-51 Expert Group
101 * @since 1.4
102 * @spec JSR-51
103 */
104
105 public final class Matcher implements MatchResult {
106
107 /**
108 * The Pattern object that created this Matcher.
109 */
110 Pattern parentPattern;
111
112 /**
113 * The storage used by groups. They may contain invalid values if
114 * a group was skipped during the matching.
115 */
116 int[] groups;
117
118 /**
119 * The range within the sequence that is to be matched. Anchors
120 * will match at these "hard" boundaries. Changing the region
121 * changes these values.
122 */
123 int from, to;
124
125 /**
126 * Lookbehind uses this value to ensure that the subexpression
127 * match ends at the point where the lookbehind was encountered.
128 */
129 int lookbehindTo;
130
131 /**
132 * The original string being matched.
133 */
134 CharSequence text;
135
136 /**
137 * Matcher state used by the last node. NOANCHOR is used when a
138 * match does not have to consume all of the input. ENDANCHOR is
139 * the mode used for matching all the input.
140 */
141 static final int ENDANCHOR = 1;
142 static final int NOANCHOR = 0;
143 int acceptMode = NOANCHOR;
144
145 /**
146 * The range of string that last matched the pattern. If the last
147 * match failed then first is -1; last initially holds 0 then it
148 * holds the index of the end of the last match (which is where the
149 * next search starts).
150 */
151 int first = -1, last = 0;
152
153 /**
154 * The end index of what matched in the last match operation.
155 */
156 int oldLast = -1;
157
158 /**
159 * The index of the last position appended in a substitution.
160 */
161 int lastAppendPosition = 0;
162
163 /**
164 * Storage used by nodes to tell what repetition they are on in
165 * a pattern, and where groups begin. The nodes themselves are stateless,
166 * so they rely on this field to hold state during a match.
167 */
168 int[] locals;
169
170 /**
171 * Boolean indicating whether or not more input could change
172 * the results of the last match.
173 *
174 * If hitEnd is true, and a match was found, then more input
175 * might cause a different match to be found.
176 * If hitEnd is true and a match was not found, then more
177 * input could cause a match to be found.
178 * If hitEnd is false and a match was found, then more input
179 * will not change the match.
180 * If hitEnd is false and a match was not found, then more
181 * input will not cause a match to be found.
182 */
183 boolean hitEnd;
184
185 /**
186 * Boolean indicating whether or not more input could change
187 * a positive match into a negative one.
188 *
189 * If requireEnd is true, and a match was found, then more
190 * input could cause the match to be lost.
191 * If requireEnd is false and a match was found, then more
192 * input might change the match but the match won't be lost.
193 * If a match was not found, then requireEnd has no meaning.
194 */
195 boolean requireEnd;
196
197 /**
198 * If transparentBounds is true then the boundaries of this
199 * matcher's region are transparent to lookahead, lookbehind,
200 * and boundary matching constructs that try to see beyond them.
201 */
202 boolean transparentBounds = false;
203
204 /**
205 * If anchoringBounds is true then the boundaries of this
206 * matcher's region match anchors such as ^ and $.
207 */
208 boolean anchoringBounds = true;
209
/**
 * Package-private no-argument constructor. It performs no initialization:
 * every field keeps its declared default, so {@code parentPattern},
 * {@code text}, {@code groups} and {@code locals} are all left
 * {@code null}.
 */
Matcher() {
}
215
216 /**
217 * All matchers have the state used by Pattern during a match.
218 */
219 Matcher(Pattern parent, CharSequence text) {
220 this.parentPattern = parent;
221 this.text = text;
222
223 // Allocate state storage
224 int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
225 groups = new int[parentGroupCount * 2];
226 locals = new int[parent.localCount];
227
228 // Put fields into initial states
229 reset();
230 }
231
232 /**
233 * Returns the pattern that is interpreted by this matcher.
234 *
235 * @return The pattern for which this matcher was created
236 */
237 public Pattern pattern() {
238 return parentPattern;
239 }
240
241 /**
242 * Returns the match state of this matcher as a {@link MatchResult}.
243 * The result is unaffected by subsequent operations performed upon this
244 * matcher.
245 *
246 * @return a <code>MatchResult</code> with the state of this matcher
247 * @since 1.5
248 */
249 public MatchResult toMatchResult() {
250 Matcher result = new Matcher(this.parentPattern, text.toString());
251 result.first = this.first;
252 result.last = this.last;
253 result.groups = this.groups.clone();
254 return result;
255 }
256
257 /**
258 * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
259 * find matches with.
260 *
261 * <p> This method causes this matcher to lose information
262 * about the groups of the last match that occurred. The
263 * matcher's position in the input is maintained and its
264 * last append position is unaffected.</p>
265 *
266 * @param newPattern
267 * The new pattern used by this matcher
268 * @return This matcher
269 * @throws IllegalArgumentException
270 * If newPattern is <tt>null</tt>
271 * @since 1.5
272 */
273 public Matcher usePattern(Pattern newPattern) {
274 if (newPattern == null)
275 throw new IllegalArgumentException("Pattern cannot be null");
276 parentPattern = newPattern;
277
278 // Reallocate state storage
279 int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
280 groups = new int[parentGroupCount * 2];
281 locals = new int[newPattern.localCount];
282 for (int i = 0; i < groups.length; i++)
283 groups[i] = -1;
284 for (int i = 0; i < locals.length; i++)
285 locals[i] = -1;
286 return this;
287 }
288
289 /**
290 * Resets this matcher.
291 *
292 * <p> Resetting a matcher discards all of its explicit state information
293 * and sets its append position to zero. The matcher's region is set to the
294 * default region, which is its entire character sequence. The anchoring
295 * and transparency of this matcher's region boundaries are unaffected.
296 *
297 * @return This matcher
298 */
299 public Matcher reset() {
300 first = -1;
301 last = 0;
302 oldLast = -1;
303 for(int i=0; i<groups.length; i++)
304 groups[i] = -1;
305 for(int i=0; i<locals.length; i++)
306 locals[i] = -1;
307 lastAppendPosition = 0;
308 from = 0;
309 to = getTextLength();
310 return this;
311 }
312
313 /**
314 * Resets this matcher with a new input sequence.
315 *
316 * <p> Resetting a matcher discards all of its explicit state information
317 * and sets its append position to zero. The matcher's region is set to
318 * the default region, which is its entire character sequence. The
319 * anchoring and transparency of this matcher's region boundaries are
320 * unaffected.
321 *
322 * @param input
323 * The new input character sequence
324 *
325 * @return This matcher
326 */
327 public Matcher reset(CharSequence input) {
328 text = input;
329 return reset();
330 }
331
332 /**
333 * Returns the start index of the previous match.
334 *
335 * @return The index of the first character matched
336 *
337 * @throws IllegalStateException
338 * If no match has yet been attempted,
339 * or if the previous match operation failed
340 */
341 public int start() {
342 if (first < 0)
343 throw new IllegalStateException("No match available");
344 return first;
345 }
346
347 /**
348 * Returns the start index of the subsequence captured by the given group
349 * during the previous match operation.
350 *
351 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
352 * to right, starting at one. Group zero denotes the entire pattern, so
353 * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
354 * <i>m.</i><tt>start()</tt>. </p>
355 *
356 * @param group
357 * The index of a capturing group in this matcher's pattern
358 *
359 * @return The index of the first character captured by the group,
360 * or <tt>-1</tt> if the match was successful but the group
361 * itself did not match anything
362 *
363 * @throws IllegalStateException
364 * If no match has yet been attempted,
365 * or if the previous match operation failed
366 *
367 * @throws IndexOutOfBoundsException
368 * If there is no capturing group in the pattern
369 * with the given index
370 */
371 public int start(int group) {
372 if (first < 0)
373 throw new IllegalStateException("No match available");
374 if (group < 0 || group > groupCount())
375 throw new IndexOutOfBoundsException("No group " + group);
376 return groups[group * 2];
377 }
378
379 /**
380 * Returns the start index of the subsequence captured by the given
381 * <a href="Pattern.html#groupname">named-capturing group</a> during the
382 * previous match operation.
383 *
384 * @param name
385 * The name of a named-capturing group in this matcher's pattern
386 *
387 * @return The index of the first character captured by the group,
388 * or {@code -1} if the match was successful but the group
389 * itself did not match anything
390 *
391 * @throws IllegalStateException
392 * If no match has yet been attempted,
393 * or if the previous match operation failed
394 *
395 * @throws IllegalArgumentException
396 * If there is no capturing group in the pattern
397 * with the given name
398 * @since 1.8
399 */
400 public int start(String name) {
401 return groups[getMatchedGroupIndex(name) * 2];
402 }
403
404 /**
405 * Returns the offset after the last character matched.
406 *
407 * @return The offset after the last character matched
408 *
409 * @throws IllegalStateException
410 * If no match has yet been attempted,
411 * or if the previous match operation failed
412 */
413 public int end() {
414 if (first < 0)
415 throw new IllegalStateException("No match available");
416 return last;
417 }
418
419 /**
420 * Returns the offset after the last character of the subsequence
421 * captured by the given group during the previous match operation.
422 *
423 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
424 * to right, starting at one. Group zero denotes the entire pattern, so
425 * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
426 * <i>m.</i><tt>end()</tt>. </p>
427 *
428 * @param group
429 * The index of a capturing group in this matcher's pattern
430 *
431 * @return The offset after the last character captured by the group,
432 * or <tt>-1</tt> if the match was successful
433 * but the group itself did not match anything
434 *
435 * @throws IllegalStateException
436 * If no match has yet been attempted,
437 * or if the previous match operation failed
438 *
439 * @throws IndexOutOfBoundsException
440 * If there is no capturing group in the pattern
441 * with the given index
442 */
443 public int end(int group) {
444 if (first < 0)
445 throw new IllegalStateException("No match available");
446 if (group < 0 || group > groupCount())
447 throw new IndexOutOfBoundsException("No group " + group);
448 return groups[group * 2 + 1];
449 }
450
451 /**
452 * Returns the offset after the last character of the subsequence
453 * captured by the given <a href="Pattern.html#groupname">named-capturing
454 * group</a> during the previous match operation.
455 *
456 * @param name
457 * The name of a named-capturing group in this matcher's pattern
458 *
459 * @return The offset after the last character captured by the group,
460 * or {@code -1} if the match was successful
461 * but the group itself did not match anything
462 *
463 * @throws IllegalStateException
464 * If no match has yet been attempted,
465 * or if the previous match operation failed
466 *
467 * @throws IllegalArgumentException
468 * If there is no capturing group in the pattern
469 * with the given name
470 * @since 1.8
471 */
472 public int end(String name) {
473 return groups[getMatchedGroupIndex(name) * 2 + 1];
474 }
475
476 /**
477 * Returns the input subsequence matched by the previous match.
478 *
479 * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
480 * the expressions <i>m.</i><tt>group()</tt> and
481 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt>
482 * are equivalent. </p>
483 *
484 * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
485 * string. This method will return the empty string when the pattern
486 * successfully matches the empty string in the input. </p>
487 *
488 * @return The (possibly empty) subsequence matched by the previous match,
489 * in string form
490 *
491 * @throws IllegalStateException
492 * If no match has yet been attempted,
493 * or if the previous match operation failed
494 */
495 public String group() {
496 return group(0);
497 }
498
499 /**
500 * Returns the input subsequence captured by the given group during the
501 * previous match operation.
502 *
503 * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
504 * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
505 * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
506 * are equivalent. </p>
507 *
508 * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
509 * to right, starting at one. Group zero denotes the entire pattern, so
510 * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
511 * </p>
512 *
513 * <p> If the match was successful but the group specified failed to match
514 * any part of the input sequence, then <tt>null</tt> is returned. Note
515 * that some groups, for example <tt>(a*)</tt>, match the empty string.
516 * This method will return the empty string when such a group successfully
517 * matches the empty string in the input. </p>
518 *
519 * @param group
520 * The index of a capturing group in this matcher's pattern
521 *
522 * @return The (possibly empty) subsequence captured by the group
523 * during the previous match, or <tt>null</tt> if the group
524 * failed to match part of the input
525 *
526 * @throws IllegalStateException
527 * If no match has yet been attempted,
528 * or if the previous match operation failed
529 *
530 * @throws IndexOutOfBoundsException
531 * If there is no capturing group in the pattern
532 * with the given index
533 */
534 public String group(int group) {
535 if (first < 0)
536 throw new IllegalStateException("No match found");
537 if (group < 0 || group > groupCount())
538 throw new IndexOutOfBoundsException("No group " + group);
539 if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
540 return null;
541 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
542 }
543
544 /**
545 * Returns the input subsequence captured by the given
546 * <a href="Pattern.html#groupname">named-capturing group</a> during the previous
547 * match operation.
548 *
549 * <p> If the match was successful but the group specified failed to match
550 * any part of the input sequence, then <tt>null</tt> is returned. Note
551 * that some groups, for example <tt>(a*)</tt>, match the empty string.
552 * This method will return the empty string when such a group successfully
553 * matches the empty string in the input. </p>
554 *
555 * @param name
556 * The name of a named-capturing group in this matcher's pattern
557 *
558 * @return The (possibly empty) subsequence captured by the named group
559 * during the previous match, or <tt>null</tt> if the group
560 * failed to match part of the input
561 *
562 * @throws IllegalStateException
563 * If no match has yet been attempted,
564 * or if the previous match operation failed
565 *
566 * @throws IllegalArgumentException
567 * If there is no capturing group in the pattern
568 * with the given name
569 * @since 1.7
570 */
571 public String group(String name) {
572 int group = getMatchedGroupIndex(name);
573 if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
574 return null;
575 return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
576 }
577
578 /**
579 * Returns the number of capturing groups in this matcher's pattern.
580 *
581 * <p> Group zero denotes the entire pattern by convention. It is not
582 * included in this count.
583 *
584 * <p> Any non-negative integer smaller than or equal to the value
585 * returned by this method is guaranteed to be a valid group index for
586 * this matcher. </p>
587 *
588 * @return The number of capturing groups in this matcher's pattern
589 */
590 public int groupCount() {
591 return parentPattern.capturingGroupCount - 1;
592 }
593
594 /**
595 * Attempts to match the entire region against the pattern.
596 *
597 * <p> If the match succeeds then more information can be obtained via the
598 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
599 *
600 * @return <tt>true</tt> if, and only if, the entire region sequence
601 * matches this matcher's pattern
602 */
603 public boolean matches() {
604 return match(from, ENDANCHOR);
605 }
606
607 /**
608 * Attempts to find the next subsequence of the input sequence that matches
609 * the pattern.
610 *
611 * <p> This method starts at the beginning of this matcher's region, or, if
612 * a previous invocation of the method was successful and the matcher has
613 * not since been reset, at the first character not matched by the previous
614 * match.
615 *
616 * <p> If the match succeeds then more information can be obtained via the
617 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
618 *
619 * @return <tt>true</tt> if, and only if, a subsequence of the input
620 * sequence matches this matcher's pattern
621 */
622 public boolean find() {
623 int nextSearchIndex = last;
624 if (nextSearchIndex == first)
625 nextSearchIndex++;
626
627 // If next search starts before region, start it at region
628 if (nextSearchIndex < from)
629 nextSearchIndex = from;
630
631 // If next search starts beyond region then it fails
632 if (nextSearchIndex > to) {
633 for (int i = 0; i < groups.length; i++)
634 groups[i] = -1;
635 return false;
636 }
637 return search(nextSearchIndex);
638 }
639
640 /**
641 * Resets this matcher and then attempts to find the next subsequence of
642 * the input sequence that matches the pattern, starting at the specified
643 * index.
644 *
645 * <p> If the match succeeds then more information can be obtained via the
646 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
647 * invocations of the {@link #find()} method will start at the first
648 * character not matched by this match. </p>
649 *
650 * @param start the index to start searching for a match
651 * @throws IndexOutOfBoundsException
652 * If start is less than zero or if start is greater than the
653 * length of the input sequence.
654 *
655 * @return <tt>true</tt> if, and only if, a subsequence of the input
656 * sequence starting at the given index matches this matcher's
657 * pattern
658 */
659 public boolean find(int start) {
660 int limit = getTextLength();
661 if ((start < 0) || (start > limit))
662 throw new IndexOutOfBoundsException("Illegal start index");
663 reset();
664 return search(start);
665 }
666
667 /**
668 * Attempts to match the input sequence, starting at the beginning of the
669 * region, against the pattern.
670 *
671 * <p> Like the {@link #matches matches} method, this method always starts
672 * at the beginning of the region; unlike that method, it does not
673 * require that the entire region be matched.
674 *
675 * <p> If the match succeeds then more information can be obtained via the
676 * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
677 *
678 * @return <tt>true</tt> if, and only if, a prefix of the input
679 * sequence matches this matcher's pattern
680 */
681 public boolean lookingAt() {
682 return match(from, NOANCHOR);
683 }
684
685 /**
686 * Returns a literal replacement <code>String</code> for the specified
687 * <code>String</code>.
688 *
689 * This method produces a <code>String</code> that will work
690 * as a literal replacement <code>s</code> in the
691 * <code>appendReplacement</code> method of the {@link Matcher} class.
692 * The <code>String</code> produced will match the sequence of characters
 * in <code>s</code> treated as a literal sequence. Backslashes ('\') and
 * dollar signs ('$') will be given no special meaning.
695 *
696 * @param s The string to be literalized
697 * @return A literal string replacement
698 * @since 1.5
699 */
700 public static String quoteReplacement(String s) {
701 if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
702 return s;
703 StringBuilder sb = new StringBuilder();
704 for (int i=0; i<s.length(); i++) {
705 char c = s.charAt(i);
706 if (c == '\\' || c == '$') {
707 sb.append('\\');
708 }
709 sb.append(c);
710 }
711 return sb.toString();
712 }
713
714 /**
715 * Implements a non-terminal append-and-replace step.
716 *
717 * <p> This method performs the following actions: </p>
718 *
719 * <ol>
720 *
721 * <li><p> It reads characters from the input sequence, starting at the
722 * append position, and appends them to the given string buffer. It
723 * stops after reading the last character preceding the previous match,
724 * that is, the character at index {@link
725 * #start()} <tt>-</tt> <tt>1</tt>. </p></li>
726 *
727 * <li><p> It appends the given replacement string to the string buffer.
728 * </p></li>
729 *
730 * <li><p> It sets the append position of this matcher to the index of
731 * the last character matched, plus one, that is, to {@link #end()}.
732 * </p></li>
733 *
734 * </ol>
735 *
736 * <p> The replacement string may contain references to subsequences
737 * captured during the previous match: Each occurrence of
738 * <tt>${</tt><i>name</i><tt>}</tt> or <tt>$</tt><i>g</i>
739 * will be replaced by the result of evaluating the corresponding
740 * {@link #group(String) group(name)} or {@link #group(int) group(g)}
741 * respectively. For <tt>$</tt><i>g</i>,
742 * the first number after the <tt>$</tt> is always treated as part of
743 * the group reference. Subsequent numbers are incorporated into g if
744 * they would form a legal group reference. Only the numerals '0'
745 * through '9' are considered as potential components of the group
746 * reference. If the second group matched the string <tt>"foo"</tt>, for
747 * example, then passing the replacement string <tt>"$2bar"</tt> would
748 * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
749 * sign (<tt>$</tt>) may be included as a literal in the replacement
750 * string by preceding it with a backslash (<tt>\$</tt>).
751 *
752 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
753 * the replacement string may cause the results to be different than if it
754 * were being treated as a literal replacement string. Dollar signs may be
755 * treated as references to captured subsequences as described above, and
756 * backslashes are used to escape literal characters in the replacement
757 * string.
758 *
759 * <p> This method is intended to be used in a loop together with the
760 * {@link #appendTail appendTail} and {@link #find find} methods. The
761 * following code, for example, writes <tt>one dog two dogs in the
762 * yard</tt> to the standard-output stream: </p>
763 *
764 * <blockquote><pre>
765 * Pattern p = Pattern.compile("cat");
766 * Matcher m = p.matcher("one cat two cats in the yard");
767 * StringBuffer sb = new StringBuffer();
768 * while (m.find()) {
769 * m.appendReplacement(sb, "dog");
770 * }
771 * m.appendTail(sb);
772 * System.out.println(sb.toString());</pre></blockquote>
773 *
774 * @param sb
775 * The target string buffer
776 *
777 * @param replacement
778 * The replacement string
779 *
780 * @return This matcher
781 *
782 * @throws IllegalStateException
783 * If no match has yet been attempted,
784 * or if the previous match operation failed
785 *
786 * @throws IllegalArgumentException
787 * If the replacement string refers to a named-capturing
788 * group that does not exist in the pattern
789 *
790 * @throws IndexOutOfBoundsException
791 * If the replacement string refers to a capturing group
792 * that does not exist in the pattern
793 */
794 public Matcher appendReplacement(StringBuffer sb, String replacement) {
795
796 // If no match, return error
797 if (first < 0)
798 throw new IllegalStateException("No match available");
799
800 // Process substitution string to replace group references with groups
801 int cursor = 0;
802 StringBuilder result = new StringBuilder();
803
804 while (cursor < replacement.length()) {
805 char nextChar = replacement.charAt(cursor);
806 if (nextChar == '\\') {
807 cursor++;
808 if (cursor == replacement.length())
809 throw new IllegalArgumentException(
810 "character to be escaped is missing");
811 nextChar = replacement.charAt(cursor);
812 result.append(nextChar);
813 cursor++;
814 } else if (nextChar == '$') {
815 // Skip past $
816 cursor++;
817 // Throw IAE if this "$" is the last character in replacement
818 if (cursor == replacement.length())
819 throw new IllegalArgumentException(
820 "Illegal group reference: group index is missing");
821 nextChar = replacement.charAt(cursor);
822 int refNum = -1;
823 if (nextChar == '{') {
824 cursor++;
825 StringBuilder gsb = new StringBuilder();
826 while (cursor < replacement.length()) {
827 nextChar = replacement.charAt(cursor);
828 if (ASCII.isLower(nextChar) ||
829 ASCII.isUpper(nextChar) ||
830 ASCII.isDigit(nextChar)) {
831 gsb.append(nextChar);
832 cursor++;
833 } else {
834 break;
835 }
836 }
837 if (gsb.length() == 0)
838 throw new IllegalArgumentException(
839 "named capturing group has 0 length name");
840 if (nextChar != '}')
841 throw new IllegalArgumentException(
842 "named capturing group is missing trailing '}'");
843 String gname = gsb.toString();
844 if (ASCII.isDigit(gname.charAt(0)))
845 throw new IllegalArgumentException(
846 "capturing group name {" + gname +
847 "} starts with digit character");
848 if (!parentPattern.namedGroups().containsKey(gname))
849 throw new IllegalArgumentException(
850 "No group with name {" + gname + "}");
851 refNum = parentPattern.namedGroups().get(gname);
852 cursor++;
853 } else {
854 // The first number is always a group
855 refNum = (int)nextChar - '0';
856 if ((refNum < 0)||(refNum > 9))
857 throw new IllegalArgumentException(
858 "Illegal group reference");
859 cursor++;
860 // Capture the largest legal group string
861 boolean done = false;
862 while (!done) {
863 if (cursor >= replacement.length()) {
864 break;
865 }
866 int nextDigit = replacement.charAt(cursor) - '0';
867 if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
868 break;
869 }
870 int newRefNum = (refNum * 10) + nextDigit;
871 if (groupCount() < newRefNum) {
872 done = true;
873 } else {
874 refNum = newRefNum;
875 cursor++;
876 }
877 }
878 }
879 // Append group
880 if (start(refNum) != -1 && end(refNum) != -1)
881 result.append(text, start(refNum), end(refNum));
882 } else {
883 result.append(nextChar);
884 cursor++;
885 }
886 }
887 // Append the intervening text
888 sb.append(text, lastAppendPosition, first);
889 // Append the match substitution
890 sb.append(result);
891
892 lastAppendPosition = last;
893 return this;
894 }
895
896 /**
897 * Implements a terminal append-and-replace step.
898 *
899 * <p> This method reads characters from the input sequence, starting at
900 * the append position, and appends them to the given string buffer. It is
901 * intended to be invoked after one or more invocations of the {@link
902 * #appendReplacement appendReplacement} method in order to copy the
903 * remainder of the input sequence. </p>
904 *
905 * @param sb
906 * The target string buffer
907 *
908 * @return The target string buffer
909 */
910 public StringBuffer appendTail(StringBuffer sb) {
911 sb.append(text, lastAppendPosition, getTextLength());
912 return sb;
913 }
914
915 /**
916 * Replaces every subsequence of the input sequence that matches the
917 * pattern with the given replacement string.
918 *
919 * <p> This method first resets this matcher. It then scans the input
920 * sequence looking for matches of the pattern. Characters that are not
921 * part of any match are appended directly to the result string; each match
922 * is replaced in the result by the replacement string. The replacement
923 * string may contain references to captured subsequences as in the {@link
924 * #appendReplacement appendReplacement} method.
925 *
926 * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
927 * the replacement string may cause the results to be different than if it
928 * were being treated as a literal replacement string. Dollar signs may be
929 * treated as references to captured subsequences as described above, and
930 * backslashes are used to escape literal characters in the replacement
931 * string.
932 *
933 * <p> Given the regular expression <tt>a*b</tt>, the input
934 * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
935 * <tt>"-"</tt>, an invocation of this method on a matcher for that
936 * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
937 *
938 * <p> Invoking this method changes this matcher's state. If the matcher
939 * is to be used in further matching operations then it should first be
940 * reset. </p>
941 *
942 * @param replacement
943 * The replacement string
944 *
945 * @return The string constructed by replacing each matching subsequence
946 * by the replacement string, substituting captured subsequences
947 * as needed
948 */
949 public String replaceAll(String replacement) {
950 reset();
951 boolean result = find();
952 if (result) {
953 StringBuffer sb = new StringBuffer();
954 do {
955 appendReplacement(sb, replacement);
956 result = find();
957 } while (result);
958 appendTail(sb);
959 return sb.toString();
960 }
961 return text.toString();
962 }
963
964 /**
965 * Replaces the first subsequence of the input sequence that matches the
966 * pattern with the given replacement string.
967 *
968 * <p> This method first resets this matcher. It then scans the input
969 * sequence looking for a match of the pattern. Characters that are not
970 * part of the match are appended directly to the result string; the match
971 * is replaced in the result by the replacement string. The replacement
972 * string may contain references to captured subsequences as in the {@link
973 * #appendReplacement appendReplacement} method.
974 *
975 * <p>Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
976 * the replacement string may cause the results to be different than if it
977 * were being treated as a literal replacement string. Dollar signs may be
978 * treated as references to captured subsequences as described above, and
979 * backslashes are used to escape literal characters in the replacement
980 * string.
981 *
982 * <p> Given the regular expression <tt>dog</tt>, the input
983 * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
984 * <tt>"cat"</tt>, an invocation of this method on a matcher for that
985 * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
986 *
987 * <p> Invoking this method changes this matcher's state. If the matcher
988 * is to be used in further matching operations then it should first be
989 * reset. </p>
990 *
991 * @param replacement
992 * The replacement string
993 * @return The string constructed by replacing the first matching
994 * subsequence by the replacement string, substituting captured
995 * subsequences as needed
996 */
997 public String replaceFirst(String replacement) {
998 if (replacement == null)
999 throw new NullPointerException("replacement");
1000 reset();
1001 if (!find())
1002 return text.toString();
1003 StringBuffer sb = new StringBuffer();
1004 appendReplacement(sb, replacement);
1005 appendTail(sb);
1006 return sb.toString();
1007 }
1008
1009 /**
1010 * Sets the limits of this matcher's region. The region is the part of the
1011 * input sequence that will be searched to find a match. Invoking this
1012 * method resets the matcher, and then sets the region to start at the
1013 * index specified by the <code>start</code> parameter and end at the
1014 * index specified by the <code>end</code> parameter.
1015 *
1016 * <p>Depending on the transparency and anchoring being used (see
1017 * {@link #useTransparentBounds useTransparentBounds} and
1018 * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
1019 * as anchors may behave differently at or around the boundaries of the
1020 * region.
1021 *
1022 * @param start
1023 * The index to start searching at (inclusive)
1024 * @param end
1025 * The index to end searching at (exclusive)
1026 * @throws IndexOutOfBoundsException
1027 * If start or end is less than zero, if
1028 * start is greater than the length of the input sequence, if
1029 * end is greater than the length of the input sequence, or if
1030 * start is greater than end.
1031 * @return this matcher
1032 * @since 1.5
1033 */
1034 public Matcher region(int start, int end) {
1035 if ((start < 0) || (start > getTextLength()))
1036 throw new IndexOutOfBoundsException("start");
1037 if ((end < 0) || (end > getTextLength()))
1038 throw new IndexOutOfBoundsException("end");
1039 if (start > end)
1040 throw new IndexOutOfBoundsException("start > end");
1041 reset();
1042 from = start;
1043 to = end;
1044 return this;
1045 }
1046
1047 /**
1048 * Reports the start index of this matcher's region. The
1049 * searches this matcher conducts are limited to finding matches
1050 * within {@link #regionStart regionStart} (inclusive) and
1051 * {@link #regionEnd regionEnd} (exclusive).
1052 *
1053 * @return The starting point of this matcher's region
1054 * @since 1.5
1055 */
1056 public int regionStart() {
1057 return from;
1058 }
1059
1060 /**
1061 * Reports the end index (exclusive) of this matcher's region.
1062 * The searches this matcher conducts are limited to finding matches
1063 * within {@link #regionStart regionStart} (inclusive) and
1064 * {@link #regionEnd regionEnd} (exclusive).
1065 *
1066 * @return the ending point of this matcher's region
1067 * @since 1.5
1068 */
1069 public int regionEnd() {
1070 return to;
1071 }
1072
1073 /**
1074 * Queries the transparency of region bounds for this matcher.
1075 *
1076 * <p> This method returns <tt>true</tt> if this matcher uses
1077 * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
1078 * bounds.
1079 *
1080 * <p> See {@link #useTransparentBounds useTransparentBounds} for a
1081 * description of transparent and opaque bounds.
1082 *
1083 * <p> By default, a matcher uses opaque region boundaries.
1084 *
1085 * @return <tt>true</tt> iff this matcher is using transparent bounds,
1086 * <tt>false</tt> otherwise.
1087 * @see java.util.regex.Matcher#useTransparentBounds(boolean)
1088 * @since 1.5
1089 */
1090 public boolean hasTransparentBounds() {
1091 return transparentBounds;
1092 }
1093
1094 /**
1095 * Sets the transparency of region bounds for this matcher.
1096 *
1097 * <p> Invoking this method with an argument of <tt>true</tt> will set this
1098 * matcher to use <i>transparent</i> bounds. If the boolean
1099 * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
1100 *
1101 * <p> Using transparent bounds, the boundaries of this
1102 * matcher's region are transparent to lookahead, lookbehind,
1103 * and boundary matching constructs. Those constructs can see beyond the
1104 * boundaries of the region to see if a match is appropriate.
1105 *
1106 * <p> Using opaque bounds, the boundaries of this matcher's
1107 * region are opaque to lookahead, lookbehind, and boundary matching
1108 * constructs that may try to see beyond them. Those constructs cannot
1109 * look past the boundaries so they will fail to match anything outside
1110 * of the region.
1111 *
1112 * <p> By default, a matcher uses opaque bounds.
1113 *
1114 * @param b a boolean indicating whether to use opaque or transparent
1115 * regions
1116 * @return this matcher
1117 * @see java.util.regex.Matcher#hasTransparentBounds
1118 * @since 1.5
1119 */
1120 public Matcher useTransparentBounds(boolean b) {
1121 transparentBounds = b;
1122 return this;
1123 }
1124
1125 /**
1126 * Queries the anchoring of region bounds for this matcher.
1127 *
1128 * <p> This method returns <tt>true</tt> if this matcher uses
1129 * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
1130 *
1131 * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
1132 * description of anchoring bounds.
1133 *
1134 * <p> By default, a matcher uses anchoring region boundaries.
1135 *
1136 * @return <tt>true</tt> iff this matcher is using anchoring bounds,
1137 * <tt>false</tt> otherwise.
1138 * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
1139 * @since 1.5
1140 */
1141 public boolean hasAnchoringBounds() {
1142 return anchoringBounds;
1143 }
1144
1145 /**
1146 * Sets the anchoring of region bounds for this matcher.
1147 *
1148 * <p> Invoking this method with an argument of <tt>true</tt> will set this
1149 * matcher to use <i>anchoring</i> bounds. If the boolean
1150 * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
1151 * used.
1152 *
1153 * <p> Using anchoring bounds, the boundaries of this
1154 * matcher's region match anchors such as ^ and $.
1155 *
1156 * <p> Without anchoring bounds, the boundaries of this
1157 * matcher's region will not match anchors such as ^ and $.
1158 *
1159 * <p> By default, a matcher uses anchoring region boundaries.
1160 *
1161 * @param b a boolean indicating whether or not to use anchoring bounds.
1162 * @return this matcher
1163 * @see java.util.regex.Matcher#hasAnchoringBounds
1164 * @since 1.5
1165 */
1166 public Matcher useAnchoringBounds(boolean b) {
1167 anchoringBounds = b;
1168 return this;
1169 }
1170
1171 /**
1172 * <p>Returns the string representation of this matcher. The
1173 * string representation of a <code>Matcher</code> contains information
1174 * that may be useful for debugging. The exact format is unspecified.
1175 *
1176 * @return The string representation of this matcher
1177 * @since 1.5
1178 */
1179 public String toString() {
1180 StringBuilder sb = new StringBuilder();
1181 sb.append("java.util.regex.Matcher");
1182 sb.append("[pattern=" + pattern());
1183 sb.append(" region=");
1184 sb.append(regionStart() + "," + regionEnd());
1185 sb.append(" lastmatch=");
1186 if ((first >= 0) && (group() != null)) {
1187 sb.append(group());
1188 }
1189 sb.append("]");
1190 return sb.toString();
1191 }
1192
1193 /**
1194 * <p>Returns true if the end of input was hit by the search engine in
1195 * the last match operation performed by this matcher.
1196 *
1197 * <p>When this method returns true, then it is possible that more input
1198 * would have changed the result of the last search.
1199 *
1200 * @return true iff the end of input was hit in the last match; false
1201 * otherwise
1202 * @since 1.5
1203 */
1204 public boolean hitEnd() {
1205 return hitEnd;
1206 }
1207
1208 /**
1209 * <p>Returns true if more input could change a positive match into a
1210 * negative one.
1211 *
1212 * <p>If this method returns true, and a match was found, then more
1213 * input could cause the match to be lost. If this method returns false
1214 * and a match was found, then more input might change the match but the
1215 * match won't be lost. If a match was not found, then requireEnd has no
1216 * meaning.
1217 *
1218 * @return true iff more input could change a positive match into a
1219 * negative one.
1220 * @since 1.5
1221 */
1222 public boolean requireEnd() {
1223 return requireEnd;
1224 }
1225
1226 /**
1227 * Initiates a search to find a Pattern within the given bounds.
1228 * The groups are filled with default values and the match of the root
1229 * of the state machine is called. The state machine will hold the state
1230 * of the match as it proceeds in this matcher.
1231 *
1232 * Matcher.from is not set here, because it is the "hard" boundary
1233 * of the start of the search which anchors will set to. The from param
1234 * is the "soft" boundary of the start of the search, meaning that the
1235 * regex tries to match at that index but ^ won't match there. Subsequent
1236 * calls to the search methods start at a new "soft" boundary which is
1237 * the end of the previous match.
1238 */
1239 boolean search(int from) {
1240 this.hitEnd = false;
1241 this.requireEnd = false;
1242 from = from < 0 ? 0 : from;
1243 this.first = from;
1244 this.oldLast = oldLast < 0 ? from : oldLast;
1245 for (int i = 0; i < groups.length; i++)
1246 groups[i] = -1;
1247 acceptMode = NOANCHOR;
1248 boolean result = parentPattern.root.match(this, from, text);
1249 if (!result)
1250 this.first = -1;
1251 this.oldLast = this.last;
1252 return result;
1253 }
1254
1255 /**
1256 * Initiates a search for an anchored match to a Pattern within the given
1257 * bounds. The groups are filled with default values and the match of the
1258 * root of the state machine is called. The state machine will hold the
1259 * state of the match as it proceeds in this matcher.
1260 */
1261 boolean match(int from, int anchor) {
1262 this.hitEnd = false;
1263 this.requireEnd = false;
1264 from = from < 0 ? 0 : from;
1265 this.first = from;
1266 this.oldLast = oldLast < 0 ? from : oldLast;
1267 for (int i = 0; i < groups.length; i++)
1268 groups[i] = -1;
1269 acceptMode = anchor;
1270 boolean result = parentPattern.matchRoot.match(this, from, text);
1271 if (!result)
1272 this.first = -1;
1273 this.oldLast = this.last;
1274 return result;
1275 }
1276
1277 /**
1278 * Returns the end index of the text.
1279 *
1280 * @return the index after the last character in the text
1281 */
1282 int getTextLength() {
1283 return text.length();
1284 }
1285
1286 /**
1287 * Generates a String from this Matcher's input in the specified range.
1288 *
1289 * @param beginIndex the beginning index, inclusive
1290 * @param endIndex the ending index, exclusive
1291 * @return A String generated from this Matcher's input
1292 */
1293 CharSequence getSubSequence(int beginIndex, int endIndex) {
1294 return text.subSequence(beginIndex, endIndex);
1295 }
1296
1297 /**
1298 * Returns this Matcher's input character at index i.
1299 *
1300 * @return A char from the specified index
1301 */
1302 char charAt(int i) {
1303 return text.charAt(i);
1304 }
1305
1306 /**
1307 * Returns the group index of the matched capturing group.
1308 *
1309 * @return the index of the named-capturing group
1310 */
1311 int getMatchedGroupIndex(String name) {
1312 Objects.requireNonNull(name, "Group name");
1313 if (first < 0)
1314 throw new IllegalStateException("No match found");
1315 if (!parentPattern.namedGroups().containsKey(name))
1316 throw new IllegalArgumentException("No group with name <" + name + ">");
1317 return parentPattern.namedGroups().get(name);
1318 }
1319 }
1320