ICU 4.8.1.1  4.8.1.1
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
regex.h
Go to the documentation of this file.
1 /*
2 **********************************************************************
3 * Copyright (C) 2002-2011, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 * file name: regex.h
7 * encoding: US-ASCII
8 * indentation:4
9 *
10 * created on: 2002oct22
11 * created by: Andy Heninger
12 *
13 * ICU Regular Expressions, API for C++
14 */
15 
16 #ifndef REGEX_H
17 #define REGEX_H
18 
19 //#define REGEX_DEBUG
20 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
59 
60 struct Regex8BitSet;
61 class RegexCImpl;
62 class RegexMatcher;
63 class RegexPattern;
64 struct REStackFrame;
66 class UnicodeSet;
67 class UVector;
68 class UVector32;
69 class UVector64;
70 
// Debug-only pattern dump. The function declaration is compiled only when
// REGEX_DEBUG is defined; otherwise RegexPatternDump(pat) is a macro that
// expands to nothing, so calls to it vanish from non-debug builds.
75 #ifdef REGEX_DEBUG
77  RegexPatternDump(const RegexPattern *pat);
78 #else
79  #undef RegexPatternDump
80  #define RegexPatternDump(pat)
81 #endif
82 
83 
84 
// RegexPattern object boilerplate: default and copy construction, virtual
// destruction, equality comparison, assignment and polymorphic clone().
97 public:
98 
106  RegexPattern();
107 
114  RegexPattern(const RegexPattern &source);
115 
121  virtual ~RegexPattern();
122 
131  UBool operator==(const RegexPattern& that) const;
132 
// Defined inline as the logical negation of operator==.
141  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
142 
148  RegexPattern &operator =(const RegexPattern &source);
149 
157  virtual RegexPattern *clone() const;
158 
159 
// Static compile() overloads. Each takes the pattern text as either a
// UnicodeString or a UText, optionally a flags word and/or a UParseError,
// and always a UErrorCode; each returns a RegexPattern pointer.
// NOTE(review): ownership of the returned pointer and the meaning of the
// individual flag bits are not visible in this listing - confirm against the
// API documentation.
184  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
185  UParseError &pe,
186  UErrorCode &status);
187 
188 
215  static RegexPattern * U_EXPORT2 compile( UText *regex,
216  UParseError &pe,
217  UErrorCode &status);
218 
243  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
244  uint32_t flags,
245  UParseError &pe,
246  UErrorCode &status);
247 
248 
275  static RegexPattern * U_EXPORT2 compile( UText *regex,
276  uint32_t flags,
277  UParseError &pe,
278  UErrorCode &status);
279 
280 
303  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
304  uint32_t flags,
305  UErrorCode &status);
306 
307 
332  static RegexPattern * U_EXPORT2 compile( UText *regex,
333  uint32_t flags,
334  UErrorCode &status);
335 
336 
// Accessors and matcher factories: flags(), the matcher() overloads, the
// static one-shot matches() helpers, and the pattern text as a UnicodeString
// (pattern()) or as a UText (patternText()).
342  virtual uint32_t flags() const;
343 
361  virtual RegexMatcher *matcher(const UnicodeString &input,
362  UErrorCode &status) const;
363 
// This overload is declared in a private section, so client code cannot call
// matcher() with a raw UChar pointer.
// NOTE(review): presumably this exists to stop a UChar* being implicitly
// converted to a temporary UnicodeString - confirm.
364 private:
378  RegexMatcher *matcher(const UChar *input,
379  UErrorCode &status) const;
380 public:
381 
382 
394  virtual RegexMatcher *matcher(UErrorCode &status) const;
395 
396 
411  static UBool U_EXPORT2 matches(const UnicodeString &regex,
412  const UnicodeString &input,
413  UParseError &pe,
414  UErrorCode &status);
415 
416 
431  static UBool U_EXPORT2 matches(UText *regex,
432  UText *input,
433  UParseError &pe,
434  UErrorCode &status);
435 
436 
445  virtual UnicodeString pattern() const;
446 
447 
458  virtual UText *patternText(UErrorCode &status) const;
459 
460 
// split() overloads: the caller supplies an output array (dest) together with
// its capacity (destCapacity); one overload works on UnicodeString values,
// the other on UText pointers. Both return an int32_t.
// NOTE(review): the return value is presumably the number of fields written -
// confirm against the API documentation.
499  virtual int32_t split(const UnicodeString &input,
500  UnicodeString dest[],
501  int32_t destCapacity,
502  UErrorCode &status) const;
503 
504 
543  virtual int32_t split(UText *input,
544  UText *dest[],
545  int32_t destCapacity,
546  UErrorCode &status) const;
547 
548 
// Class-ID accessors: one virtual (per object), one static (per class).
554  virtual UClassID getDynamicClassID() const;
555 
561  static UClassID U_EXPORT2 getStaticClassID();
562 
// Private state of a compiled pattern, followed by the private helper
// methods. RegexCompile, RegexMatcher and RegexCImpl are friends and can
// access these members directly.
563 private:
564  //
565  // Implementation Data
566  //
567  UText *fPattern; // The original pattern string.
568  UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
569  uint32_t fFlags; // The flags used when compiling the pattern.
570  //
571  UVector64 *fCompiledPat; // The compiled pattern p-code.
572  UnicodeString fLiteralText; // Any literal string data from the pattern,
573  // after un-escaping, for use during the match.
574 
575  UVector *fSets; // Any UnicodeSets referenced from the pattern.
576  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
577 
578 
579  UErrorCode fDeferredStatus; // status if some prior error has left this
580  // RegexPattern in an unusable state.
581 
582  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
583  // >= this value. For some patterns, this calculated
584  // value may be less than the true shortest
585  // possible match.
586 
587  int32_t fFrameSize; // Size of a state stack frame in the
588  // execution engine.
589 
590  int32_t fDataSize; // The size of the data needed by the pattern that
591  // does not go on the state stack, but has just
592  // a single copy per matcher.
593 
594  UVector32 *fGroupMap; // Map from capture group number to position of
595  // the group's variables in the matcher stack frame.
596 
597  int32_t fMaxCaptureDigits;
598 
599  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
600  // regex character classes, e.g. Word.
601 
602  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
603  // sets for predefined regex classes.
604 
605  int32_t fStartType; // Info on how a match must start.
606  int32_t fInitialStringIdx; //
607  int32_t fInitialStringLen;
608  UnicodeSet *fInitialChars;
609  UChar32 fInitialChar;
610  Regex8BitSet *fInitialChars8;
611  UBool fNeedsAltInput;
612 
613  friend class RegexCompile;
614  friend class RegexMatcher;
615  friend class RegexCImpl;
616 
617  //
618  // Implementation Methods
619  //
620  void init(); // Common initialization, for use by constructors.
621  void zap(); // Common cleanup
// The debugging helpers below exist only when REGEX_DEBUG is defined.
622 #ifdef REGEX_DEBUG
623  void dumpOp(int32_t index) const;
624  friend void U_EXPORT2 RegexPatternDump(const RegexPattern *);
625 #endif
626 
627 };
628 
629 
630 
// RegexMatcher public constructors. Each takes the pattern (UnicodeString or
// UText) plus a flags word and a UErrorCode; two of them also take the input
// text to match against.
641 public:
642 
657  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
658 
674  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
675 
697  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
698  uint32_t flags, UErrorCode &status);
699 
721  RegexMatcher(UText *regexp, UText *input,
722  uint32_t flags, UErrorCode &status);
723 
// This constructor is declared in a private section, so client code cannot
// construct a matcher from a raw UChar pointer as input.
// NOTE(review): presumably this exists to stop a UChar* being implicitly
// converted to a temporary UnicodeString - confirm.
724 private:
738  RegexMatcher(const UnicodeString &regexp, const UChar *input,
739  uint32_t flags, UErrorCode &status);
740 public:
741 
742 
748  virtual ~RegexMatcher();
749 
750 
// Match operations. matches() and lookingAt() come with and without an
// explicit int64_t start index; find() without arguments is the only one here
// that does not take a UErrorCode.
757  virtual UBool matches(UErrorCode &status);
758 
759 
770  virtual UBool matches(int64_t startIndex, UErrorCode &status);
771 
772 
786  virtual UBool lookingAt(UErrorCode &status);
787 
788 
802  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
803 
804 
817  virtual UBool find();
818 
819 
829  virtual UBool find(int64_t start, UErrorCode &status);
830 
831 
// Capture-group access. The UnicodeString overloads return the group text by
// value; the UText overloads take a destination UText and return a UText
// pointer, two of them also reporting a length through group_len.
841  virtual UnicodeString group(UErrorCode &status) const;
842 
843 
856  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
857 
858 
864  virtual int32_t groupCount() const;
865 
866 
881  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
882 
898  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
899 
915  virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
916 
917 
// Match and group positions. Every int32_t accessor (start/end) has an
// int64_t counterpart (start64/end64) with the same parameters.
925  virtual int32_t start(UErrorCode &status) const;
926 
934  virtual int64_t start64(UErrorCode &status) const;
935 
936 
950  virtual int32_t start(int32_t group, UErrorCode &status) const;
951 
965  virtual int64_t start64(int32_t group, UErrorCode &status) const;
966 
967 
981  virtual int32_t end(UErrorCode &status) const;
982 
996  virtual int64_t end64(UErrorCode &status) const;
997 
998 
1016  virtual int32_t end(int32_t group, UErrorCode &status) const;
1017 
1035  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1036 
1037 
// Resetting and re-targeting the matcher. The reset() overloads and
// refreshInputText() return a RegexMatcher reference.
1046  virtual RegexMatcher &reset();
1047 
1048 
1064  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1065 
1066 
1084  virtual RegexMatcher &reset(const UnicodeString &input);
1085 
1086 
1100  virtual RegexMatcher &reset(UText *input);
1101 
1102 
1127  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1128 
// This overload is declared in a private section, so client code cannot
// reset the matcher onto a raw UChar pointer.
// NOTE(review): presumably this exists to stop a UChar* being implicitly
// converted to a temporary UnicodeString - confirm.
1129 private:
1143  RegexMatcher &reset(const UChar *input);
1144 public:
1145 
// Access to the current input, as a UnicodeString reference or as UText.
1153  virtual const UnicodeString &input() const;
1154 
1163  virtual UText *inputText() const;
1164 
1175  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1176 
1177 
// Region control. Region bounds are set with int64_t values; the getters
// come in int32_t (regionStart/regionEnd) and int64_t (regionStart64/
// regionEnd64) forms.
1196  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1197 
1209  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1210 
1219  virtual int32_t regionStart() const;
1220 
1229  virtual int64_t regionStart64() const;
1230 
1231 
1240  virtual int32_t regionEnd() const;
1241 
1250  virtual int64_t regionEnd64() const;
1251 
// Getter/setter pairs for the transparent-bounds and anchoring-bounds modes;
// the setters return the matcher itself.
1260  virtual UBool hasTransparentBounds() const;
1261 
1280  virtual RegexMatcher &useTransparentBounds(UBool b);
1281 
1282 
1290  virtual UBool hasAnchoringBounds() const;
1291 
1292 
1305  virtual RegexMatcher &useAnchoringBounds(UBool b);
1306 
1307 
1320  virtual UBool hitEnd() const;
1321 
1331  virtual UBool requireEnd() const;
1332 
1333 
1339  virtual const RegexPattern &pattern() const;
1340 
1341 
// Replacement operations. Each comes in a UnicodeString form and a UText
// form; the UText forms of replaceAll/replaceFirst take an explicit
// destination UText and return a UText pointer.
1358  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1359 
1360 
1381  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1382 
1383 
1404  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1405 
1406 
1431  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1432 
1433 
// Incremental replacement: appendReplacement() and appendTail() write into a
// caller-supplied destination. Note that the UnicodeString form of
// appendTail() takes no UErrorCode.
1461  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1462  const UnicodeString &replacement, UErrorCode &status);
1463 
1464 
1492  virtual RegexMatcher &appendReplacement(UText *dest,
1493  UText *replacement, UErrorCode &status);
1494 
1495 
1506  virtual UnicodeString &appendTail(UnicodeString &dest);
1507 
1508 
1522  virtual UText *appendTail(UText *dest, UErrorCode &status);
1523 
1524 
// split() overloads: same parameter lists as RegexPattern::split(), but
// these are non-const member functions.
1548  virtual int32_t split(const UnicodeString &input,
1549  UnicodeString dest[],
1550  int32_t destCapacity,
1551  UErrorCode &status);
1552 
1553 
1577  virtual int32_t split(UText *input,
1578  UText *dest[],
1579  int32_t destCapacity,
1580  UErrorCode &status);
1581 
// Resource limits: setter/getter pairs for the time limit and the stack
// limit, both expressed as int32_t values.
1603  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1604 
1611  virtual int32_t getTimeLimit() const;
1612 
1634  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1635 
1643  virtual int32_t getStackLimit() const;
1644 
1645 
// Callback registration. Each setter takes a callback pointer plus an opaque
// context pointer; the matching getter returns both through reference
// parameters.
1659  virtual void setMatchCallback(URegexMatchCallback *callback,
1660  const void *context,
1661  UErrorCode &status);
1662 
1663 
1674  virtual void getMatchCallback(URegexMatchCallback *&callback,
1675  const void *&context,
1676  UErrorCode &status);
1677 
1678 
1692  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1693  const void *context,
1694  UErrorCode &status);
1695 
1696 
1707  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1708  const void *&context,
1709  UErrorCode &status);
1710 
1711 
// Unlike most of the public interface, setTrace() is not virtual.
1717  void setTrace(UBool state);
1718 
1719 
// Class-ID accessors: one static (per class), one virtual (per object).
1725  static UClassID U_EXPORT2 getStaticClassID();
1726 
1732  virtual UClassID getDynamicClassID() const;
1733 
1734 private:
1735  // Constructors and other object boilerplate are private.
1736  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1737  RegexMatcher(); // default constructor not implemented
1738  RegexMatcher(const RegexPattern *pat);
1739  RegexMatcher(const RegexMatcher &other);
1740  RegexMatcher &operator =(const RegexMatcher &rhs);
1741  void init(UErrorCode &status); // Common initialization
1742  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1743 
1744  friend class RegexPattern;
1745  friend class RegexCImpl;
// resetPreserveRegion() is public even though it sits between two private
// sections.
1746 public:
1748  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1749 private:
1750 
1751  //
1752  // MatchAt: this is the internal interface to the match engine itself.
1753  // Match status comes back in matcher member variables.
1754  //
1755  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1756  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1757  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1758  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1759  REStackFrame *resetStack();
1760  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1761  void IncrementTime(UErrorCode &status);
1762  UBool ReportFindProgress(int64_t matchIndex, UErrorCode &status);
1763 
1764  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1765 
// "Chunk" variants of find / MatchAt / isWordBoundary. They take int32_t
// positions where the general versions above take int64_t.
// NOTE(review): presumably a fast path for input held in one contiguous
// UText chunk - confirm against the implementation.
1766  UBool findUsingChunk();
1767  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1768  UBool isChunkWordBoundary(int32_t pos);
1769 
// Private matcher state: the pattern and input being matched, region and
// bounds bookkeeping, results of the most recent match, the backtracking
// stack, resource limits, callbacks, and deferred error status.
1770  const RegexPattern *fPattern;
1771  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1772  // should delete it when through.
1773 
1774  const UnicodeString *fInput; // The string being matched. Only used for input()
1775  UText *fInputText; // The text being matched. Is never NULL.
1776  UText *fAltInputText; // A shallow copy of the text being matched.
1777  // Only created if the pattern contains backreferences.
1778  int64_t fInputLength; // Full length of the input text.
1779  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1780 
1781  int64_t fRegionStart; // Start of the input region, default = 0.
1782  int64_t fRegionLimit; // End of input region, default to input.length.
1783 
1784  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1785  int64_t fAnchorLimit; // See useAnchoringBounds
1786 
1787  int64_t fLookStart; // Region bounds for look-ahead/behind and
1788  int64_t fLookLimit; // other boundary tests. See
1789  // useTransparentBounds
1790 
1791  int64_t fActiveStart; // Currently active bounds for matching.
1792  int64_t fActiveLimit; // Usually is the same as region, but
1793  // is changed to fLookStart/Limit when
1794  // entering look around regions.
1795 
1796  UBool fTransparentBounds; // True if using transparent bounds.
1797  UBool fAnchoringBounds; // True if using anchoring bounds.
1798 
1799  UBool fMatch; // True if the last attempted match was successful.
1800  int64_t fMatchStart; // Position of the start of the most recent match
1801  int64_t fMatchEnd; // First position after the end of the most recent match
1802  // Zero if no previous match, even when a region
1803  // is active.
1804  int64_t fLastMatchEnd; // First position after the end of the previous match,
1805  // or -1 if there was no previous match.
1806  int64_t fAppendPosition; // First position after the end of the previous
1807  // appendReplacement(). As described by the
1808  // JavaDoc for Java Matcher, where it is called
1809  // "append position"
1810  UBool fHitEnd; // True if the last match touched the end of input.
1811  UBool fRequireEnd; // True if the last match required end-of-input
1812  // (matched $ or Z)
1813 
1814  UVector64 *fStack;
1815  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1816  // which will contain the capture group results.
1817  // NOT valid while match engine is running.
1818 
1819  int64_t *fData; // Data area for use by the compiled pattern.
1820  int64_t fSmallData[8]; // Use this for data if it's enough.
1821 
1822  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1823  // match engine run. Zero for unlimited.
1824 
1825  int32_t fTime; // Match time, accumulates while matching.
1826  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1827  // Kept separately from fTime to keep as much
1828  // code as possible out of the inline
1829  // StateSave function.
1830 
1831  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1832  // stack, in bytes. Zero for unlimited.
1833 
1834  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback function.
1835  // NULL if there is no callback.
1836  const void *fCallbackContext; // User Context ptr for callback function.
1837 
1838  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1839  // NULL if there is no callback.
1840  const void *fFindProgressCallbackContext; // User Context ptr for callback function.
1841 
1842 
1843  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1844 
1845  UBool fTraceDebug; // Set true for debug tracing of match engine.
1846 
1847  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1848  // reported, or that permanently disables this matcher.
1849 
1850  RuleBasedBreakIterator *fWordBreakItr;
1851 };
1852 
1854 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1855 #endif