UBool RegexCompile::doParseActions()

in extra/icu/icu-release-73-1/source/i18n/regexcmp.cpp [346:1869]


UBool RegexCompile::doParseActions(int32_t action)
{
    UBool   returnVal = true;

    switch ((Regex_PatternParseAction)action) {

    case doPatStart:
        // Start of pattern compiles to:
        //0   SAVE   2        Fall back to position of FAIL
        //1   jmp    3
        //2   FAIL            Stop if we ever reach here.
        //3   NOP             Dummy, so start of pattern looks the same as
        //                    the start of an ( grouping.
        //4   NOP             Resreved, will be replaced by a save if there are
        //                    OR | operators at the top level
        appendOp(URX_STATE_SAVE, 2);
        appendOp(URX_JMP,  3);
        appendOp(URX_FAIL, 0);

        // Standard open nonCapture paren action emits the two NOPs and
        //   sets up the paren stack frame.
        doParseActions(doOpenNonCaptureParen);
        break;

    case doPatFinish:
        // We've scanned to the end of the pattern
        //  The end of pattern compiles to:
        //        URX_END
        //    which will stop the runtime match engine.
        //  Encountering end of pattern also behaves like a close paren,
        //   and forces fixups of the State Save at the beginning of the compiled pattern
        //   and of any OR operations at the top level.
        //
        handleCloseParen();
        if (fParenStack.size() > 0) {
            // Missing close paren in pattern.
            error(U_REGEX_MISMATCHED_PAREN);
        }

        // add the END operation to the compiled pattern.
        appendOp(URX_END, 0);

        // Terminate the pattern compilation state machine.
        returnVal = false;
        break;



    case doOrOperator:
        // Scanning a '|', as in (A|B)
        {
            // Generate code for any pending literals preceding the '|'
            fixLiterals(false);

            // Insert a SAVE operation at the start of the pattern section preceding
            //   this OR at this level.  This SAVE will branch the match forward
            //   to the right hand side of the OR in the event that the left hand
            //   side fails to match and backtracks.  Locate the position for the
            //   save from the location on the top of the parentheses stack.
            int32_t savePosition = fParenStack.popi();
            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
            fRXPat->fCompiledPat->setElementAt(op, savePosition);

            // Append an JMP operation into the compiled pattern.  The operand for
            //  the JMP will eventually be the location following the ')' for the
            //  group.  This will be patched in later, when the ')' is encountered.
            appendOp(URX_JMP, 0);

            // Push the position of the newly added JMP op onto the parentheses stack.
            // This registers if for fixup when this block's close paren is encountered.
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);

            // Append a NOP to the compiled pattern.  This is the slot reserved
            //   for a SAVE in the event that there is yet another '|' following
            //   this one.
            appendOp(URX_NOP, 0);
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
        }
        break;


    case doBeginNamedCapture:
        // Scanning (?<letter.
        //   The first letter of the name will come through again under doConinueNamedCapture.
        fCaptureName = new UnicodeString();
        if (fCaptureName == nullptr) {
            error(U_MEMORY_ALLOCATION_ERROR);
        }
        break;

    case  doContinueNamedCapture:
        fCaptureName->append(fC.fChar);
        break;

    case doBadNamedCapture:
        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
        break;
        
    case doOpenCaptureParen:
        // Open Capturing Paren, possibly named.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        //
        //    Each capture group gets three slots in the save stack frame:
        //         0: Capture Group start position (in input string being matched.)
        //         1: Capture Group end position.
        //         2: Start of Match-in-progress.
        //    The first two locations are for a completed capture group, and are
        //     referred to by back references and the like.
        //    The third location stores the capture start position when an START_CAPTURE is
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
        //      END_CAPTURE is encountered.
        {
            fixLiterals();
            appendOp(URX_NOP, 0);
            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
            appendOp(URX_START_CAPTURE, varsLoc);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(capturing, *fStatus);                        // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc

            // Save the mapping from group number to stack frame variable position.
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);

            // If this is a named capture group, add the name->group number mapping.
            if (fCaptureName != nullptr) {
                if (!fRXPat->initNamedCaptureMap()) {
                    if (U_SUCCESS(*fStatus)) {
                        error(fRXPat->fDeferredStatus);
                    }
                    break;
                }
                int32_t groupNumber = fRXPat->fGroupMap->size();
                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
                }
            }
        }
        break;

    case doOpenNonCaptureParen:
        // Open non-caputuring (grouping only) Paren.
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fixLiterals();
            appendOp(URX_NOP, 0);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the two NOPs.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
        }
         break;


    case doOpenAtomicParen:
        // Open Atomic Paren.  (?>
        //   Compile to a
        //      - NOP, which later may be replaced if the parenthesized group
        //         has a quantifier, followed by
        //      - STO_SP  save state stack position, so it can be restored at the ")"
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fixLiterals();
            appendOp(URX_NOP, 0);
            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
            appendOp(URX_STO_SP, varLoc);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the two NOPs.  Depending on what follows in the pattern, the
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
            //   address of the end of the parenthesized group.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(atomic, *fStatus);                           // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
        }
        break;


    case doOpenLookAhead:
        // Positive Look-ahead   (?=  stuff  )
        //
        //   Note:   Addition of transparent input regions, with the need to
        //           restore the original regions when failing out of a lookahead
        //           block, complicated this sequence.  Some combined opcodes
        //           might make sense - or might not, lookahead aren't that common.
        //
        //      Caution:  min match length optimization knows about this
        //               sequence; don't change without making updates there too.
        //
        // Compiles to
        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
        //    3    JMP          6           continue ...
        //
        //    4.   LA_END                   Look Ahead failed.  Restore regions.
        //    5.   BACKTRACK                and back track again.
        //
        //    6.   NOP              reserved for use by quantifiers on the block.
        //                          Look-ahead can't have quantifiers, but paren stack
        //                             compile time conventions require the slot anyhow.
        //    7.   NOP              may be replaced if there is are '|' ops in the block.
        //    8.     code for parenthesized stuff.
        //    9.   LA_END
        //
        //  Four data slots are reserved, for saving state on entry to the look-around
        //    0:   stack pointer on entry.
        //    1:   input position on entry.
        //    2:   fActiveStart, the active bounds start on entry.
        //    3:   fActiveLimit, the active bounds limit on entry.
        {
            fixLiterals();
            int32_t dataLoc = allocateData(4);
            appendOp(URX_LA_START, dataLoc);
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
            appendOp(URX_LA_END, dataLoc);
            appendOp(URX_BACKTRACK, 0);
            appendOp(URX_NOP, 0);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the NOPs.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
        }
        break;

    case doOpenLookAheadNeg:
        // Negated Lookahead.   (?! stuff )
        // Compiles to
        //    1.    LA_START    dataloc
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
        //                                //   which continues with the match.
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
        //    4.       code for parenthesized stuff.
        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
        //    7.    END_LA                // Restore match region, in case look-ahead was using
        //                                        an alternate (transparent) region.
        //  Four data slots are reserved, for saving state on entry to the look-around
        //    0:   stack pointer on entry.
        //    1:   input position on entry.
        //    2:   fActiveStart, the active bounds start on entry.
        //    3:   fActiveLimit, the active bounds limit on entry.
        {
            fixLiterals();
            int32_t dataLoc = allocateData(4);
            appendOp(URX_LA_START, dataLoc);
            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the StateSave and NOP.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location

            // Instructions #5 - #7 will be added when the ')' is encountered.
        }
        break;

    case doOpenLookBehind:
        {
            //   Compile a (?<= look-behind open paren.
            //
            //          Compiles to
            //              0       URX_LB_START     dataLoc
            //              1       URX_LB_CONT      dataLoc
            //              2                        MinMatchLen
            //              3                        MaxMatchLen
            //              4       URX_NOP          Standard '(' boilerplate.
            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
            //              6         <code for LookBehind expression>
            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
            //
            //          Allocate a block of matcher data, to contain (when running a match)
            //              0:    Stack ptr on entry
            //              1:    Input Index on entry
            //              2:    fActiveStart, the active bounds start on entry.
            //              3:    fActiveLimit, the active bounds limit on entry.
            //              4:    Start index of match current match attempt.
            //          The first four items must match the layout of data for LA_START / LA_END

            // Generate match code for any pending literals.
            fixLiterals();

            // Allocate data space
            int32_t dataLoc = allocateData(5);

            // Emit URX_LB_START
            appendOp(URX_LB_START, dataLoc);

            // Emit URX_LB_CONT
            appendOp(URX_LB_CONT, dataLoc);
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.

            // Emit the NOPs
            appendOp(URX_NOP, 0);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the URX_LB_CONT and the NOP.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(lookBehind, *fStatus);                       // Frame type
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location

            // The final two instructions will be added when the ')' is encountered.
        }

        break;

    case doOpenLookBehindNeg:
        {
            //   Compile a (?<! negated look-behind open paren.
            //
            //          Compiles to
            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
            //              2                        MinMatchLen
            //              3                        MaxMatchLen
            //              4                        continueLoc (9)
            //              5       URX_NOP          Standard '(' boilerplate.
            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
            //              7         <code for LookBehind expression>
            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
            //              9       ...
            //
            //          Allocate a block of matcher data, to contain (when running a match)
            //              0:    Stack ptr on entry
            //              1:    Input Index on entry
            //              2:    fActiveStart, the active bounds start on entry.
            //              3:    fActiveLimit, the active bounds limit on entry.
            //              4:    Start index of match current match attempt.
            //          The first four items must match the layout of data for LA_START / LA_END

            // Generate match code for any pending literals.
            fixLiterals();

            // Allocate data space
            int32_t dataLoc = allocateData(5);

            // Emit URX_LB_START
            appendOp(URX_LB_START, dataLoc);

            // Emit URX_LBN_CONT
            appendOp(URX_LBN_CONT, dataLoc);
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.

            // Emit the NOPs
            appendOp(URX_NOP, 0);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the URX_LB_CONT and the NOP.
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location

            // The final two instructions will be added when the ')' is encountered.
        }
        break;

    case doConditionalExpr:
        // Conditionals such as (?(1)a:b)
    case doPerlInline:
        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
        error(U_REGEX_UNIMPLEMENTED);
        break;


    case doCloseParen:
        handleCloseParen();
        if (fParenStack.size() <= 0) {
            //  Extra close paren, or missing open paren.
            error(U_REGEX_MISMATCHED_PAREN);
        }
        break;

    case doNOP:
        break;


    case doBadOpenParenType:
    case doRuleError:
        error(U_REGEX_RULE_SYNTAX);
        break;


    case doMismatchedParenErr:
        error(U_REGEX_MISMATCHED_PAREN);
        break;

    case doPlus:
        //  Normal '+'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   jmp-sav 1
        //     3.   ...
        //
        //  Or, if the item to be repeated can match a zero length string,
        //     1.   STO_INP_LOC  data-loc
        //     2.      body of stuff to be repeated
        //     3.   JMP_SAV_X    2
        //     4.   ...

        //
        //  Or, if the item to be repeated is simple
        //     1.   Item to be repeated.
        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
        //     3.   LOOP_C       stack location
        {
            int32_t  topLoc = blockTopLoc(false);        // location of item #1
            int32_t  frameLoc;

            // Check for simple constructs, which may get special optimized code.
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);

                if (URX_TYPE(repeatedOp) == URX_SETREF) {
                    // Emit optimized code for [char set]+
                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
                    frameLoc = allocateStackData(1);
                    appendOp(URX_LOOP_C, frameLoc);
                    break;
                }

                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
                    // Emit Optimized code for .+ operations.
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
                        loopOpI |= 1;
                    }
                    if (fModeFlags & UREGEX_UNIX_LINES) {
                        loopOpI |= 2;
                    }
                    appendOp(loopOpI);
                    frameLoc = allocateStackData(1);
                    appendOp(URX_LOOP_C, frameLoc);
                    break;
                }

            }

            // General case.

            // Check for minimum match length of zero, which requires
            //    extra loop-breaking code.
            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
                // Zero length match is possible.
                // Emit the code sequence that can handle it.
                insertOp(topLoc);
                frameLoc = allocateStackData(1);

                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
                fRXPat->fCompiledPat->setElementAt(op, topLoc);

                appendOp(URX_JMP_SAV_X, topLoc+1);
            } else {
                // Simpler code when the repeated body must match something non-empty
                appendOp(URX_JMP_SAV, topLoc);
            }
        }
        break;

    case doNGPlus:
        //  Non-greedy '+?'  compiles to
        //     1.   stuff to be repeated  (already built)
        //     2.   state-save  1
        //     3.   ...
        {
            int32_t topLoc      = blockTopLoc(false);
            appendOp(URX_STATE_SAVE, topLoc);
        }
        break;


    case doOpt:
        // Normal (greedy) ? quantifier.
        //  Compiles to
        //     1. state save 3
        //     2.    body of optional block
        //     3. ...
        // Insert the state save into the compiled pattern, and we're done.
        {
            int32_t   saveStateLoc = blockTopLoc(true);
            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
        }
        break;

    case doNGOpt:
        // Non-greedy ?? quantifier
        //   compiles to
        //    1.  jmp   4
        //    2.     body of optional block
        //    3   jmp   5
        //    4.  state save 2
        //    5    ...
        //  This code is less than ideal, with two jmps instead of one, because we can only
        //  insert one instruction at the top of the block being iterated.
        {
            int32_t  jmp1_loc = blockTopLoc(true);
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();

            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);

            appendOp(URX_JMP, jmp2_loc+2);

            appendOp(URX_STATE_SAVE, jmp1_loc+1);
        }
        break;


    case doStar:
        // Normal (greedy) * quantifier.
        // Compiles to
        //       1.   STATE_SAVE   4
        //       2.      body of stuff being iterated over
        //       3.   JMP_SAV      2
        //       4.   ...
        //
        // Or, if the body is a simple [Set],
        //       1.   LOOP_SR_I    set number
        //       2.   LOOP_C       stack location
        //       ...
        //
        // Or if this is a .*
        //       1.   LOOP_DOT_I    (. matches all mode flag)
        //       2.   LOOP_C        stack location
        //
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
        //       1.   STATE_SAVE   5
        //       2.   STO_INP_LOC  data-loc
        //       3.      body of stuff
        //       4.   JMP_SAV_X    2
        //       5.   ...
        {
            // location of item #1, the STATE_SAVE
            int32_t   topLoc = blockTopLoc(false);
            int32_t   dataLoc = -1;

            // Check for simple *, where the construct being repeated
            //   compiled to single opcode, and might be optimizable.
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);

                if (URX_TYPE(repeatedOp) == URX_SETREF) {
                    // Emit optimized code for a [char set]*
                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
                    dataLoc = allocateStackData(1);
                    appendOp(URX_LOOP_C, dataLoc);
                    break;
                }

                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
                    // Emit Optimized code for .* operations.
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
                        loopOpI |= 1;
                    }
                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
                        loopOpI |= 2;
                    }
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
                    dataLoc = allocateStackData(1);
                    appendOp(URX_LOOP_C, dataLoc);
                    break;
                }
            }

            // Emit general case code for this *
            // The optimizations did not apply.

            int32_t   saveStateLoc = blockTopLoc(true);
            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);

            // Check for minimum match length of zero, which requires
            //    extra loop-breaking code.
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
                insertOp(saveStateLoc);
                dataLoc = allocateStackData(1);

                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
            }

            // Locate the position in the compiled pattern where the match will continue
            //   after completing the *.   (4 or 5 in the comment above)
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;

            // Put together the save state op and store it into the compiled code.
            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);

            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
            appendOp(jmpOp);
        }
        break;

    case doNGStar:
        // Non-greedy *? quantifier
        // compiles to
        //     1.   JMP    3
        //     2.      body of stuff being iterated over
        //     3.   STATE_SAVE  2
        //     4    ...
        {
            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
            appendOp(URX_STATE_SAVE, jmpLoc+1);
        }
        break;


    case doIntervalInit:
        // The '{' opening an interval quantifier was just scanned.
        // Init the counter variables that will accumulate the values as the digits
        //    are scanned.
        fIntervalLow = 0;
        fIntervalUpper = -1;
        break;

    case doIntevalLowerDigit:
        // Scanned a digit from the lower value of an {lower,upper} interval
        {
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            int64_t val = (int64_t)fIntervalLow*10 + digitValue;
            if (val > INT32_MAX) {
                error(U_REGEX_NUMBER_TOO_BIG);
            } else {
                fIntervalLow = (int32_t)val;
            }
        }
        break;

    case doIntervalUpperDigit:
        // Scanned a digit from the upper value of an {lower,upper} interval
        {
            if (fIntervalUpper < 0) {
                fIntervalUpper = 0;
            }
            int32_t digitValue = u_charDigitValue(fC.fChar);
            U_ASSERT(digitValue >= 0);
            int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
            if (val > INT32_MAX) {
                error(U_REGEX_NUMBER_TOO_BIG);
            } else {
                fIntervalUpper = (int32_t)val;
            }
        }
        break;

    case doIntervalSame:
        // Scanned a single value interval like {27}.  Upper = Lower.
        fIntervalUpper = fIntervalLow;
        break;

    case doInterval:
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
        if (compileInlineInterval() == false) {
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
        }
        break;

    case doPossessiveInterval:
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
        {
            // Remember the loc for the top of the block being looped over.
            //   (Can not reserve a slot in the compiled pattern at this time, because
            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
            //    once per block.)
            int32_t topLoc = blockTopLoc(false);

            // Produce normal looping code.
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);

            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
            //  just as if the loop was inclosed in atomic parentheses.

            // First the STO_SP before the start of the loop
            insertOp(topLoc);

            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
            int32_t  op     = buildOp(URX_STO_SP, varLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
            loopOp++;     // point LoopOp after the just-inserted STO_SP
            fRXPat->fCompiledPat->push(loopOp, *fStatus);

            // Then the LD_SP after the end of the loop
            appendOp(URX_LD_SP, varLoc);
        }

        break;

    case doNGInterval:
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
        break;

    case doIntervalError:
        error(U_REGEX_BAD_INTERVAL);
        break;

    case doLiteralChar:
        // We've just scanned a "normal" character from the pattern,
        literalChar(fC.fChar);
        break;


    case doEscapedLiteralChar:
        // We've just scanned an backslashed escaped character with  no
        //   special meaning.  It represents itself.
        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
             }
        literalChar(fC.fChar);
        break;


    case doDotAny:
        // scanned a ".",  match any single character.
        {
            fixLiterals(false);
            if (fModeFlags & UREGEX_DOTALL) {
                appendOp(URX_DOTANY_ALL, 0);
            } else if (fModeFlags & UREGEX_UNIX_LINES) {
                appendOp(URX_DOTANY_UNIX, 0);
            } else {
                appendOp(URX_DOTANY, 0);
            }
        }
        break;

    case doCaret:
        {
            fixLiterals(false);
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
                appendOp(URX_CARET, 0);
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
                appendOp(URX_CARET_M, 0);
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
                appendOp(URX_CARET, 0);   // Only testing true start of input.
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
                appendOp(URX_CARET_M_UNIX, 0);
            }
        }
        break;

    case doDollar:
        {
            fixLiterals(false);
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
                appendOp(URX_DOLLAR, 0);
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
                appendOp(URX_DOLLAR_M, 0);
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
                appendOp(URX_DOLLAR_D, 0);
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
                appendOp(URX_DOLLAR_MD, 0);
            }
        }
        break;

    case doBackslashA:
        fixLiterals(false);
        appendOp(URX_CARET, 0);
        break;

    case doBackslashB:
        {
            #if  UCONFIG_NO_BREAK_ITERATION==1
            if (fModeFlags & UREGEX_UWORD) {
                error(U_UNSUPPORTED_ERROR);
            }
            #endif
            fixLiterals(false);
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
            appendOp(op, 1);
        }
        break;

    case doBackslashb:
        {
            #if  UCONFIG_NO_BREAK_ITERATION==1
            if (fModeFlags & UREGEX_UWORD) {
                error(U_UNSUPPORTED_ERROR);
            }
            #endif
            fixLiterals(false);
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
            appendOp(op, 0);
        }
        break;

    case doBackslashD:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_D, 1);
        break;

    case doBackslashd:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_D, 0);
        break;

    case doBackslashG:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_G, 0);
        break;

    case doBackslashH:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_H, 1);
        break;

    case doBackslashh:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_H, 0);
        break;

    case doBackslashR:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_R, 0);
        break;

    case doBackslashS:
        fixLiterals(false);
        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
        break;

    case doBackslashs:
        fixLiterals(false);
        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
        break;

    case doBackslashV:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_V, 1);
        break;

    case doBackslashv:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_V, 0);
        break;

    case doBackslashW:
        fixLiterals(false);
        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
        break;

    case doBackslashw:
        fixLiterals(false);
        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
        break;

    case doBackslashX:
        #if  UCONFIG_NO_BREAK_ITERATION==1
        // Grapheme Cluster Boundary requires ICU break iteration.
        error(U_UNSUPPORTED_ERROR);
        #endif
        fixLiterals(false);
        appendOp(URX_BACKSLASH_X, 0);
        break;

    case doBackslashZ:
        fixLiterals(false);
        appendOp(URX_DOLLAR, 0);
        break;

    case doBackslashz:
        fixLiterals(false);
        appendOp(URX_BACKSLASH_Z, 0);
        break;

    case doEscapeError:
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
        break;

    case doExit:
        fixLiterals(false);
        returnVal = false;
        break;

    case doProperty:
        {
            fixLiterals(false);
            UnicodeSet *theSet = scanProp();
            compileSet(theSet);
        }
        break;

    case doNamedChar:
        {
            UChar32 c = scanNamedChar();
            literalChar(c);
        }
        break;


    case doBackRef:
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
        //                 the regular expression, because the number of digits to be consumed
        //                 depends on the number of capture groups that have been defined.  So
        //                 we have to do it here instead.
        {
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
            int32_t  groupNum = 0;
            UChar32  c        = fC.fChar;

            for (;;) {
                // Loop once per digit, for max allowed number of digits in a back reference.
                int32_t digit = u_charDigitValue(c);
                groupNum = groupNum * 10 + digit;
                if (groupNum >= numCaptureGroups) {
                    break;
                }
                c = peekCharLL();
                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
                    break;
                }
                nextCharLL();
            }

            // Scan of the back reference in the source regexp is complete.  Now generate
            //  the compiled code for it.
            // Because capture groups can be forward-referenced by back-references,
            //  we fill the operand with the capture group number.  At the end
            //  of compilation, it will be changed to the variable's location.
            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
                                     //    and shouldn't enter this code path at all.
            fixLiterals(false);
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
                appendOp(URX_BACKREF_I, groupNum);
            } else {
                appendOp(URX_BACKREF, groupNum);
            }
        }
        break;

    case doBeginNamedBackRef:
        U_ASSERT(fCaptureName == nullptr);
        fCaptureName = new UnicodeString;
        if (fCaptureName == nullptr) {
            error(U_MEMORY_ALLOCATION_ERROR);
        }
        break;
            
    case doContinueNamedBackRef:
        fCaptureName->append(fC.fChar);
        break;

    case doCompleteNamedBackRef:
        {
        int32_t groupNumber =
            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
        if (groupNumber == 0) {
            // Group name has not been defined.
            //   Could be a forward reference. If we choose to support them at some
            //   future time, extra mechanism will be required at this point.
            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
        } else {
            // Given the number, handle identically to a \n numbered back reference.
            // See comments above, under doBackRef
            fixLiterals(false);
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
                appendOp(URX_BACKREF_I, groupNumber);
            } else {
                appendOp(URX_BACKREF, groupNumber);
            }
        }
        delete fCaptureName;
        fCaptureName = nullptr;
        break;
        }
       
    case doPossessivePlus:
        // Possessive ++ quantifier.
        // Compiles to
        //       1.   STO_SP
        //       2.      body of stuff being iterated over
        //       3.   STATE_SAVE 5
        //       4.   JMP        2
        //       5.   LD_SP
        //       6.   ...
        //
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
        //
        {
            // Emit the STO_SP
            int32_t   topLoc = blockTopLoc(true);
            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the STATE_SAVE
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);

            // Emit the JMP
            appendOp(URX_JMP, topLoc+1);

            // Emit the LD_SP
            appendOp(URX_LD_SP, stoLoc);
        }
        break;

    case doPossessiveStar:
        // Possessive *+ quantifier.
        // Compiles to
        //       1.   STO_SP       loc
        //       2.   STATE_SAVE   5
        //       3.      body of stuff being iterated over
        //       4.   JMP          2
        //       5.   LD_SP        loc
        //       6    ...
        // TODO:  do something to cut back the state stack each time through the loop.
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(true);
            insertOp(topLoc);

            // emit   STO_SP     loc
            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE   5
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
            op = buildOp(URX_STATE_SAVE, L7);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Append the JMP operation.
            appendOp(URX_JMP, topLoc+1);

            // Emit the LD_SP       loc
            appendOp(URX_LD_SP, stoLoc);
        }
        break;

    case doPossessiveOpt:
        // Possessive  ?+ quantifier.
        //  Compiles to
        //     1. STO_SP      loc
        //     2. SAVE_STATE  5
        //     3.    body of optional block
        //     4. LD_SP       loc
        //     5. ...
        //
        {
            // Reserve two slots at the top of the block.
            int32_t   topLoc = blockTopLoc(true);
            insertOp(topLoc);

            // Emit the STO_SP
            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc);

            // Emit the SAVE_STATE
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
            op = buildOp(URX_STATE_SAVE, continueLoc);
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);

            // Emit the LD_SP
            appendOp(URX_LD_SP, stoLoc);
        }
        break;


    case doBeginMatchMode:
        fNewModeFlags = fModeFlags;
        fSetModeFlag  = true;
        break;

    case doMatchMode:   //  (?i)    and similar
        {
            int32_t  bit = 0;
            switch (fC.fChar) {
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
            default:
                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
                                        // by the scanner.
            }
            if (fSetModeFlag) {
                fNewModeFlags |= bit;
            } else {
                fNewModeFlags &= ~bit;
            }
        }
        break;

    case doSetMatchMode:
        // Emit code to match any pending literals, using the not-yet changed match mode.
        fixLiterals();

        // We've got a (?i) or similar.  The match mode is being changed, but
        //   the change is not scoped to a parenthesized block.
        U_ASSERT(fNewModeFlags < 0);
        fModeFlags = fNewModeFlags;

        break;


    case doMatchModeParen:
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
        //   mode flags so they can be restored at the close of the block.
        //
        //   Compile to a
        //      - NOP, which later may be replaced by a save-state if the
        //         parenthesized group gets a * quantifier, followed by
        //      - NOP, which may later be replaced by a save-state if there
        //             is an '|' alternation within the parens.
        {
            fixLiterals(false);
            appendOp(URX_NOP, 0);
            appendOp(URX_NOP, 0);

            // On the Parentheses stack, start a new frame and add the positions
            //   of the two NOPs (a normal non-capturing () frame, except for the
            //   saving of the original mode flags.)
            fParenStack.push(fModeFlags, *fStatus);
            fParenStack.push(flags, *fStatus);                            // Frame Marker
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP

            // Set the current mode flags to the new values.
            U_ASSERT(fNewModeFlags < 0);
            fModeFlags = fNewModeFlags;
        }
        break;

    case doBadModeFlag:
        error(U_REGEX_INVALID_FLAG);
        break;

    case doSuppressComments:
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
        // treating a '#' as a to-the-end-of-line comment.
        //   (This Perl compatibility just gets uglier and uglier to do...)
        fEOLComments = false;
        break;


    case doSetAddAmp:
        {
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
          set->add(chAmp);
        }
        break;

    case doSetAddDash:
        {
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
          set->add(chDash);
        }
        break;

     case doSetBackslash_s:
        {
         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
         break;
        }

     case doSetBackslash_S:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet SSet;
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
            set->addAll(SSet);
            break;
        }

    case doSetBackslash_d:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            // TODO - make a static set, ticket 6058.
            addCategory(set, U_GC_ND_MASK, *fStatus);
            break;
        }

    case doSetBackslash_D:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet digits;
            // TODO - make a static set, ticket 6058.
            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
            digits.complement();
            set->addAll(digits);
            break;
        }

    case doSetBackslash_h:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet h;
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
            h.add((UChar32)9);   // Tab
            set->addAll(h);
            break;
        }

    case doSetBackslash_H:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet h;
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
            h.add((UChar32)9);   // Tab
            h.complement();
            set->addAll(h);
            break;
        }

    case doSetBackslash_v:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
            set->add((UChar32)0x85);
            set->add((UChar32)0x2028, (UChar32)0x2029);
            break;
        }

    case doSetBackslash_V:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet v;
            v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
            v.add((UChar32)0x85);
            v.add((UChar32)0x2028, (UChar32)0x2029);
            v.complement();
            set->addAll(v);
            break;
        }

    case doSetBackslash_w:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
            break;
        }

    case doSetBackslash_W:
        {
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
            UnicodeSet SSet;
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
            set->addAll(SSet);
            break;
        }

    case doSetBegin:
        {
            fixLiterals(false);
            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
            fSetStack.push(lpSet.orphan(), *fStatus);
            fSetOpStack.push(setStart, *fStatus);
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
                fSetOpStack.push(setCaseClose, *fStatus);
            }
            break;
        }

    case doSetBeginDifference1:
        //  We have scanned something like [[abc]-[
        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
        //  Push a Difference operator, which will cause the new set to be subtracted from what
        //    went before once it is created.
        setPushOp(setDifference1);
        fSetOpStack.push(setStart, *fStatus);
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
            fSetOpStack.push(setCaseClose, *fStatus);
        }
        break;

    case doSetBeginIntersection1:
        //  We have scanned something like  [[abc]&[
        //   Need both the '&' operator and the open '[' operator.
        setPushOp(setIntersection1);
        fSetOpStack.push(setStart, *fStatus);
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
            fSetOpStack.push(setCaseClose, *fStatus);
        }
        break;

    case doSetBeginUnion:
        //  We have scanned something like  [[abc][
        //     Need to handle the union operation explicitly [[abc] | [
        setPushOp(setUnion);
        fSetOpStack.push(setStart, *fStatus);
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
            fSetOpStack.push(setCaseClose, *fStatus);
        }
        break;

    case doSetDifference2:
        // We have scanned something like [abc--
        //   Consider this to unambiguously be a set difference operator.
        setPushOp(setDifference2);
        break;

    case doSetEnd:
        // Have encountered the ']' that closes a set.
        //    Force the evaluation of any pending operations within this set,
        //    leave the completed set on the top of the set stack.
        setEval(setEnd);
        U_ASSERT(fSetOpStack.peeki()==setStart);
        fSetOpStack.popi();
        break;

    case doSetFinish:
        {
        // Finished a complete set expression, including all nested sets.
        //   The close bracket has already triggered clearing out pending set operators,
        //    the operator stack should be empty and the operand stack should have just
        //    one entry, the result set.
        U_ASSERT(fSetOpStack.empty());
        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
        U_ASSERT(fSetStack.empty());
        compileSet(theSet);
        break;
        }

    case doSetIntersection2:
        // Have scanned something like [abc&&
        setPushOp(setIntersection2);
        break;

    case doSetLiteral:
        // Union the just-scanned literal character into the set being built.
        //    This operation is the highest precedence set operation, so we can always do
        //    it immediately, without waiting to see what follows.  It is necessary to perform
        //    any pending '-' or '&' operation first, because these have the same precedence
        //    as union-ing in a literal'
        {
            setEval(setUnion);
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
            s->add(fC.fChar);
            fLastSetLiteral = fC.fChar;
            break;
        }

    case doSetLiteralEscaped:
        // A back-slash escaped literal character was encountered.
        // Processing is the same as with setLiteral, above, with the addition of
        //  the optional check for errors on escaped ASCII letters.
        {
            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
            }
            setEval(setUnion);
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
            s->add(fC.fChar);
            fLastSetLiteral = fC.fChar;
            break;
        }

        case doSetNamedChar:
        // Scanning a \N{UNICODE CHARACTER NAME}
        //  Aside from the source of the character, the processing is identical to doSetLiteral,
        //    above.
        {
            UChar32  c = scanNamedChar();
            setEval(setUnion);
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
            s->add(c);
            fLastSetLiteral = c;
            break;
        }

    case doSetNamedRange:
        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
        // The left character is already in the set, and is saved in fLastSetLiteral.
        // The right side needs to be picked up, the scan is at the 'N'.
        // Lower Limit > Upper limit being an error matches both Java
        //        and ICU UnicodeSet behavior.
        {
            UChar32  c = scanNamedChar();
            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
                error(U_REGEX_INVALID_RANGE);
            }
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
            s->add(fLastSetLiteral, c);
            fLastSetLiteral = c;
            break;
        }


    case  doSetNegate:
        // Scanned a '^' at the start of a set.
        // Push the negation operator onto the set op stack.
        // A twist for case-insensitive matching:
        //   the case closure operation must happen _before_ negation.
        //   But the case closure operation will already be on the stack if it's required.
        //   This requires checking for case closure, and swapping the stack order
        //    if it is present.
        {
            int32_t  tosOp = fSetOpStack.peeki();
            if (tosOp == setCaseClose) {
                fSetOpStack.popi();
                fSetOpStack.push(setNegation, *fStatus);
                fSetOpStack.push(setCaseClose, *fStatus);
            } else {
                fSetOpStack.push(setNegation, *fStatus);
            }
        }
        break;

    case doSetNoCloseError:
        error(U_REGEX_MISSING_CLOSE_BRACKET);
        break;

    case doSetOpError:
        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
        break;

    case doSetPosixProp:
        {
            UnicodeSet *s = scanPosixProp();
            if (s != nullptr) {
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
                tos->addAll(*s);
                delete s;
            }  // else error.  scanProp() reported the error status already.
        }
        break;

    case doSetProp:
        //  Scanned a \p \P within [brackets].
        {
            UnicodeSet *s = scanProp();
            if (s != nullptr) {
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
                tos->addAll(*s);
                delete s;
            }  // else error.  scanProp() reported the error status already.
        }
        break;


    case doSetRange:
        // We have scanned literal-literal.  Add the range to the set.
        // The left character is already in the set, and is saved in fLastSetLiteral.
        // The right side is the current character.
        // Lower Limit > Upper limit being an error matches both Java
        //        and ICU UnicodeSet behavior.
        {

        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
            error(U_REGEX_INVALID_RANGE);
        }
        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
        s->add(fLastSetLiteral, fC.fChar);
        break;
        }

    default:
        UPRV_UNREACHABLE_EXIT;
    }

    if (U_FAILURE(*fStatus)) {
        returnVal = false;
    }

    return returnVal;
}