in zookeeper-server/src/main/java/org/apache/zookeeper/server/DataTree.java [861:1122]
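// Applies a single committed txn to the in-memory tree and reports the result.
// isSubTxn is true when this method is invoked recursively for the sub-ops of a
// multi txn; in that case the lastProcessedZxid and digest bookkeeping at the end
// is skipped (see the comment near the bottom of the method).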
public ProcessTxnResult processTxn(TxnHeader header, Record txn, boolean isSubTxn) {
ProcessTxnResult rc = new ProcessTxnResult();
try {
rc.clientId = header.getClientId();
rc.cxid = header.getCxid();
rc.zxid = header.getZxid();
rc.type = header.getType();
rc.err = 0;
rc.multiResult = null;
switch (header.getType()) {
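// A plain create records the owning session id as the ephemeralOwner for
// ephemeral nodes (0 for persistent ones); create2 additionally returns the
// resulting Stat to the caller.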
case OpCode.create:
CreateTxn createTxn = (CreateTxn) txn;
rc.path = createTxn.getPath();
createNode(
createTxn.getPath(),
createTxn.getData(),
createTxn.getAcl(),
createTxn.getEphemeral() ? header.getClientId() : 0,
createTxn.getParentCVersion(),
header.getZxid(),
header.getTime(),
null);
break;
case OpCode.create2:
CreateTxn create2Txn = (CreateTxn) txn;
rc.path = create2Txn.getPath();
Stat stat = new Stat();
createNode(
create2Txn.getPath(),
create2Txn.getData(),
create2Txn.getAcl(),
create2Txn.getEphemeral() ? header.getClientId() : 0,
create2Txn.getParentCVersion(),
header.getZxid(),
header.getTime(),
stat);
rc.stat = stat;
break;
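// TTL nodes have no owning session; the TTL value is encoded into the
// ephemeralOwner field via EphemeralType.TTL.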
case OpCode.createTTL:
CreateTTLTxn createTtlTxn = (CreateTTLTxn) txn;
rc.path = createTtlTxn.getPath();
stat = new Stat();
createNode(
createTtlTxn.getPath(),
createTtlTxn.getData(),
createTtlTxn.getAcl(),
EphemeralType.TTL.toEphemeralOwner(createTtlTxn.getTtl()),
createTtlTxn.getParentCVersion(),
header.getZxid(),
header.getTime(),
stat);
rc.stat = stat;
break;
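// Container nodes are marked with the reserved CONTAINER_EPHEMERAL_OWNER
// value instead of a real session id.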
case OpCode.createContainer:
CreateContainerTxn createContainerTxn = (CreateContainerTxn) txn;
rc.path = createContainerTxn.getPath();
stat = new Stat();
createNode(
createContainerTxn.getPath(),
createContainerTxn.getData(),
createContainerTxn.getAcl(),
EphemeralType.CONTAINER_EPHEMERAL_OWNER,
createContainerTxn.getParentCVersion(),
header.getZxid(),
header.getTime(),
stat);
rc.stat = stat;
break;
case OpCode.delete:
case OpCode.deleteContainer:
DeleteTxn deleteTxn = (DeleteTxn) txn;
rc.path = deleteTxn.getPath();
deleteNode(deleteTxn.getPath(), header.getZxid());
break;
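// reconfig txns are recorded as SetDataTxns (the new config is written as
// node data), so they share the setData handling below.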
case OpCode.reconfig:
case OpCode.setData:
SetDataTxn setDataTxn = (SetDataTxn) txn;
rc.path = setDataTxn.getPath();
rc.stat = setData(
setDataTxn.getPath(),
setDataTxn.getData(),
setDataTxn.getVersion(),
header.getZxid(),
header.getTime());
break;
case OpCode.setACL:
SetACLTxn setACLTxn = (SetACLTxn) txn;
rc.path = setACLTxn.getPath();
rc.stat = setACL(setACLTxn.getPath(), setACLTxn.getAcl(), setACLTxn.getVersion());
break;
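// If the txn carries a CloseSessionTxn body, pass both the ephemerals tracked
// locally for this session and the txn's explicit paths2Delete; otherwise
// (no txn body) fall back to the session-only variant of killSession.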
case OpCode.closeSession:
long sessionId = header.getClientId();
if (txn != null) {
killSession(sessionId, header.getZxid(),
ephemerals.remove(sessionId),
((CloseSessionTxn) txn).getPaths2Delete());
} else {
killSession(sessionId, header.getZxid());
}
break;
case OpCode.error:
ErrorTxn errTxn = (ErrorTxn) txn;
rc.err = errTxn.getErr();
break;
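// A check sub-op has no effect on the tree at replay time; only its path is
// echoed back. The actual version comparison presumably happened when the
// txn was prepared.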
case OpCode.check:
CheckVersionTxn checkTxn = (CheckVersionTxn) txn;
rc.path = checkTxn.getPath();
break;
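// A multi txn carries its sub-txns as serialized records; each one is
// deserialized and replayed below by calling processTxn recursively with
// isSubTxn = true.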
case OpCode.multi:
MultiTxn multiTxn = (MultiTxn) txn;
List<Txn> txns = multiTxn.getTxns();
rc.multiResult = new ArrayList<>();
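// First pass: if any sub-txn was recorded as an error txn, the multi as a
// whole failed.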
boolean failed = false;
for (Txn subtxn : txns) {
if (subtxn.getType() == OpCode.error) {
failed = true;
break;
}
}
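// Second pass: pick the concrete Record type for each sub-txn, deserialize
// it, and apply it.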
boolean post_failed = false;
for (Txn subtxn : txns) {
final Supplier<Record> supplier;
switch (subtxn.getType()) {
case OpCode.create:
case OpCode.create2:
supplier = CreateTxn::new;
break;
case OpCode.createTTL:
supplier = CreateTTLTxn::new;
break;
case OpCode.createContainer:
supplier = CreateContainerTxn::new;
break;
case OpCode.delete:
case OpCode.deleteContainer:
supplier = DeleteTxn::new;
break;
case OpCode.setData:
supplier = SetDataTxn::new;
break;
case OpCode.error:
supplier = ErrorTxn::new;
post_failed = true;
break;
case OpCode.check:
supplier = CheckVersionTxn::new;
break;
default:
throw new IOException("Invalid type of op: " + subtxn.getType());
}
final Record record;
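// When the multi failed, rewrite every non-error sub-txn into an error txn:
// sub-txns before the first recorded error get Code.OK, those after it get
// RUNTIMEINCONSISTENCY.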
if (failed && subtxn.getType() != OpCode.error) {
int ec = post_failed ? Code.RUNTIMEINCONSISTENCY.intValue() : Code.OK.intValue();
subtxn.setType(OpCode.error);
record = new ErrorTxn(ec);
} else {
record = RequestRecord.fromBytes(subtxn.getData()).readRecord(supplier);
}
assert !failed || (subtxn.getType() == OpCode.error);
TxnHeader subHdr = new TxnHeader(
header.getClientId(),
header.getCxid(),
header.getZxid(),
header.getTime(),
subtxn.getType());
ProcessTxnResult subRc = processTxn(subHdr, record, true);
rc.multiResult.add(subRc);
if (subRc.err != 0 && rc.err == 0) {
rc.err = subRc.err;
}
}
break;
}
} catch (KeeperException e) {
LOG.debug("Failed: {}:{}", header, txn, e);
rc.err = e.code().intValue();
} catch (IOException e) {
LOG.debug("Failed: {}:{}", header, txn, e);
}
/*
* Snapshots are taken lazily. When serializing a node, its data
* and children are copied inside a synchronized block on that node,
* which means a newly created node won't be in the snapshot, so
* we won't end up with a mismatched cversion and pzxid when replaying
* the createNode txn.
*
* But there is a tricky scenario: if the child is deleted due to a
* session close and re-created in a different global session after
* the parent has been serialized, then when the txns are replayed,
* the node belongs to a different session, so replaying the
* closeSession txn won't delete it anymore, and we'll get a NODEEXISTS
* error when replaying the createNode txn. In this case, we need to
* update the cversion and pzxid to the new values.
*
* Note, such failures on the DataTree should only be seen during
* restore.
*/
if (header.getType() == OpCode.create && rc.err == Code.NODEEXISTS.intValue()) {
LOG.debug("Adjusting parent cversion for Txn: {} path: {} err: {}", header.getType(), rc.path, rc.err);
int lastSlash = rc.path.lastIndexOf('/');
String parentName = rc.path.substring(0, lastSlash);
CreateTxn cTxn = (CreateTxn) txn;
try {
setCversionPzxid(parentName, cTxn.getParentCVersion(), header.getZxid());
} catch (NoNodeException e) {
LOG.error("Failed to set parent cversion for: {}", parentName, e);
rc.err = e.code().intValue();
}
} else if (rc.err != Code.OK.intValue()) {
LOG.debug("Ignoring processTxn failure hdr: {} : error: {}", header.getType(), rc.err);
}
/*
* Things we can only update after the whole txn has been applied to
* the data tree.
*
* If we updated lastProcessedZxid with the first sub txn of a multi
* while a snapshot is in progress, the zxid associated with the
* snapshot could cover only part of the multi op.
*
* When loading that snapshot, only the txns after the zxid associated
* with the snapshot file would be replayed, which could cause data
* inconsistency due to the missing sub txns.
*
* To avoid this, we only update lastProcessedZxid once the whole
* multi-op txn has been applied to the DataTree.
*/
if (!isSubTxn) {
/*
* A snapshot might be in progress while we are modifying the data
* tree. If we set lastProcessedZxid prior to making the corresponding
* change to the tree, then the zxid associated with the snapshot
* file will be ahead of its contents. Thus, while restoring from
* the snapshot, the restore method will not apply the transaction
* for the zxid associated with the snapshot file, since it assumes
* that transaction is already present in the snapshot.
*
* To avoid this, we first apply the transaction and then modify
* lastProcessedZxid. During restore, we correctly handle the
* case where the snapshot contains data ahead of the zxid associated
* with the file.
*/
if (rc.zxid > lastProcessedZxid) {
lastProcessedZxid = rc.zxid;
}
if (digestFromLoadedSnapshot != null) {
compareSnapshotDigests(rc.zxid);
} else {
// only start recording digest when we're not in fuzzy state
logZxidDigest(rc.zxid, getTreeDigest());
}
}
return rc;
}