in npm/pkg/dataplane/policies/chain-management_linux.go [298:442]
// cleanupOtherIptables removes NPM's jump rules and chains from the iptables
// variant that is NOT the one currently selected in util.Iptables: legacy is
// cleaned when nft is selected, and nft is cleaned otherwise. The originally
// selected variant is restored (via defer) before the function returns.
//
// Steps:
//  1. delete the deprecated and the current jump rule from FORWARD to AZURE-NPM
//  2. list the Azure chains that currently exist
//  3. flush every chain in one restore call; if that fails, flush them one by one
//  4. destroy every chain
//
// An error is returned only when the chains cannot be listed, or when the
// AZURE-NPM chain could not be flushed and no jump rule was deleted. Every
// other failure is reported through metrics and treated as benign.
func (pMgr *PolicyManager) cleanupOtherIptables() error {
	usingLegacy := util.Iptables != util.IptablesNft
	if usingLegacy {
		klog.Info("detected legacy iptables. cleaning up nft iptables")
		util.SetIptablesToNft()
	} else {
		klog.Info("detected nft iptables. cleaning up legacy iptables")
		util.SetIptablesToLegacy()
	}

	// switch back to the variant that was selected on entry, whatever the outcome
	switchBack := func() {
		if usingLegacy {
			klog.Info("cleaned up nft tables")
			util.SetIptablesToLegacy()
			return
		}
		klog.Info("cleaned up legacy iptables")
		util.SetIptablesToNft()
	}
	defer switchBack()

	// chainError folds a new failure message into the error accumulated so far
	chainError := func(previous error, message string) error {
		if previous == nil {
			return npmerrors.SimpleError(message)
		}
		return npmerrors.SimpleErrorWrapper(message+" and had previous error", previous)
	}

	jumpRuleDeleted := false

	// 1.1. remove the deprecated jump to AZURE-NPM
	code, err := pMgr.ignoreErrorsAndRunIPTablesCommand(removeDeprecatedJumpIgnoredErrors, util.IptablesDeletionFlag, deprecatedJumpFromForwardToAzureChainArgs...)
	switch {
	case code == 0:
		jumpRuleDeleted = true
		klog.Infof("[cleanup] deleted deprecated jump rule from FORWARD chain to AZURE-NPM chain")
	case err != nil:
		metrics.SendErrorLogAndMetric(util.IptmID,
			"[cleanup] failed to delete deprecated jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s",
			code, err.Error())
	}

	// 1.2. remove the current jump to AZURE-NPM
	code, err = pMgr.ignoreErrorsAndRunIPTablesCommand(removeDeprecatedJumpIgnoredErrors, util.IptablesDeletionFlag, jumpFromForwardToAzureChainArgs...)
	switch {
	case code == 0:
		jumpRuleDeleted = true
		klog.Infof("[cleanup] deleted jump rule from FORWARD chain to AZURE-NPM chain")
	case err != nil:
		metrics.SendErrorLogAndMetric(util.IptmID,
			"[cleanup] failed to delete jump rule from FORWARD chain to AZURE-NPM chain for unexpected reason with exit code %d and error: %s",
			code, err.Error())
	}

	// 2. discover which Azure chains exist right now
	existing, err := ioutil.AllCurrentAzureChains(pMgr.ioShim.Exec, util.IptablesDefaultWaitTime)
	if err != nil {
		return npmerrors.SimpleErrorWrapper("[cleanup] failed to get current chains for bootup", err)
	}
	if len(existing) == 0 {
		klog.Info("no chains to cleanup")
		return nil
	}
	klog.Infof("[cleanup] %d chains to clean up", len(existing))

	// 3.1. attempt to flush everything in a single restore call.
	// AZURE-NPM goes first so that the order in which chains are destroyed
	// starts deterministically (unit tests rely on this); the remaining
	// chains follow in map iteration order.
	ordered := make([]string, 0, len(existing))
	_, hasAzureChain := existing[util.IptablesAzureChain]
	if hasAzureChain {
		ordered = append(ordered, util.IptablesAzureChain)
	}
	for name := range existing {
		if name != util.IptablesAzureChain {
			ordered = append(ordered, name)
		}
	}

	if restoreErr := restore(pMgr.creatorForCleanup(ordered)); restoreErr != nil {
		const flushAllFailedFormat = "[cleanup] failed to flush all chains with error: %s"
		klog.Infof(flushAllFailedFormat, restoreErr.Error())
		metrics.SendErrorLogAndMetric(util.IptmID, flushAllFailedFormat, restoreErr.Error())

		// 3.2. the bulk flush failed, so fall back to flushing chain by chain
		var flushFailures error
		if hasAzureChain {
			_, azureErr := pMgr.runIPTablesCommand(util.IptablesFlushFlag, util.IptablesAzureChain)
			flushFailures = azureErr
			if azureErr != nil && !jumpRuleDeleted {
				// fixes #3088
				// No jump rule was deleted and AZURE-NPM still holds its rules, so a jump into
				// AZURE-NPM may still exist and those rules could allow or drop packets.
				// Cleanup of the other iptables has failed and correct packet processing
				// cannot be guaranteed, so surface the error to force a crash and retry.
				return npmerrors.SimpleErrorWrapper("[cleanup] must crash and retry. failed to delete jump rule and flush AZURE-NPM chain with error", azureErr)
			}
		}

		for name := range existing {
			if name == util.IptablesAzureChain {
				// handled just above
				continue
			}

			flushCode, flushErr := pMgr.runIPTablesCommand(util.IptablesFlushFlag, name)
			if flushErr == nil || flushCode == doesNotExistErrorCode {
				continue
			}

			// NOTE: a chain that fails to flush or delete here is never cleaned up later.
			// This has been the behavior ever since NPM supported nft (marking the chain stale
			// used to be attempted, but would not have worked as expected).
			// There is currently no retry mechanism for flushing/deleting a chain of the other
			// iptables version, apart from the AZURE-NPM handling above.
			flushFailures = chainError(flushFailures, fmt.Sprintf("failed to flush chain %s with err [%v]", name, flushErr))
		}

		if flushFailures != nil {
			metrics.SendErrorLogAndMetric(util.IptmID,
				"[cleanup] benign failure to flush chains with error: %s",
				flushFailures.Error())
		}
	}

	// 4. destroy every chain, AZURE-NPM first when it exists
	var deleteFailures error
	for _, name := range ordered {
		destroyCode, destroyErr := pMgr.runIPTablesCommand(util.IptablesDestroyFlag, name)
		if destroyErr == nil || destroyCode == doesNotExistErrorCode {
			continue
		}

		// NOTE: same caveat as for flushing: a chain that cannot be deleted here is never
		// retried, because NPM has no retry mechanism for chains of the other iptables version.
		deleteFailures = chainError(deleteFailures, fmt.Sprintf("failed to delete chain %s with err [%v]", name, destroyErr))
	}

	if deleteFailures != nil {
		metrics.SendErrorLogAndMetric(util.IptmID,
			"[cleanup] benign failure to delete chains with error: %s",
			deleteFailures.Error())
	}

	return nil
}