func cleanup()

in x86/x86spec/cleanup.go [820:1139]
270 lines of code
120 McCabe index (conditional complexity)

// cleanup normalizes the parsed instruction list and returns the cleaned list
// followed by the entries of extraInsts.
//
// The work is done in several passes over the list:
//
//  1. Per-instruction cleanup: record the input position in inst.seq, drop
//     repeated "JZ rel..." entries, and normalize the syntax, the encoding
//     details (inst.args), the opcode string, the r/w actions (inst.action),
//     32-bit validity and the cpuid/tag information.
//  2. Sort with byOpcode and tag neighbouring entries that share the same
//     first splitOpcode result as operand16/operand32 when is16vs32pair
//     matches.
//  3. Tag pseudo-ops ("pseudo").
//  4. Tag REX / REX.W variants ("pseudo64", "pseudo", "ignoreREXW",
//     "operand16", "operand32").
//  5. Remove the temporary "XX " opcode prefix added to some XCHG forms in
//     pass 1.
//  6. Apply the manual fixup table and sort each instruction's tags.
//  7. Sort with bySeq and append extraInsts (filtered by mnemonic when
//     onlySomePages is set).
//
// The *instruction values are modified in place, and the returned slice reuses
// the backing array of insts until extraInsts are appended. Inconsistencies
// found along the way are reported on os.Stderr; they never abort the cleanup.
func cleanup(insts []*instruction) []*instruction {
	// haveOp collects the op part of every fully processed syntax. It is only
	// allocated, written and read when onlySomePages is set.
	var haveOp map[string]bool
	if onlySomePages {
		haveOp = map[string]bool{}
	}

	// Clean individual instruction encodings and opcode sequences.
	sawJZ := map[string]bool{}
	// Filter in place: out shares the backing array of insts.
	out := insts[:0]
	for seq, inst := range insts {
		// Remember the position in the input list (presumably what the bySeq
		// sort at the end of this function orders by).
		inst.seq = seq

		// There are two copies each of JZ rel16 and JZ rel32. Delete the second.
		if strings.HasPrefix(inst.syntax, "JZ rel") {
			if sawJZ[inst.syntax] {
				continue
			}
			sawJZ[inst.syntax] = true
		}
		// From here on the instruction is kept in the result, even if a later
		// check bails out of the loop body with continue.
		out = append(out, inst)

		// Intel CMPXCHG16B and CMPXCHG8B have surprise " m64" or " m128" at end of encoding.
		// The suffix is only stripped from the opcode when the syntax ends
		// with the same suffix.
		surprises := []string{
			" m64",
			" m128",
		}
		for _, s := range surprises {
			if strings.HasSuffix(inst.syntax, s) && strings.HasSuffix(inst.opcode, s) {
				inst.opcode = strings.TrimSuffix(inst.opcode, s)
			}
		}

		// Strip trailing '*' characters from the op (presumably footnote
		// markers in the manual) and rebuild the syntax from the pieces.
		op, args := splitSyntax(inst.syntax)
		op = strings.TrimRight(op, "*")
		inst.syntax = joinSyntax(op, args)

		// Check argument names in syntax against encoding details.
		// An entry in the encodings table overrides the parsed details.
		// NOTE(review): inst.args then aliases the table's slice, and the
		// normalization below writes into inst.args[i] — confirm that
		// mutating the table entry is acceptable.
		if enc, ok := encodings[inst.syntax]; ok {
			inst.args = enc
		}
		if len(args) == len(inst.args)+1 && args[len(args)-1] == "imm8" {
			// The syntax has a trailing imm8 with no encoding detail:
			// extend a copy of the details with an "imm8" entry.
			fixed := make([]string, len(args))
			copy(fixed, inst.args)
			fixed[len(args)-1] = "imm8"
			inst.args = fixed
		} else if len(args) == 0 && len(inst.args) == 1 && inst.args[0] == "NA" {
			// No syntax arguments and a lone "NA" detail: treat as no details.
			inst.args = []string{}
		} else if len(args) != len(inst.args) {
			// Any other length mismatch is reported, and the rest of the
			// per-instruction cleanup (including the haveOp bookkeeping) is
			// skipped for this instruction.
			fmt.Fprintf(os.Stderr, "p.%d: %s has %d args but %d encoding details:\n\t%s\n", inst.page, inst.syntax, len(args), len(inst.args), strings.Join(inst.args, "; "))
			// op and args are unchanged since the assignment above, so this
			// re-assigns the same joinSyntax(op, args) expression.
			inst.syntax = joinSyntax(op, args)
			continue
		}

		// action collects one r/w annotation per syntax argument.
		var action []string
		for i, arg := range args {
			arg = strings.TrimSpace(arg)
			arg = strings.TrimRight(arg, "*")
			// A "reg" or "reg/..." operand is rewritten to "r32" / "r32/..."
			// when the description satisfies containsAll for "upper bits",
			// "r64" and "zero".
			if (arg == "reg" || strings.HasPrefix(arg, "reg/")) && containsAll(inst.desc, "upper bits", "r64", "zero") {
				arg = "r32" + strings.TrimPrefix(arg, "reg")
			}

			// Normalize known spelling variants of the encoding detail:
			// doubled closing parenthesis, upper-case (R)/(W), missing space
			// in "(r,w)", "Imm8", the "imm8/26/32" typo, and the VSIB entry.
			enc := inst.args[i]
			enc = strings.TrimSpace(enc)
			switch {
			case strings.HasSuffix(enc, " (r))"):
				enc = strings.TrimSuffix(enc, ")")
			case strings.HasSuffix(enc, " (R)"):
				enc = strings.TrimSuffix(enc, " (R)") + " (r)"
			case strings.HasSuffix(enc, " (W)"):
				enc = strings.TrimSuffix(enc, " (W)") + " (w)"
			case strings.HasSuffix(enc, " (r,w)"):
				enc = strings.TrimSuffix(enc, " (r,w)") + " (r, w)"
			case enc == "Imm8":
				enc = "imm8"
			case enc == "imm8/26/32":
				enc = "imm8/16/32"
			case enc == "BaseReg (R): VSIB:base, VectorReg(R): VSIB:index":
				enc = "vsib (r)"
			}
			// The stored detail keeps its (normalized) r/w annotation; only
			// the local enc has the annotation stripped below.
			inst.args[i] = enc

			// Derive the r/w action for this argument. Without an explicit
			// annotation, immediates, "Offset", "iw" and the literal
			// arguments 0, 1 and 3 count as reads; otherwise the opAction
			// table is consulted, and "?" is recorded (with a diagnostic) if
			// nothing applies.
			switch {
			case strings.HasSuffix(enc, " (r)"):
				action = append(action, "r")
				enc = strings.TrimSuffix(enc, " (r)")
			case strings.HasSuffix(enc, " (w)"):
				action = append(action, "w")
				enc = strings.TrimSuffix(enc, " (w)")
			case strings.HasSuffix(enc, " (r, w)"):
				action = append(action, "rw")
				enc = strings.TrimSuffix(enc, " (r, w)")
			case strings.HasPrefix(enc, "imm"), enc == "Offset", enc == "iw", arg == "1", arg == "0", arg == "3":
				action = append(action, "r")
			case i < len(opAction[op]):
				action = append(action, opAction[op][i])
			default:
				fmt.Fprintf(os.Stderr, "p.%d: %s has encoding %s for %s but no r/w annotations\n", inst.page, inst.syntax, enc, arg)
				action = append(action, "?")
			}

			// Instruction-specific argument rewrites, followed by the
			// table-driven rewrite keyed by the {arg, enc} pair.
			if arg == "mem" && op == "LDDQU" {
				arg = "m128"
			}
			if arg == "reg" && op == "LAR" {
				arg = "r32"
			}
			if actual := encodeReplace[[2]string{arg, enc}]; actual != "" {
				arg = actual
			}

			// A plain register argument placed in the ModRM r/m field: tag
			// the instruction and rename rN to rmrN.
			if (arg == "r8" || arg == "r16" || arg == "r32" || arg == "r64") && enc == "ModRM:r/m" {
				addTag(inst, "modrm_regonly")
				arg = "rmr" + arg[1:]
			}
			// xmm2/ymm2 in the r/m field get the tag but keep their name.
			if (arg == "xmm2" || arg == "ymm2") && enc == "ModRM:r/m" {
				addTag(inst, "modrm_regonly")
			}

			// A plain memory argument placed in the ModRM r/m field.
			if (arg == "m8" || arg == "m16" || arg == "m32" || arg == "m64" || arg == "m128" || arg == "m256") && enc == "ModRM:r/m" {
				addTag(inst, "modrm_memonly")
			}

			// The CR8 moves are special-cased by syntax rather than by enc.
			if arg == "r64" && (inst.syntax == "MOV r64, CR8" || inst.syntax == "MOV CR8, r64") {
				arg = "rmr64"
				addTag(inst, "modrm_regonly")
			}
			// CR8 is looked up in encodeOK with an empty encoding.
			if arg == "CR8" {
				enc = ""
			}

			// Report {arg, enc} pairs missing from encodeOK. The second line
			// of the message prints the pair in the shape of a table entry.
			if !encodeOK[[2]string{arg, enc}] {
				fmt.Fprintf(os.Stderr, "p.%d: %s has invalid encoding %s for %s\n\t{%q, %q}: true,\n", inst.page, inst.syntax, enc, arg, arg, enc)
			}

			args[i] = arg

			// Intel SETcc and others are missing the /r.
			// But CALL rel16 and CALL rel32 have a bad encoding table so ignore the ModRM there.
			if strings.HasPrefix(enc, "ModRM") && !strings.Contains(inst.opcode, " /") && op != "CALL" {
				inst.opcode += " /r"
			}
			if strings.HasPrefix(enc, "ModRM:reg") && !strings.Contains(inst.opcode, "/r") {
				// The opcode is taken up with something else. Bug in table.
				fmt.Fprintf(os.Stderr, "p.%d: %s has invalid encoding %s: no reg field in %s\n", inst.page, inst.syntax, arg, inst.opcode)
			}
			// XBEGIN is missing cw cd.
			if enc == "Offset" && arg == "rel16" && !strings.Contains(inst.opcode, " cw") {
				inst.opcode += " cw"
			}
			if enc == "Offset" && arg == "rel32" && !strings.Contains(inst.opcode, " cd") {
				inst.opcode += " cd"
			}
			// Moffs arguments need a "cm" in the opcode.
			if enc == "Moffs" && !strings.Contains(inst.opcode, "cm") {
				inst.opcode += " cm"
			}

			// Reassigned on every iteration, so the value after the last
			// argument wins. With no arguments inst.action is left untouched.
			inst.action = strings.Join(action, ",")
		}

		// Rebuild the syntax from the rewritten argument names.
		inst.syntax = joinSyntax(op, args)

		// The Intel manual lists each XCHG form with arguments in both orders.
		// While this is technically correct, it confuses lots of the analysis.
		// Change half of them to start with a fake "XX" byte.
		// (The prefix is removed again near the end of this function.
		// args[0] is indexed unconditionally, so every XCHG entry is assumed
		// to have at least one argument.)
		if op == "XCHG" && !strings.HasPrefix(args[0], "r/") && !strings.HasSuffix(args[0], "op") {
			inst.opcode = "XX " + inst.opcode
		}

		// Intel manual is not great about disabling REX instructions on 32-bit systems.
		if strings.Contains(inst.opcode, "REX") && inst.valid32 == "V" {
			inst.valid32 = "N.E."
		}

		// For instructions still valid in 32-bit mode, derive the minimum
		// CPU from phrases in the compatibility text. The P6 case adds a tag
		// instead of setting inst.cpuid.
		if inst.valid32 == "V" {
			switch {
			case containsAll(inst.compat, "not supported", "earlier than the Intel486"):
				inst.cpuid = "486"
			case containsAll(inst.compat, "not supported", "earlier than the Pentium"),
				containsAll(inst.compat, "were introduced", "with the Pentium"):
				inst.cpuid = "Pentium"
			case containsAll(inst.compat, "were introduced", "in the Pentium II"):
				inst.cpuid = "PentiumII"
			case containsAll(inst.compat, "were introduced", "in the P6 family"),
				containsAll(inst.compat, "were introduced in P6 family"):
				addTag(inst, "P6")
			}
		}

		if onlySomePages {
			// This op shadows the outer op; it is re-derived from the
			// rebuilt syntax.
			op, _ := splitSyntax(inst.syntax)
			haveOp[op] = true
		}
	}

	insts = out
	sort.Sort(byOpcode(insts))

	// Detect operand size dependencies.
	// last is the entry the current instruction is compared against.
	var last *instruction
	for _, inst := range insts {
		if last != nil {
			f1, _ := splitOpcode(last.opcode)
			f2, _ := splitOpcode(inst.opcode)
			if f1 == f2 {
				// Conflict: cannot distinguish instructions based on fixed prefix.
				if is16vs32pair(last, inst) {
					addTag(last, "operand16")
					addTag(inst, "operand32")
					// last is deliberately or accidentally left unchanged
					// here, so the next instruction is compared against the
					// same entry. NOTE(review): the mirrored case below does
					// advance last — confirm the asymmetry is intended.
					continue
				}
				if is16vs32pair(inst, last) {
					addTag(last, "operand32")
					addTag(inst, "operand16")
					last = inst
					continue
				}
			}
		}
		last = inst
	}

	// Detect pseudo-ops, defined as opcode entries subsumed by more general ones.
	// seen maps an opcode string to the instruction currently chosen as its
	// representative.
	// NOTE(review): last is not reset here; it still holds whatever the
	// operand-size loop above left in it when the first canGenerate check
	// below runs.
	seen := map[string]*instruction{}
	for _, inst := range insts {
		if strings.HasPrefix(inst.opcode, "9B ") { // FWAIT prefix
			addTag(inst, "pseudo")
			continue
		}
		// Opcodes consisting solely of a single F0, F2 or F3 byte.
		if inst.opcode == "F0" || inst.opcode == "F2" || inst.opcode == "F3" {
			addTag(inst, "pseudo")
			continue
		}
		if strings.HasPrefix(inst.syntax, "REP ") || strings.HasPrefix(inst.syntax, "REPE ") || strings.HasPrefix(inst.syntax, "REPNE ") {
			addTag(inst, "pseudo")
			continue
		}
		if strings.HasPrefix(inst.syntax, "SAL ") { // SHL is canonical
			addTag(inst, "pseudo")
			continue
		}
		// Two entries with the same opcode: condLess and xchgLess decide
		// which of them is the pseudo-op.
		if old := seen[inst.opcode]; old != nil {
			if condLess(old.syntax, inst.syntax) {
				addTag(inst, "pseudo")
				continue
			}
			if xchgLess(inst.syntax, old.syntax) {
				// NOTE(review): this appends to old.tags directly instead of
				// calling addTag as everywhere else — confirm whether the
				// difference matters (e.g. duplicate tags).
				old.tags = append(old.tags, "pseudo")
				seen[inst.opcode] = inst
				continue
			}
		}

		seen[inst.opcode] = inst

		// canGenerate presumably reports that last's opcode pattern already
		// covers inst's opcode; such entries do not become the new last.
		if last != nil && canGenerate(last.opcode, inst.opcode) {
			addTag(inst, "pseudo")
			continue
		}
		last = inst
	}
	// Relate REX-prefixed entries to the entry with the same opcode minus
	// the prefix, as recorded in seen.
	for _, inst := range insts {
		if strings.Contains(inst.opcode, "REX ") {
			// Only the first "REX " occurrence is removed for the lookup.
			if old := seen[strings.Replace(inst.opcode, "REX ", "", 1)]; old != nil && old.syntax == inst.syntax {
				addTag(inst, "pseudo64")
				continue
			} else if old != nil && hasTag(old, "pseudo") {
				addTag(inst, "pseudo")
				continue
			}
		}
		if strings.Contains(inst.opcode, "REX.W ") {
			// All "REX.W " occurrences are removed for the lookup (n = -1),
			// unlike the "REX " case above.
			if old := seen[strings.Replace(inst.opcode, "REX.W ", "", -1)]; old != nil && old.syntax == inst.syntax {
				addTag(old, "ignoreREXW")
				addTag(inst, "pseudo")
				continue
			} else if old != nil && hasTag(old, "pseudo") {
				addTag(inst, "pseudo")
				continue
			} else if old != nil && !hasTag(old, "operand16") && !hasTag(old, "operand32") {
				// There is a 64-bit form of this instruction.
				// Mark this one as only valid in the non-64-bit operand modes.
				// (The tags are added to old, the entry without REX.W.)
				addTag(old, "operand16")
				addTag(old, "operand32")
				continue
			}
		}
	}

	// Undo XCHG hack above.
	// The entries that carried the fake prefix end up tagged "pseudo" and
	// lose any "pseudo64" tag.
	for _, inst := range insts {
		if strings.HasPrefix(inst.opcode, "XX ") {
			inst.opcode = strings.TrimPrefix(inst.opcode, "XX ")
			addTag(inst, "pseudo")
			removeTag(inst, "pseudo64")
		}
	}

	// Last ditch effort. Manual fixes.
	// Some things are too hard to infer.
	// fixup is keyed by the final {syntax, opcode} pair; every function
	// registered for the pair is applied in order, then the tags are sorted.
	for _, inst := range insts {
		for _, fix := range fixup[[2]string{inst.syntax, inst.opcode}] {
			fix(inst)
		}
		sort.Strings(inst.tags)
	}

	// Re-sort with bySeq (presumably back into input order via inst.seq).
	sort.Sort(bySeq(insts))

	// Append the extra instructions. In onlySomePages mode only those whose
	// op was recorded in haveOp during the first pass are added.
	if onlySomePages {
		for _, inst := range extraInsts {
			op, _ := splitSyntax(inst.syntax)
			if haveOp[op] {
				insts = append(insts, inst)
			}
		}
	} else {
		insts = append(insts, extraInsts...)
	}
	return insts
}