func parsePage()

in arm/armspec/spec.go [175:403]
202 lines of code
79 McCabe index (conditional complexity)

// parsePage extracts the instruction described on page p.
//
// num is the page number; it is used only for diagnostics (it is passed to
// checkNoEncodings and printed in the "unexpected" warning). On success the
// results are the instruction headline and one Inst per encoding found on the
// page. Pages whose first remaining text is not an "Instruction Details"
// header, or whose first headline is not recognized by isSection, yield
// ("", nil).
//
// NOTE(review): match, findWords, readBitBox and checkNoEncodings are defined
// elsewhere; match(t, font, size, prefix) presumably tests the font name,
// font size and a string prefix of t.S — confirm against its definition.
func parsePage(num int, p pdf.Page) (name string, table []Inst) {
	content := p.Content()

	// Normalize the raw fragments before they are grouped into words.
	superscripts := []rune("⁰¹²³⁴⁵⁶⁷⁸⁹")
	var text []pdf.Text
	for _, t := range content.Text {
		if match(t, "Times-Roman", 7.2, "") {
			t.FontSize = 9
		}
		// A 6.72-point fragment starting with a digit is replaced by the
		// Unicode superscript form of that digit, resized to 9 and shifted in
		// Y so it lines up with the surrounding text. The length check keeps
		// the t.S[0] index from panicking on an empty fragment.
		if match(t, "Times-Roman", 6.72, "") && len(t.S) > 0 && '0' <= t.S[0] && t.S[0] <= '9' {
			t.S = string(superscripts[t.S[0]-'0'])
			t.FontSize = 9
			t.Y -= 2.28
		}
		if t.Font == "Gen_Arial" {
			continue
		}
		text = append(text, t)
	}

	text = findWords(text)

	// Treat the "Times" font as "Times-Roman" so later matches need only one name.
	for i, t := range text {
		if t.Font == "Times" {
			t.Font = "Times-Roman"
			text[i] = t
		}
	}

	if debugPage > 0 {
		for _, t := range text {
			fmt.Println(t)
		}
		for _, r := range content.Rect {
			fmt.Println(r)
		}
	}

	// Remove text we should ignore, filtering in place.
	out := text[:0]
	skip := false
	for _, t := range text {
		// skip page footer
		if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
			continue
		}
		// skip section header and body text
		if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
			skip = true
			continue
		}
		if skip && match(t, "Times-Roman", 9, "") {
			continue
		}
		skip = false
		out = append(out, t)
	}
	text = out

	// Page header must say Instruction Details.
	if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
		return "", nil
	}
	text = text[1:]

	// isSection reports how many leading elements of text[i:] form a
	// headline: 2 for a sectionRE match followed by another 10-point bold
	// fragment, 1 for a single fragment matching childRE, 0 otherwise.
	isSection := func(text []pdf.Text, i int) int {
		if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
			return 2
		}
		if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
			return 1
		}
		return 0
	}

	// Skip dummy headlines and sections: a headline is dropped only when
	// another headline follows it before any "Encoding" line.
	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
		i := d
		for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
			i++
		}
		if isSection(text, i) == 0 {
			break
		}
		text = text[i:]
	}

	// Next line is headline. Can wrap to multiple lines.
	d := isSection(text, 0)
	if d == 0 {
		// text may be empty here (a page holding only the header), so guard
		// the debug print against indexing an empty slice.
		if debugPage > 0 && len(text) > 0 {
			fmt.Printf("non-inst-headline: %v\n", text[0])
		}
		checkNoEncodings(num, text)
		return "", nil
	}
	switch d {
	case 2:
		name = text[1].S
		text = text[2:]
	case 1:
		// isSection already verified that childRE matches text[0].S.
		m := childRE.FindStringSubmatch(text[0].S)
		name = m[1]
		text = text[1:]
	}
	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
		name += " " + text[0].S
		text = text[1:]
	}

	// Skip description.
	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
		text = text[1:]
	}

	// Encodings follow.
	warned := false
	for i := 0; i < len(text); {
		// Any of these headings ends the encoding list for this page.
		if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
			match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
			match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
			match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
			match(text[i], "Times-Roman", 9, "Figure A") ||
			match(text[i], "Helvetica-Bold", 9, "Table A") ||
			match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
			match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
			match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
			match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
			checkNoEncodings(num, text[i:])
			break
		}
		// A bold figure caption: skip it and everything on the same line
		// (Y within 2 units of the caption).
		if match(text[i], "Helvetica-Bold", 9, "Figure A") {
			y := text[i].Y
			i++
			for i < len(text) && math.Abs(text[i].Y-y) < 2 {
				i++
			}
			continue
		}
		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
			// Report only the first unexpected fragment per page.
			if !warned {
				warned = true
				fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
			}
			i++
			continue
		}
		inst := Inst{
			Name: name,
		}
		enc := text[i].S
		x := text[i].X
		i++
		// Possible subarchitecture notes.
		for i < len(text) && text[i].X > x+36 {
			if inst.Arch != "" {
				inst.Arch += " "
			}
			inst.Arch += text[i].S
			i++
		}
		// Encoding syntaxes. A fragment left of x+0.25*inch starts a new
		// syntax line; anything further right continues the previous one,
		// separated by a tab for the first continuation and spaces after.
		for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
			// With no syntax line yet there is nothing to continue, so an
			// indented first fragment starts a line instead of indexing
			// inst.Syntax[-1].
			if text[i].X < x+0.25*inch || len(inst.Syntax) == 0 {
				inst.Syntax = append(inst.Syntax, text[i].S)
			} else {
				s := inst.Syntax[len(inst.Syntax)-1]
				if !strings.Contains(s, "\t") {
					s += "\t"
				} else {
					s += " "
				}
				s += text[i].S
				inst.Syntax[len(inst.Syntax)-1] = s
			}
			i++
		}

		// An "Encoding X / Y" heading describes two encodings. The second
		// has its own bit box when an 8-point Times-Roman fragment follows;
		// otherwise it shares the first one's bits.
		var bits, abits, aenc string
		bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
		if strings.Contains(enc, " / ") {
			if i < len(text) && match(text[i], "Times-Roman", 8, "") {
				abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
			} else {
				abits = bits
			}
			slash := strings.Index(enc, " / ")
			aenc = "Encoding " + enc[slash+len(" / "):]
			enc = enc[:slash]
		}

		// pseudocode
		y0 := -1 * inch // Y of the previous pseudocode line; starts off-page
		tab := 0.0      // width of one indent level, learned from the first indented line
		for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
			t := text[i]
			i++
			if math.Abs(t.Y-y0) < 3 {
				// same line as last fragment, probably just two spaces
				inst.Code += " " + t.S
				continue
			}
			if inst.Code != "" {
				inst.Code += "\n"
			}
			if t.X > x+0.1*inch {
				if tab == 0 {
					tab = t.X - x
				}
				// Round the indent to the nearest whole number of tabs.
				inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
			} else {
				tab = 0
			}
			inst.Code += t.S
			y0 = t.Y
		}

		inst.ID = strings.TrimPrefix(enc, "Encoding ")
		inst.Bits = bits
		table = append(table, inst)
		// The alternate encoding reuses every field except ID and Bits.
		if abits != "" {
			inst.ID = strings.TrimPrefix(aenc, "Encoding ")
			inst.Bits = abits
			table = append(table, inst)
		}

	}
	return name, table
}