in arm/armspec/spec.go [175:403]
// parsePage extracts the instruction encodings described on a single page.
//
// num is the page number; it is used only in diagnostics and is forwarded to
// checkNoEncodings. p is the page to parse.
//
// The returned name is the instruction headline found on the page (wrapped
// headline lines are joined with single spaces). The returned table holds one
// Inst per "Encoding" heading, or two when the heading contains " / ", in
// which case the second entry carries the alternate encoding ID and bits.
//
// parsePage returns "", nil when the first text item left after filtering is
// not an "Instruction Details" page header, or when no instruction headline
// follows that header.
func parsePage(num int, p pdf.Page) (name string, table []Inst) {
	content := p.Content()

	// Normalize font sizes and drop text in fonts we never want to see.
	var text []pdf.Text
	for _, t := range content.Text {
		if match(t, "Times-Roman", 7.2, "") {
			t.FontSize = 9
		}
		// A small Times-Roman digit is rewritten as the corresponding Unicode
		// superscript digit at the normal body size, with Y lowered by 2.28
		// (presumably to put it back on the surrounding baseline).
		// The length check keeps an empty string from panicking on t.S[0].
		if match(t, "Times-Roman", 6.72, "") && len(t.S) > 0 && '0' <= t.S[0] && t.S[0] <= '9' {
			t.S = string([]rune("⁰¹²³⁴⁵⁶⁷⁸⁹")[t.S[0]-'0'])
			t.FontSize = 9
			t.Y -= 2.28
		}
		if t.Font == "Gen_Arial" {
			continue
		}
		text = append(text, t)
	}

	text = findWords(text)

	// Treat the "Times" font as "Times-Roman" so later matches need only one name.
	for i, t := range text {
		if t.Font == "Times" {
			t.Font = "Times-Roman"
			text[i] = t
		}
	}

	if debugPage > 0 {
		for _, t := range text {
			fmt.Println(t)
		}
		for _, r := range content.Rect {
			fmt.Println(r)
		}
	}

	// Remove text we should ignore. The filter reuses text's backing array.
	out := text[:0]
	skip := false
	for _, t := range text {
		// skip page footer
		if match(t, "Helvetica", 8, "A") || match(t, "Helvetica", 8, "ARM DDI") || match(t, "Helvetica-Oblique", 8, "Copyright") {
			continue
		}
		// skip section header and body text
		if match(t, "Helvetica-Bold", 12, "") && (sectionRE.MatchString(t.S) || t.S == "Alphabetical list of instructions") {
			skip = true
			continue
		}
		// Body text directly after a skipped section header is dropped too;
		// the first item that is not 9pt Times-Roman ends the skipped run.
		if skip && match(t, "Times-Roman", 9, "") {
			continue
		}
		skip = false
		out = append(out, t)
	}
	text = out

	// Page header must say Instruction Details.
	if len(text) == 0 || !match(text[0], "Helvetica-Oblique", 8, "Instruction Details") && !match(text[0], "Times-Roman", 9, "Instruction Details") {
		return "", nil
	}
	text = text[1:]

	// isSection reports how many items at text[i:] form a section headline:
	// 2 for a section-number item followed by a second 10pt bold item,
	// 1 for a single item matching childRE, and 0 when neither applies
	// (including when i is past the end of text).
	isSection := func(text []pdf.Text, i int) int {
		if i+2 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && sectionRE.MatchString(text[i].S) && match(text[i+1], "Helvetica-Bold", 10, "") {
			return 2
		}
		if i+1 <= len(text) && match(text[i], "Helvetica-Bold", 10, "") && childRE.MatchString(text[i].S) {
			return 1
		}
		return 0
	}

	// Skip dummy headlines and sections: while a headline is followed by
	// another headline before any "Encoding" heading, drop the earlier one.
	for d := isSection(text, 0); d != 0; d = isSection(text, 0) {
		i := d
		for i < len(text) && !match(text[i], "Helvetica-Bold", 9, "Encoding") && !match(text[i], "Helvetica-Bold", 10, "") {
			i++
		}
		if isSection(text, i) == 0 {
			break
		}
		text = text[i:]
	}

	// Next line is headline. Can wrap to multiple lines.
	d := isSection(text, 0)
	if d == 0 {
		// text may be empty here (a page holding only the header), so guard
		// the debug print against indexing an empty slice.
		if debugPage > 0 && len(text) > 0 {
			fmt.Printf("non-inst-headline: %v\n", text[0])
		}
		checkNoEncodings(num, text)
		return "", nil
	}
	if d == 2 {
		name = text[1].S
		text = text[2:]
	} else if d == 1 {
		m := childRE.FindStringSubmatch(text[0].S)
		name = m[1]
		text = text[1:]
	}
	for len(text) > 0 && match(text[0], "Helvetica-Bold", 10, "") {
		name += " " + text[0].S
		text = text[1:]
	}

	// Skip description.
	for len(text) > 0 && (match(text[0], "Times-Roman", 9, "") || match(text[0], "LucidaSansTypewriteX", 6.48, "") || match(text[0], "Times-Bold", 10, "Note")) {
		text = text[1:]
	}

	// Encodings follow.
	warned := false
	for i := 0; i < len(text); {
		// Any of these headings ends the encoding list for this page; the
		// remainder is handed to checkNoEncodings.
		if match(text[i], "Helvetica-Bold", 10, "Assembler syntax") ||
			match(text[i], "Helvetica-Bold", 9, "Modified operation in ThumbEE") ||
			match(text[i], "Helvetica-Bold", 9, "Unallocated memory hints") ||
			match(text[i], "Helvetica-Bold", 9, "Related encodings") ||
			match(text[i], "Times-Roman", 9, "Figure A") ||
			match(text[i], "Helvetica-Bold", 9, "Table A") ||
			match(text[i], "Helvetica-Bold", 9, "VFP Instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP instructions") ||
			match(text[i], "Helvetica-Bold", 9, "VFP vectors") ||
			match(text[i], "Helvetica-Bold", 9, "FLDMX") ||
			match(text[i], "Helvetica-Bold", 9, "FSTMX") ||
			match(text[i], "Helvetica-Bold", 9, "Advanced SIMD and VFP") {
			checkNoEncodings(num, text[i:])
			break
		}

		// A bold figure caption: skip it and everything within 2 units of
		// the same Y coordinate.
		if match(text[i], "Helvetica-Bold", 9, "Figure A") {
			y := text[i].Y
			i++
			for i < len(text) && math.Abs(text[i].Y-y) < 2 {
				i++
			}
			continue
		}

		// Anything else that is not an "Encoding" heading is unexpected;
		// report only the first such item per page and keep scanning.
		if !match(text[i], "Helvetica-Bold", 9, "Encoding") {
			if !warned {
				warned = true
				fmt.Fprintln(os.Stderr, "page", num, ": unexpected:", text[i])
			}
			i++
			continue
		}

		inst := Inst{
			Name: name,
		}
		enc := text[i].S
		x := text[i].X // left edge of the heading; later offsets are relative to it
		i++

		// Possible subarchitecture notes.
		for i < len(text) && text[i].X > x+36 {
			if inst.Arch != "" {
				inst.Arch += " "
			}
			inst.Arch += text[i].S
			i++
		}

		// Encoding syntaxes. An item near the left edge starts a new syntax
		// line; an item further right continues the previous line, separated
		// by a tab the first time and by a space after that.
		for i < len(text) && (match(text[i], "LucidaSansTypewriteX", 6.48, "") || text[i].X > x+36) {
			// With no syntax line yet there is nothing to continue, so an
			// indented first fragment starts a line instead of indexing an
			// empty slice.
			if text[i].X < x+0.25*inch || len(inst.Syntax) == 0 {
				inst.Syntax = append(inst.Syntax, text[i].S)
			} else {
				s := inst.Syntax[len(inst.Syntax)-1]
				if !strings.Contains(s, "\t") {
					s += "\t"
				} else {
					s += " "
				}
				s += text[i].S
				inst.Syntax[len(inst.Syntax)-1] = s
			}
			i++
		}

		// Bit layout. A heading of the form "Encoding A / B" describes two
		// encodings: the second either has its own bit box (when 8pt
		// Times-Roman text follows the first box) or shares the first one's.
		var bits, abits, aenc string
		bits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
		if strings.Contains(enc, " / ") {
			if i < len(text) && match(text[i], "Times-Roman", 8, "") {
				abits, i = readBitBox(inst.Name, inst.Syntax, content, text, i)
			} else {
				abits = bits
			}
			slash := strings.Index(enc, " / ")
			aenc = "Encoding " + enc[slash+len(" / "):]
			enc = enc[:slash]
		}

		// pseudocode
		y0 := -1 * inch // Y of the previous pseudocode line; starts at -1*inch, before any line has been read
		tab := 0.0      // width of one indent level, learned from the first indented line
		for i < len(text) && match(text[i], "LucidaSansTypewriteX", 6.48, "") {
			t := text[i]
			i++
			if math.Abs(t.Y-y0) < 3 {
				// same line as last fragment, probably just two spaces
				inst.Code += " " + t.S
				continue
			}
			if inst.Code != "" {
				inst.Code += "\n"
			}
			if t.X > x+0.1*inch {
				if tab == 0 {
					tab = t.X - x
				}
				// Round the indentation to the nearest whole number of tabs.
				inst.Code += strings.Repeat("\t", int((t.X-x)/tab+0.5))
			} else {
				// Back at the left margin: relearn the tab width next time.
				tab = 0
			}
			inst.Code += t.S
			y0 = t.Y
		}

		inst.ID = strings.TrimPrefix(enc, "Encoding ")
		inst.Bits = bits
		table = append(table, inst)
		if abits != "" {
			// The alternate encoding reuses every field except ID and Bits.
			inst.ID = strings.TrimPrefix(aenc, "Encoding ")
			inst.Bits = abits
			table = append(table, inst)
		}
	}
	return name, table
}