tools/diff-processor/documentparser/document_parser.go (155 lines of code) (raw):

package documentparser import ( "fmt" "regexp" "sort" "strings" ) var ( fieldNameRegex = regexp.MustCompile("[\\*|-]\\s+`([a-z0-9_\\./]+)`") // * `xxx` nestedObjectRegex = regexp.MustCompile(`<a\s+name="([a-z0-9_]+)">`) // <a name="xxx"> nestedHashTagRegex = regexp.MustCompile(`\(#(nested_[a-z0-9_]+)\)`) // #(nested_xxx) horizontalLineRegex = regexp.MustCompile("- - -|-{3,}") // - - - or --- sectionSeparator = "## " ) // DocumentParser parse *.html.markdown resource doc files. type DocumentParser struct { root *node nestedBlock map[string]string } type node struct { name string children []*node text string } func NewParser() *DocumentParser { return &DocumentParser{ nestedBlock: make(map[string]string), } } func (d *DocumentParser) FlattenFields() []string { var paths []string traverse( &paths, "", d.root, ) sort.Strings(paths) return paths } func traverse(paths *[]string, path string, n *node) { if n == nil { return } var curPath string if path != "" { curPath = path + "." + n.name } else { curPath = n.name } if curPath != "" { *paths = append(*paths, curPath) } for _, c := range n.children { traverse(paths, curPath, c) } } // Parse parse a resource document markdown's arguments and attributes section. // The parsed file format is defined in mmv1/templates/terraform/resource.html.markdown.tmpl. func (d *DocumentParser) Parse(src []byte) error { var argument, attribute string for _, p := range strings.Split(string(src), "\n"+sectionSeparator) { if strings.HasPrefix(p, "Attributes Reference") { attribute = p } if strings.HasPrefix(p, "Argument Reference") { argument = p } } for _, text := range []string{argument, attribute} { if len(text) != 0 { sections := horizontalLineRegex.Split(text, -1) var allTopLevelFieldSections string for _, part := range sections { topLevelPropertySection, err := d.extractNestedObject(part) if err != nil { return err } allTopLevelFieldSections += topLevelPropertySection } root := &node{ text: allTopLevelFieldSections, } if err := d.bfs(root, d.nestedBlock); err != nil { return err } if d.root == nil { d.root = root } else { d.root.children = append(d.root.children, root.children...) } } } return nil } func (d *DocumentParser) extractNestedObject(input string) (string, error) { parts := splitWithRegexp(input, nestedObjectRegex) for _, p := range parts[1:] { nestedName := findPattern(p, nestedObjectRegex) if nestedName == "" { return "", fmt.Errorf("could not find nested object name in %s", p) } d.nestedBlock[nestedName] = p } return parts[0], nil } func (d *DocumentParser) bfs(root *node, nestedBlock map[string]string) error { if root == nil { return fmt.Errorf("no node to visit") } queue := []*node{root} for len(queue) > 0 { l := len(queue) for _, cur := range queue { // the separator should always at the beginning of the line parts := splitWithRegexp(cur.text, fieldNameRegex) for _, p := range parts[1:] { p = strings.ReplaceAll(p, "\n", "") fieldName := findPattern(p, fieldNameRegex) if fieldName == "" { return fmt.Errorf("could not find field name in %s", p) } // There is a special case in some hand written resource eg. in compute_instance, where its attributes is in a.0.b.0.c format. fieldName = strings.ReplaceAll(fieldName, ".0.", ".") newNode := &node{ name: fieldName, } cur.children = append(cur.children, newNode) nestedHashTag := findPattern(p, nestedHashTagRegex) if text, ok := nestedBlock[nestedHashTag]; ok { newNode.text = text queue = append(queue, newNode) } } } queue = queue[l:] } return nil } func findPattern(text string, re *regexp.Regexp) string { match := re.FindStringSubmatch(text) if match != nil { return match[1] } return "" } func splitWithRegexp(text string, re *regexp.Regexp) []string { matches := re.FindAllStringIndex(text, -1) if len(matches) == 0 { return []string{text} } var parts []string start := 0 for _, match := range matches { end := match[0] parts = append(parts, text[start:end]) start = end } parts = append(parts, text[start:]) return parts }