func()

in src/cmd/nel/main.go [208:309]


// Match links the NER terms found in the file named by the "input_file"
// parameter to vocabulary concepts and writes the result, as tab-separated
// rows preceded by a header line, to the file named by the "output_file"
// parameter.
//
// Each non-empty, non-comment input line must contain at least four
// tab-separated fields: NCT ID, eligibility type, criterion and the NER term
// string. Slots are extracted from the term string with getNERSlots using the
// "ner_threshold" and "valid_labels" parameters. Every sub-term of a slot is
// matched against the vocabulary using "match_margin"; a sub-term whose best
// score reaches "match_threshold" produces one output row with its concepts,
// tree numbers and score. A slot with no such sub-term produces a single row
// without concept columns.
//
// Match returns an error if the input file cannot be opened or read, if an
// input line has too few fields, or if writing to the output fails.
func (m *Matcher) Match() error {
	// Number of leading tab-separated fields consumed from every input line.
	const minFields = 4

	nerThreshold := m.parameters.GetFloat64("ner_threshold")
	validLabels := set.New(m.parameters.GetSlice("valid_labels", ",")...)

	matchThreshold := m.parameters.GetFloat64("match_threshold")
	matchMargin := m.parameters.GetFloat64("match_margin")

	// matchedSlots caches vocabulary lookups so each distinct sub-term is
	// matched only once.
	// NOTE(review): the cache key is the sub-term alone, while the category
	// filter depends on the slot label; a sub-term first seen under one label
	// is reused for other labels — confirm this is intended.
	matchedSlots := make(map[string]taxonomy.Terms)
	conceptSet := set.New()
	slotCnt := 0
	matchedSlotCnt := 0

	defaultCategories := set.New()
	cancerCategories := set.New("C")
	personCategories := set.New("M")

	fname := m.parameters.Get("input_file")
	file, err := os.Open(fname)
	if err != nil {
		return err
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)
	lineCnt := 0

	outputFname := m.parameters.Get("output_file")
	writer := fio.Writer(outputFname)
	defer writer.Close()

	header := "#nct_id\teligibility_type\tcriterion\tlabel\tterm\tner_score\tconcepts\ttree_numbers\tnel_score\n"
	if _, err := fmt.Fprint(writer, header); err != nil {
		return err
	}

	glog.Infof("Matching NER terms ...")

	for scanner.Scan() {
		lineCnt++
		line := scanner.Text()
		if len(line) == 0 || line[0] == param.Comment {
			continue
		}

		// Extract NER terms. Reject short lines explicitly instead of
		// panicking on an out-of-range index below.
		values := strings.Split(line, "\t")
		if len(values) < minFields {
			return fmt.Errorf("%s:%d: expected at least %d tab-separated fields, got %d", fname, lineCnt, minFields, len(values))
		}
		nctID := values[0]
		eligibilityType := values[1]
		criterion := values[2]
		termStr := values[3]
		slots := getNERSlots(termStr, nerThreshold, validLabels)
		slotCnt += slots.Size()

		// Match NER terms to concepts
		for _, slot := range slots {
			subterms := slot.SubTerms()
			for _, subterm := range subterms {
				if _, ok := matchedSlots[subterm]; ok {
					continue
				}
				// The slot label selects which vocabulary categories
				// are passed to the matcher.
				var validCategories set.Set
				switch slot.label {
				case "word_scores:cancer":
					validCategories = cancerCategories
				case "word_scores:gender":
					validCategories = personCategories
				default:
					validCategories = defaultCategories
				}
				matchedSlots[subterm] = m.vocabulary.Match(subterm, matchMargin, validCategories)
			}

			// NOTE(review): Normalize runs after the sub-terms were taken,
			// so it presumably affects only slot.String() in the output
			// rows below — confirm.
			slot.Normalize(m.normalize)
			hasMatch := false

			for _, subterm := range subterms {
				matchedConcepts := matchedSlots[subterm]
				nelScore := matchedConcepts.MaxValue()
				if nelScore < matchThreshold {
					continue
				}
				hasMatch = true
				keys := matchedConcepts.Keys()
				conceptSet.Add(keys...)
				concepts := strings.Join(keys, "|")
				treeNumbers := strings.Join(matchedConcepts.TreeNumbers(), "|")
				if _, err := fmt.Fprintf(writer, "%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n", nctID, eligibilityType, criterion, slot.String(), concepts, treeNumbers, nelScore); err != nil {
					return err
				}
			}

			if hasMatch {
				matchedSlotCnt++
				continue
			}
			// No sub-term reached the threshold: emit the slot on its own.
			if _, err := fmt.Fprintf(writer, "%s\t%s\t%s\t%s\n", nctID, eligibilityType, criterion, slot.String()); err != nil {
				return err
			}
		}
	}
	// Scan returns false on both EOF and failure (including a line longer
	// than the scanner's buffer); only Err distinguishes them.
	if err := scanner.Err(); err != nil {
		return err
	}

	glog.Infof("Lines read: %d, Slots: %d, Unique slots: %d\n", lineCnt, slotCnt, len(matchedSlots))
	glog.Infof("%d slots matched to %d concepts\n", matchedSlotCnt, conceptSet.Size())
	glog.Infof("%d slots not matched\n", slotCnt-matchedSlotCnt)

	return nil
}