in src/cmd/nel/main.go [208:309]
// Match reads the tab-separated file named by the "input_file" parameter,
// extracts NER slots from the fourth column of every data line, links each
// slot's subterms to vocabulary concepts and writes the results as
// tab-separated rows to the file named by the "output_file" parameter.
//
// Empty lines and lines starting with param.Comment are skipped. For each slot
// one row is written per subterm whose best match score reaches
// "match_threshold"; a slot with no such subterm is written once without
// concept columns. Summary counts are logged at the end.
//
// It returns an error if the input cannot be opened or read, if a data line
// has fewer than four tab-separated fields, or if writing the output fails.
func (m *Matcher) Match() error {
	nerThreshold := m.parameters.GetFloat64("ner_threshold")
	validLabels := set.New(m.parameters.GetSlice("valid_labels", ",")...)
	matchThreshold := m.parameters.GetFloat64("match_threshold")
	matchMargin := m.parameters.GetFloat64("match_margin")

	// Cache of vocabulary matches keyed by subterm, so that each distinct
	// subterm is looked up only once.
	// NOTE(review): the key ignores slot.label although the category filter
	// passed to vocabulary.Match depends on it; a subterm first seen under one
	// label reuses that result for other labels - confirm this is intended.
	matchedSlots := make(map[string]taxonomy.Terms)
	conceptSet := set.New()
	slotCnt := 0
	matchedSlotCnt := 0

	defaultCategories := set.New()
	cancerCategories := set.New("C")
	personCategories := set.New("M")

	fname := m.parameters.Get("input_file")
	file, err := os.Open(fname)
	if err != nil {
		return err
	}
	defer file.Close()
	scanner := bufio.NewScanner(file)
	lineCnt := 0

	outputFname := m.parameters.Get("output_file")
	writer := fio.Writer(outputFname)
	defer writer.Close()

	// NOTE(review): the header names nine columns while the rows below pass
	// seven (matched) or four (unmatched) values; slot.String() presumably
	// expands to label, term and ner_score - confirm.
	header := "#nct_id\teligibility_type\tcriterion\tlabel\tterm\tner_score\tconcepts\ttree_numbers\tnel_score\n"
	if _, err := fmt.Fprint(writer, header); err != nil {
		return err
	}

	glog.Infof("Matching NER terms ...")
	for scanner.Scan() {
		lineCnt++
		line := scanner.Text()
		if len(line) == 0 || line[0] == param.Comment {
			continue
		}

		// Extract NER terms
		values := strings.Split(line, "\t")
		if len(values) < 4 {
			// Fail with the location instead of panicking on the index below.
			return fmt.Errorf("%s:%d: expected at least 4 tab-separated fields, got %d", fname, lineCnt, len(values))
		}
		nctID := values[0]
		eligibilityType := values[1]
		criterion := values[2]
		termStr := values[3]
		slots := getNERSlots(termStr, nerThreshold, validLabels)
		slotCnt += slots.Size()

		// Match NER terms to concepts
		for _, slot := range slots {
			subterms := slot.SubTerms()
			for _, subterm := range subterms {
				if _, ok := matchedSlots[subterm]; ok {
					continue
				}
				// Restrict candidate concepts by category for the labels
				// that have a dedicated category set.
				var validCategories set.Set
				switch slot.label {
				case "word_scores:cancer":
					validCategories = cancerCategories
				case "word_scores:gender":
					validCategories = personCategories
				default:
					validCategories = defaultCategories
				}
				matchedSlots[subterm] = m.vocabulary.Match(subterm, matchMargin, validCategories)
			}

			// Normalization happens after the subterms were captured, so the
			// lookups above and below use the subterms as originally extracted.
			slot.Normalize(m.normalize)

			hasMatch := false
			for _, subterm := range subterms {
				matchedConcepts := matchedSlots[subterm]
				nelScore := matchedConcepts.MaxValue()
				if nelScore >= matchThreshold {
					hasMatch = true
					keys := matchedConcepts.Keys()
					conceptSet.Add(keys...)
					concepts := strings.Join(keys, "|")
					treeNumbers := strings.Join(matchedConcepts.TreeNumbers(), "|")
					if _, err := fmt.Fprintf(writer, "%s\t%s\t%s\t%s\t%s\t%s\t%.3f\n", nctID, eligibilityType, criterion, slot.String(), concepts, treeNumbers, nelScore); err != nil {
						return err
					}
				}
			}

			if hasMatch {
				matchedSlotCnt++
			} else {
				// Keep unmatched slots in the output, without concept columns.
				if _, err := fmt.Fprintf(writer, "%s\t%s\t%s\t%s\n", nctID, eligibilityType, criterion, slot.String()); err != nil {
					return err
				}
			}
		}
	}
	// Scan returns false on read errors and over-long lines as well as on EOF;
	// without this check such failures would look like a successful run.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("reading %s: %w", fname, err)
	}

	glog.Infof("Lines read: %d, Slots: %d, Unique slots: %d\n", lineCnt, slotCnt, len(matchedSlots))
	glog.Infof("%d slots matched to %d concepts\n", matchedSlotCnt, conceptSet.Size())
	glog.Infof("%d slots not matched\n", slotCnt-matchedSlotCnt)
	return nil
}