in backend/spdxclassifier.go [63:165]
// ScanData searches data, line by line, for SPDX short-form license
// identifier tags (magicStringSPDX) and returns one license entry for each
// distinct identifier that it finds, sorted by identifier. Directories and
// empty inputs are skipped by returning a nil result and a nil error. A tag
// that is preceded on its line by more than magicNumberSPDX characters (after
// stripTrash) is ignored, as is a tag with no identifier after it. An
// identifier that fails license validation is still returned, but as a custom
// license instead of an SPDX one. If the context is cancelled while scanning,
// a nil result and a wrapped context error are returned. Otherwise, any
// scanner error is wrapped and returned alongside whatever was found.
func (obj *SPDXClassifier) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) {
	if info.FileInfo.IsDir() {
		return nil, nil // skip
	}
	if len(data) == 0 {
		return nil, nil // skip
	}
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	licenseMap := make(map[string]struct{}) // set of unique license ID's

	// An official parser for SPDX ID's seems be:
	// https://github.com/spdx/tools-golang/blob/a16d50ee155238df280a68252acc25e9afb7acea/idsearcher/idsearcher.go#L269
	// If it's meant to be that simplistic, we'll implement something
	// similar. Please report bugs over there before you report them here =D
	scanner := bufio.NewScanner(bytes.NewReader(data))
	// By default the scanner gives up with bufio.ErrTooLong as soon as it
	// sees a line longer than bufio.MaxScanTokenSize, which would stop us
	// from looking at the rest of the input. Everything is already in
	// memory, so let a single line be as long as the entire input.
	scanner.Buffer(nil, len(data)+1)
	for scanner.Scan() {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early")
		default:
		}

		s := scanner.Text()                           // newlines will be stripped here
		strs := strings.SplitN(s, magicStringSPDX, 2) // max split of 2
		if len(strs) == 1 {                           // no split happened, string not found
			continue
		}

		// weird way to parse, but whatever:
		// "if prefixed by more than n characters, it's probably not a
		// short-form ID; it's probably code to detect short-form IDs."
		if len(stripTrash(strs[0])) > magicNumberSPDX { // arbitrary wat
			continue
		}

		// spdx says: "stop before trailing */ if it is present"
		lid := strings.Split(strs[1], "*/")[0] // lid is licenseID
		lid = strings.TrimSpace(lid)
		lid = stripTrash(lid)
		if lid == "" {
			// The tag was there, but nothing usable came after it.
			// An empty ID is not a license, so don't report one.
			continue
		}
		licenseMap[lid] = struct{}{}
	}
	if len(licenseMap) == 0 {
		// NOTE: this is NOT the same as interfaces.ErrUnknownLicense
		// because in this scenario, we're comfortable (ish) the parser
		// is exhaustive at finding a license with this methodology.
		return nil, errwrap.Wrapf(scanner.Err(), "spdx scanner error")
	}

	ids := make([]string, 0, len(licenseMap))
	for id := range licenseMap {
		ids = append(ids, id)
	}
	sort.Strings(ids) // deterministic order

	licenseList := make([]*licenses.License, 0, len(ids))
	for _, id := range ids {
		license := &licenses.License{
			SPDX: id,
			// TODO: populate other fields here?
		}

		// If we find an unknown SPDX ID, we don't want to error,
		// because that would allow someone to put junk in their code to
		// prevent us scanning it. Instead, create an invalid license
		// but return it anyways. If we ever want to check validity, we
		// know to expect failures. It *must* be valid because it's an
		// explicit SPDX scanner.
		if err := license.Validate(); err != nil {
			//return nil, err
			license = &licenses.License{
				//SPDX: "",
				Origin: "", // unknown!
				Custom: id,
				// TODO: populate other fields here (eg: found license text)
			}
		}

		licenseList = append(licenseList, license)
	}

	result := &interfaces.Result{
		Licenses:   licenseList,
		Confidence: 1.0, // TODO: what should we put here?
	}

	// We perform the strange task of processing any partial results, and
	// returning some even if we errored, because the spdx code seems to
	// think this is better than no results. I'll do the same, but there is
	// no guarantee the calling iterator will use these. (Currently it does
	// not!)
	return result, errwrap.Wrapf(scanner.Err(), "spdx scanner error")
}