func()

in backend/spdxclassifier.go [63:165]


// ScanData searches data, line by line, for SPDX short-form license
// identifiers (lines containing magicStringSPDX) and returns one license entry
// per unique identifier found, sorted by identifier. Directories and empty
// inputs are skipped by returning (nil, nil). If no identifier is found, the
// result is nil and the error is whatever the line scanner reported (wrapped).
// If identifiers were found but the line scanner also reported an error, the
// partial result is returned together with the wrapped error. If ctx is
// cancelled while scanning, a nil result and a wrapped ctx.Err() are returned.
// NOTE(review): this relies on errwrap.Wrapf returning nil when given a nil
// error; that is not visible from this function -- confirm.
func (obj *SPDXClassifier) ScanData(ctx context.Context, data []byte, info *interfaces.Info) (*interfaces.Result, error) {
	if info.FileInfo.IsDir() {
		return nil, nil // skip
	}
	if len(data) == 0 {
		return nil, nil // skip
	}

	licenseMap := make(map[string]struct{}) // set of unique ID's seen

	// An official parser for SPDX ID's seems to be:
	// https://github.com/spdx/tools-golang/blob/a16d50ee155238df280a68252acc25e9afb7acea/idsearcher/idsearcher.go#L269
	// If it's meant to be that simplistic, we'll implement something
	// similar. Please report bugs over there before you report them here =D

	scanner := bufio.NewScanner(bytes.NewReader(data))
	// By default a bufio.Scanner gives up with bufio.ErrTooLong as soon as
	// it meets a line longer than bufio.MaxScanTokenSize (64KiB), which
	// happens with minified or generated files, and would make us miss
	// every identifier after that line. The whole input is already in
	// memory, so let the scanner buffer grow enough to hold any line of it.
	// (The limit must be strictly greater than the longest possible line.)
	scanner.Buffer(nil, len(data)+1)
	for scanner.Scan() {
		// In an effort to short-circuit things if needed, we run a
		// check ourselves and break out early if we see that we have
		// cancelled early.
		select {
		case <-ctx.Done():
			return nil, errwrap.Wrapf(ctx.Err(), "scanner ended early")
		default:
		}

		s := scanner.Text()                           // newlines will be stripped here
		strs := strings.SplitN(s, magicStringSPDX, 2) // max split of 2
		if len(strs) == 1 {                           // no split happened, string not found
			continue
		}

		// weird way to parse, but whatever:
		// "if prefixed by more than n characters, it's probably not a
		// short-form ID; it's probably code to detect short-form IDs."
		if len(stripTrash(strs[0])) > magicNumberSPDX { // arbitrary wat
			continue
		}

		// spdx says: "stop before trailing */ if it is present"
		lid := strings.Split(strs[1], "*/")[0] // lid is licenseID
		lid = strings.TrimSpace(lid)
		lid = stripTrash(lid)

		licenseMap[lid] = struct{}{}
	}

	if len(licenseMap) == 0 {
		// NOTE: this is NOT the same as interfaces.ErrUnknownLicense
		// because in this scenario, we're comfortable (ish) the parser
		// is exhaustive at finding a license with this methodology.
		return nil, errwrap.Wrapf(scanner.Err(), "spdx scanner error")
	}

	ids := make([]string, 0, len(licenseMap))
	for id := range licenseMap {
		ids = append(ids, id)
	}
	sort.Strings(ids) // deterministic order

	licenseList := make([]*licenses.License, 0, len(ids))

	for _, id := range ids {
		license := &licenses.License{
			SPDX: id,
			// TODO: populate other fields here?
		}

		// If we find an unknown SPDX ID, we don't want to error,
		// because that would allow someone to put junk in their code to
		// prevent us scanning it. Instead, create an invalid license
		// but return it anyways. If we ever want to check validity, we
		// know to expect failures. It *must* be valid because it's an
		// explicit SPDX scanner.
		if err := license.Validate(); err != nil {
			//return nil, err
			license = &licenses.License{
				//SPDX: "",
				Origin: "", // unknown!
				Custom: id,
				// TODO: populate other fields here (eg: found license text)
			}
		}

		licenseList = append(licenseList, license)
	}

	result := &interfaces.Result{
		Licenses:   licenseList,
		Confidence: 1.0, // TODO: what should we put here?
	}

	// We perform the strange task of processing any partial results, and
	// returning some even if we errored, because the spdx code seems to
	// think this is better than no results. I'll do the same, but there is
	// no guarantee the calling iterator will use these. (Currently it does
	// not!)
	return result, errwrap.Wrapf(scanner.Err(), "spdx scanner error")
}