def parse_rules()

in aristotle/aristotle.py [0:0]
89 lines of code
40 McCabe index (conditional complexity)

    def parse_rules(self, rules, filename=None):
        """Parses the given rules and builds/updates necessary data structures.

        Every non-blank line of ``rules`` is treated as one rule.  A line starting
        with ``#`` is handled as a disabled rule when it matches ``disabled_rule_re``
        and is otherwise skipped as a comment.  For each rule the sid, the first
        ``classtype`` keyword, the ``metadata`` keyword value and the ``msg`` are
        extracted; an entry is written to ``self.metadata_dict[sid]`` and each
        metadata key-value pair is registered through ``self.add_metadata()``.

        Duplicate sids: the first enabled rule encountered is kept; if every rule
        with that sid is disabled, the first one encountered is kept.

        Any exception raised while parsing is printed with its traceback and then
        reported through ``print_error(..., fatal=True)``.

        :param rules: rules (one per line) to parse and build/update the necessary data structures
        :type rules: string, required
        :param filename: if the passed in rules came from a file, the filename of that file
        :type filename: string, optional
        """
        try:
            # line numbers are 1-based so that messages match what an editor shows
            for lineno, line in enumerate(rules.splitlines(), start=1):
                stripped = line.strip()

                # ignore comments and blank lines
                if not stripped:
                    continue
                is_disabled_rule = False
                if stripped.startswith('#'):
                    if disabled_rule_re.match(stripped):
                        is_disabled_rule = True
                        # drop the leading '#' so the rest parses like an enabled rule
                        line = stripped[1:].strip()
                    else:
                        # valid comment (not disabled rule)
                        print_debug("Skipping comment: {}".format(line))
                        continue

                # extract sid
                matchobj = sid_re.search(line)
                if not matchobj:
                    # NOTE(review): presumably print_error(fatal=True) does not return - confirm
                    print_error("Invalid rule on line {}:\n{}".format(lineno, line), fatal=True)
                sid = int(matchobj.group("SID"))

                # extract classtype. This only grabs the first one; some engines support multiple
                # 'classtype' keywords in rules but in practice it is rarely, if ever, done.
                classtype = None
                matchobj = classtype_keyword_re.search(line)
                if matchobj:
                    classtype = matchobj.group("CLASSTYPE")
                else:
                    print_debug("No 'classtype' keyword found in sid {}".format(sid))

                # extract metadata keyword value
                metadata_str = ""
                matchobj = metadata_keyword_re.search(line)
                if matchobj:
                    metadata_str = matchobj.group("METADATA")
                else:
                    print_warning("No 'metadata' keyword found in sid {}".format(sid))
                # sample the debug output: first line and every 1000th line after it
                if (lineno - 1) % 1000 == 0:
                    print_debug("metadata_str for sid {}:\n{}".format(sid, metadata_str))

                # extract 'msg' field
                matchobj = rule_msg_re.search(line)
                if matchobj:
                    msg = matchobj.group("MSG")
                else:
                    print_warning("Unable to extract rule msg from SID '{}'.".format(sid))
                    msg = ""

                # build dict
                if sid in self.metadata_dict:
                    # include the first encountered enabled rule; if they are all disabled, include the first encountered.
                    print_warning("Duplicate sid '{}' found{}".format(sid, "!" if not filename else " in file '{}'!".format(filename)))
                    if is_disabled_rule:
                        print_warning("Ignoring disabled rule with duplicate sid: {}".format(line))
                        continue
                    if self.metadata_dict[sid]['disabled']:
                        # the stored rule is disabled and this one is enabled: fall through
                        # so the entry below replaces the stored one
                        print_warning("Ignoring disabled rule with duplicate sid: {}".format(self.metadata_dict[sid]['raw_rule']))
                    else:
                        print_warning("Ignoring rule with duplicate sid: {}".format(line))
                        continue

                self.metadata_dict[sid] = {'metadata': {},
                                           'msg': msg,
                                           'disabled': False if self.enable_all_rules else is_disabled_rule,
                                           'originally_disabled': is_disabled_rule,
                                           'raw_rule': line
                                           }

                metadata_pairs = []

                if metadata_str:
                    metadata_pairs.extend(metadata_str.split(','))

                if classtype and not self.ignore_classtype_keyword:
                    # add classtype from keyword as pseudo metadata key
                    metadata_pairs.append("classtype {}".format(classtype))

                if filename and not self.ignore_filename:
                    metadata_pairs.append("filename {}".format(filename))

                for kvpair in metadata_pairs:
                    # key-value pairs are case insensitive; make everything lower case
                    # also remove extra spaces before, after, and between key and value
                    kvsplit = [e.strip() for e in kvpair.lower().strip().split(' ', 1)]
                    if len(kvsplit) < 2:
                        # just a single word in metadata. warn and skip
                        print_warning("Single word metadata value found, ignoring '{}' in sid {}".format(kvpair, sid))
                        continue
                    k, v = kvsplit
                    if k == "sid" and int(v) != sid:
                        # this is in violation of the BETTER schema, throw warning
                        print_warning("line {}: 'sid' metadata key value '{}' does not match rule sid '{}'. This may lead to unexpected results".format(lineno, v, sid))
                    # normalize_better() returns a list b/c in rare cases it will produce more than one key/value pair.
                    # Because of that, make everything a(nother) list, even though most of the time it will be
                    # a one element list
                    if self.normalize:
                        kvs = self.normalize_better(k, v, sid)
                    else:
                        kvs = [kvsplit]
                    for current_key, current_value in kvs:
                        self.add_metadata(sid, current_key, current_value)
                    for existing_key in self.metadata_dict[sid]['metadata']:
                        # remove duplicate values for the same key
                        self.metadata_dict[sid]['metadata'][existing_key] = list(set(self.metadata_dict[sid]['metadata'][existing_key]))

                # add sid as pseudo metadata key unless it already exists
                if 'sid' not in self.metadata_dict[sid]['metadata']:
                    # keys and values are strings; variable "sid" is int so must
                    # be cast as str when used the same way other keys and values are used.
                    self.metadata_dict[sid]['metadata']['sid'] = [str(sid)]
                    self.keys_dict['sid'][str(sid)] = [sid]

                # add 'originally_disabled' as pseudo metadata key so it can be filtered on
                if 'originally_disabled' in self.metadata_dict[sid]['metadata']:
                    print_warning("Metadata key 'originally_disabled' found in SID {}. "
                                  "This is an internal metadata key used by Aristotle. "
                                  "The value '{}' found in the rule will be ignored.".format(sid, self.metadata_dict[sid]['metadata']['originally_disabled']))
                    self.delete_metadata(sid, 'originally_disabled')
                self.add_metadata(sid, 'originally_disabled', str(self.metadata_dict[sid]['originally_disabled']))

        except Exception as e:
            # print_exc() takes no exception argument: its first positional parameter
            # is ``limit``, and it already prints the exception being handled.
            traceback.print_exc()
            print_error("Problem loading rules: {}".format(e), fatal=True)