# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import copy
import glob
from typing import (
    Any,
    Dict,
    List,
    Optional,
)

import git
import yaml

from generators import ecs_helpers
from _types import (
    Field,
    FieldEntry,
    FieldNestedEntry,
    MultiField,
    SchemaDetails,
)

# Loads main ECS schemas and optional additional schemas.
# They are deeply nested, then merged together.
# This script doesn't fill in defaults other than the bare minimum for a predictable
# deeply nested structure. It doesn't concern itself with what "should be allowed"
# in being a good ECS citizen. It just loads things and merges them together.
# The deeply nested structure returned by this script looks like this.
#
# [schema name]: {
# 'schema_details': {
# 'reusable': ...
# },
# 'field_details': {
# 'type': ...
# },
# 'fields': {
# [field name]: {
# 'field_details': { ... }
# 'fields': {
#
# (dotted key names replaced by deep nesting)
# [field name]: {
# 'field_details': { ... }
# 'fields': {
# }
# }
# }
# }
# }
# Schemas at the top level always have all 3 keys populated.
# Leaf fields only have 'field_details' populated.
# Any intermediate field with other fields nested within it has 'fields' populated.
# Note that intermediate fields rarely have 'field_details' populated, but it's supported.
# Examples of this are 'dns.answers', 'observer.egress'.
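#
# Illustrative example (hypothetical schema, not one that ships with ECS): a flat
# definition like
#
#   - name: example
#     title: Example
#     fields:
#       - name: origin.ip
#         type: ip
#
# loads into roughly:
#
#   {'example': {
#       'schema_details': {'title': 'Example'},
#       'field_details': {'name': 'example', 'node_name': 'example'},
#       'fields': {
#           'origin': {
#               'field_details': {'name': 'origin', 'node_name': 'origin',
#                                 'type': 'object', 'intermediate': True},
#               'fields': {
#                   'ip': {'field_details': {'name': 'origin.ip',
#                                            'node_name': 'ip', 'type': 'ip'}}}}}}}
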
EXPERIMENTAL_SCHEMA_DIR = 'experimental/schemas'

def load_schemas(
    ref: Optional[str] = None,
    included_files: Optional[List[str]] = None
) -> Dict[str, FieldEntry]:
"""Loads ECS and custom schemas. They are returned deeply nested and merged."""
# ECS fields (from git ref or not)
    schema_files_raw: Dict[str, FieldNestedEntry] = (
        load_schemas_from_git(ref) if ref
        else load_schema_files(ecs_helpers.ecs_files())
    )
fields: Dict[str, FieldEntry] = deep_nesting_representation(schema_files_raw)
# Custom additional files
    if included_files:
        print('Loading user defined schemas: {0}'.format(included_files))
        # Copy so the .remove() below doesn't mutate the caller's list.
        included_files = list(included_files)
        # If --ref was provided and --include contains the experimental schemas,
        # load those schemas from the same git ref.
if ref and EXPERIMENTAL_SCHEMA_DIR in included_files:
exp_schema_files_raw: Dict[str, FieldNestedEntry] = load_schemas_from_git(
ref, target_dir=EXPERIMENTAL_SCHEMA_DIR)
exp_fields: Dict[str, FieldEntry] = deep_nesting_representation(exp_schema_files_raw)
fields = merge_fields(fields, exp_fields)
included_files.remove(EXPERIMENTAL_SCHEMA_DIR)
# Remaining additional custom files (never from git ref)
custom_files: List[str] = ecs_helpers.glob_yaml_files(included_files)
custom_fields: Dict[str, FieldEntry] = deep_nesting_representation(load_schema_files(custom_files))
fields = merge_fields(fields, custom_fields)
return fields
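
# Illustrative usage (the ref and paths are made-up examples):
#   fields = load_schemas()  # local ECS schemas only
#   fields = load_schemas(ref='v8.6.0', included_files=['custom/*.yml'])
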
def load_schema_files(files: List[str]) -> Dict[str, FieldNestedEntry]:
fields_nested: Dict[str, FieldNestedEntry] = {}
for f in files:
new_fields: Dict[str, FieldNestedEntry] = read_schema_file(f)
fields_nested = ecs_helpers.safe_merge_dicts(fields_nested, new_fields)
return fields_nested
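
# e.g. (hypothetical paths): load_schema_files(['schemas/base.yml', 'schemas/client.yml'])
# returns {'base': {...}, 'client': {...}}, merging via ecs_helpers.safe_merge_dicts so
# that files redefining an already-loaded schema name fail loudly.
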
def load_schemas_from_git(
    ref: str,
    target_dir: str = 'schemas'
) -> Dict[str, FieldNestedEntry]:
tree: git.objects.tree.Tree = ecs_helpers.get_tree_by_ref(ref)
fields_nested: Dict[str, FieldNestedEntry] = {}
    # The target dir may be missing from the git ref; fail loudly below if it is.
if ecs_helpers.path_exists_in_git_tree(tree, target_dir):
for blob in tree[target_dir].blobs:
if blob.name.endswith('.yml'):
new_fields: Dict[str, FieldNestedEntry] = read_schema_blob(blob, ref)
fields_nested = ecs_helpers.safe_merge_dicts(fields_nested, new_fields)
else:
raise KeyError(f"Target directory './{target_dir}' not present in git ref '{ref}'!")
return fields_nested
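
# e.g. (the ref below is a made-up example): load_schemas_from_git('v8.6.0') reads every
# .yml blob under ./schemas at that ref, without touching the working tree.
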
def read_schema_file(file_name: str) -> Dict[str, FieldNestedEntry]:
"""Read a raw schema yml file into a dict."""
with open(file_name) as f:
raw: List[FieldNestedEntry] = yaml.safe_load(f.read())
return nest_schema(raw, file_name)

def read_schema_blob(
blob: git.objects.blob.Blob,
ref: str
) -> Dict[str, FieldNestedEntry]:
"""Read a raw schema yml git blob into a dict."""
content: str = blob.data_stream.read().decode('utf-8')
raw: List[FieldNestedEntry] = yaml.safe_load(content)
file_name: str = "{} (git ref {})".format(blob.name, ref)
return nest_schema(raw, file_name)

def nest_schema(raw: List[FieldNestedEntry], file_name: str) -> Dict[str, FieldNestedEntry]:
"""
Raw schema files are an array of schema details: [{'name': 'base', ...}]
This function loops over the array (usually 1 schema per file) and turns it into
a dict with the schema name as the key: { 'base': { 'name': 'base', ...}}
"""
fields: Dict[str, FieldNestedEntry] = {}
for schema in raw:
if 'name' not in schema:
raise ValueError("Schema file {} is missing mandatory attribute 'name'".format(file_name))
fields[schema['name']] = schema
return fields

def deep_nesting_representation(fields: Dict[str, FieldNestedEntry]) -> Dict[str, FieldEntry]:
deeply_nested: Dict[str, FieldEntry] = {}
for (name, flat_schema) in fields.items():
# We destructively select what goes into schema_details and child fields.
# The rest is 'field_details'.
flat_schema = flat_schema.copy()
flat_schema['node_name'] = flat_schema['name']
# Schema-only details. Not present on other nested field groups.
schema_details: SchemaDetails = {}
for schema_key in ['root', 'group', 'reusable', 'title']:
if schema_key in flat_schema:
schema_details[schema_key] = flat_schema.pop(schema_key)
nested_schema = nest_fields(flat_schema.pop('fields', []))
# Re-assemble new structure
deeply_nested[name] = {
'schema_details': schema_details,
# What's still in flat_schema is the field_details for the field set itself
'field_details': flat_schema,
'fields': nested_schema['fields']
}
return deeply_nested
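
# Illustrative split (hypothetical input):
#   {'client': {'name': 'client', 'title': 'Client', 'group': 2, 'type': 'group', 'fields': [...]}}
# becomes:
#   {'client': {'schema_details': {'title': 'Client', 'group': 2},
#               'field_details': {'name': 'client', 'node_name': 'client', 'type': 'group'},
#               'fields': {...}}}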

def nest_fields(field_array: List[Field]) -> Dict[str, Dict[str, FieldEntry]]:
schema_root: Dict[str, Dict[str, FieldEntry]] = {'fields': {}}
for field in field_array:
nested_levels: List[str] = field['name'].split('.')
parent_fields: List[str] = nested_levels[:-1]
leaf_field: str = nested_levels[-1]
# "nested_schema" is a cursor we move within the schema_root structure we're building.
# Here we reset the cursor for this new field.
nested_schema = schema_root['fields']
for idx, level in enumerate(parent_fields):
nested_schema.setdefault(level, {})
# Where nested fields will live
nested_schema[level].setdefault('fields', {})
            # Ensure the intermediate parent has field_details; 'type: object' is made explicit below.
nested_schema[level].setdefault('field_details', {})
field_details = nested_schema[level]['field_details']
field_details['node_name'] = level
# Respect explicitly defined object fields
if 'type' in field_details and field_details['type'] in ['object', 'nested']:
field_details.setdefault('intermediate', False)
else:
field_details.setdefault('type', 'object')
field_details.setdefault('name', '.'.join(parent_fields[:idx + 1]))
field_details.setdefault('intermediate', True)
            # Move the nested_schema cursor one level deeper.
nested_schema = nested_schema[level]['fields']
nested_schema.setdefault(leaf_field, {})
        # Store the leaf field's name as 'node_name'. The dotted 'name' is kept as-is;
        # the flat_name is already computed.
field['node_name'] = leaf_field
nested_schema[leaf_field]['field_details'] = field
return schema_root
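
# Illustrative example (hypothetical field): nest_fields([{'name': 'answers.ttl', 'type': 'long'}])
# returns:
#   {'fields': {'answers': {
#       'field_details': {'name': 'answers', 'node_name': 'answers',
#                         'type': 'object', 'intermediate': True},
#       'fields': {'ttl': {'field_details': {'name': 'answers.ttl',
#                                            'node_name': 'ttl', 'type': 'long'}}}}}}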

def array_of_maps_to_map(array_vals: List[MultiField]) -> Dict[str, MultiField]:
ret_map: Dict[str, MultiField] = {}
for map_val in array_vals:
name: str = map_val['name']
        # If multiple entries share the same 'name' within one custom definition, the last one wins.
ret_map[name] = map_val
return ret_map

def map_of_maps_to_array(map_vals: Dict[str, MultiField]) -> List[MultiField]:
    return sorted(map_vals.values(), key=lambda k: k['name'])

def dedup_and_merge_lists(list_a: List[MultiField], list_b: List[MultiField]) -> List[MultiField]:
list_a_map: Dict[str, MultiField] = array_of_maps_to_map(list_a)
list_a_map.update(array_of_maps_to_map(list_b))
return map_of_maps_to_array(list_a_map)
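
# Illustrative example (hypothetical multi_fields): merging
#   [{'name': 'text', 'type': 'text'}]  with  [{'name': 'text', 'type': 'match_only_text'}]
# yields [{'name': 'text', 'type': 'match_only_text'}]: on a name clash, list_b's entry wins.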

def merge_fields(a: Dict[str, FieldEntry], b: Dict[str, FieldEntry]) -> Dict[str, FieldEntry]:
"""Merge ECS field sets with custom field sets."""
a = copy.deepcopy(a)
b = copy.deepcopy(b)
for key in b:
if key not in a:
a[key] = b[key]
continue
# merge field details
if 'normalize' in b[key]['field_details']:
a[key].setdefault('field_details', {})
a[key]['field_details'].setdefault('normalize', [])
a[key]['field_details']['normalize'].extend(b[key]['field_details'].pop('normalize'))
if 'multi_fields' in b[key]['field_details']:
a[key].setdefault('field_details', {})
a[key]['field_details'].setdefault('multi_fields', [])
a[key]['field_details']['multi_fields'] = dedup_and_merge_lists(
a[key]['field_details']['multi_fields'], b[key]['field_details']['multi_fields'])
            # Delete multi_fields from b; otherwise the update() below would overwrite
            # a's merged multi_fields with b's original, unmerged contents.
            del b[key]['field_details']['multi_fields']
a[key]['field_details'].update(b[key]['field_details'])
# merge schema details
if 'schema_details' in b[key]:
asd = a[key]['schema_details']
bsd = b[key]['schema_details']
if 'reusable' in b[key]['schema_details']:
asd.setdefault('reusable', {})
if 'top_level' in bsd['reusable']:
asd['reusable']['top_level'] = bsd['reusable']['top_level']
else:
asd['reusable'].setdefault('top_level', True)
if 'order' in bsd['reusable']:
asd['reusable']['order'] = bsd['reusable']['order']
asd['reusable'].setdefault('expected', [])
                asd['reusable']['expected'].extend(bsd['reusable'].get('expected', []))
bsd.pop('reusable')
asd.update(bsd)
# merge nested fields
if 'fields' in b[key]:
a[key].setdefault('fields', {})
a[key]['fields'] = merge_fields(a[key]['fields'], b[key]['fields'])
return a
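
# Illustrative merge (hypothetical field sets): if both a and b define 'client' and b's
# field_details carry 'normalize': ['lowercase'], the merged entry extends a's 'normalize'
# list with it; any other scalar field_details in b simply overwrite a's.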

def load_yaml_file(file_name: str) -> Any:
    with open(file_name) as f:
        return yaml.safe_load(f.read())

# You know, for silent tests
def warn(message: str) -> None:
print(message)

def eval_globs(globs: List[str]) -> List[str]:
"""Accepts an array of glob patterns or file names, returns the array of actual files"""
all_files = []
for g in globs:
if g.endswith('/'):
g += '*'
new_files = glob.glob(g)
        if not new_files:
warn("{} did not match any files".format(g))
else:
all_files.extend(new_files)
return all_files
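
# e.g. (hypothetical layout): eval_globs(['custom/']) expands the pattern to 'custom/*'
# and warns, rather than failing, when a pattern matches no files.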

def load_definitions(file_globs: List[str]) -> List[Any]:
sets = []
for f in ecs_helpers.glob_yaml_files(file_globs):
raw = load_yaml_file(f)
sets.append(raw)
return sets