in src/read_log_file.py [0:0]
def get_parsed_data_from_file(logfile, ignore_crashes = False):
    """Parse a GC log file into a pandas DataFrame, one column per named capture group.

    Parameters:
        logfile        : path to the log file handed to the line matcher.
        ignore_crashes : when True, detected timing errors are repaired
                         instead of being left in the returned frame.

    Returns:
        pd.DataFrame with missing cells normalized to None.
    """
    regex_capture_string, column_names, data_types = get_parsing_groups()
    table = __manyMatch_LineSearch(regex_capture_string, logfile)

    # Map each named column to a pair: ([capture-group indices], [datatypes]).
    # A single column name may own several capture groups.
    table_groups = {}
    group_number = 0
    for column, data_type in zip(column_names, data_types):
        if column:
            indices, dtypes = table_groups.setdefault(column, ([], []))
            indices.append(group_number)
            dtypes.append(data_type)
            # Only advance the index for named groups: unnamed entries do not
            # consume a capture-group slot.
            group_number += 1

    # Replace each (indices, datatypes) pair with the materialized column:
    # non-zero values pulled from `table`, cast to the right type, None where
    # a row has no value.
    for column in table_groups:
        indices, dtypes = table_groups[column]
        table_groups[column] = __create_column(table, # data
                                               dtypes, # DATATYPES
                                               indices) # table indicies

    ### Special Case ###
    # Safepoint rows carry no EventType of their own; label them "Safepoint"
    # rather than leaving None.
    table_groups["EventType"] = set_safepoints_eventype(
        table_groups["EventType"],
        table_groups["SafepointName"],
        table_groups["TimeToStopApplication_seconds"])

    frame = pd.DataFrame(table_groups)
    # Normalize missing values for downstream consumers: NaN -> None.
    frame.replace({np.nan: None}, inplace=True)
    # Optionally repair timing inconsistencies (e.g. from a crashed run).
    if ignore_crashes and not assert_no_timing_errors(frame):
        frame = fix_timing_errors(frame)
    return frame