in data_extraction_transformation/scripts/one_time_use_scripts/comput_mean_median_max_min.py [0:0]
def parse_latex_table(latex_string, expected_numeric_cols=11):
    """Convert the rows of a LaTeX table into a DataFrame.

    Every non-blank line of ``latex_string`` is treated as one table row whose
    cells are separated by ``" & "``. The first cell is kept as text; the
    remaining cells are converted to numbers.

    Args:
        latex_string: Table body, one row per line. LaTeX line breaks
            (``\\\\``) are removed and escaped ampersands (``\\&``) are turned
            back into a plain ``&``.
        expected_numeric_cols: Number of numeric columns that follow the
            text column. Rows with fewer cells are padded with ``"N/A"``
            (which becomes NaN); rows with more cells are truncated.

    Returns:
        A DataFrame with ``expected_numeric_cols + 1`` integer-labelled
        columns. Column 0 holds the text label; every other column is
        numeric, with unparsable cells coerced to NaN. Blank input yields an
        empty DataFrame with the same columns.
    """
    total_cols = expected_numeric_cols + 1

    table = []
    for line in latex_string.split("\n"):
        if not line.strip():
            continue

        # Drop the LaTeX row terminator ("\\") before looking at the cells.
        line = line.replace("\\\\", "").strip()

        # An escaped ampersand ("\&") can never match the " & " delimiter,
        # so it is safe to split first and unescape each cell afterwards.
        cells = [cell.replace("\\&", "&") for cell in line.split(" & ")]

        # Normalise the row width: pad short rows, truncate long ones.
        shortfall = total_cols - len(cells)
        if shortfall > 0:
            cells.extend(["N/A"] * shortfall)
        else:
            cells = cells[:total_cols]

        # Stray backslashes left in the final cell would otherwise make it
        # fail numeric conversion.
        if cells:
            cells[-1] = cells[-1].replace("\\", "").strip()

        table.append(cells)

    if not table:
        # No rows: still return the expected column layout instead of failing.
        return pd.DataFrame(columns=range(total_cols))

    df = pd.DataFrame(table)

    # Everything except the leading text column is numeric; invalid entries
    # (including the "N/A" padding) become NaN.
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors="coerce")

    return df