def parse_latex_table()

in data_extraction_transformation/scripts/one_time_use_scripts/comput_mean_median_max_min.py [0:0]


def parse_latex_table(latex_string, expected_numeric_cols=11):
    r"""Convert the rows of a LaTeX table into a fixed-width DataFrame.

    Every non-blank line of ``latex_string`` becomes one row. LaTeX row
    terminators (``\\``) are removed, the line is split on ``" & "``, and the
    row is padded with ``"N/A"`` or truncated so that it holds exactly
    ``expected_numeric_cols + 1`` cells: one text cell followed by the
    numeric cells. Escaped ampersands (``\&``) are not treated as column
    separators and come out as a plain ``&``.

    Args:
        latex_string: Table body, one table row per line.
        expected_numeric_cols: Number of numeric columns that follow the
            leading text column.

    Returns:
        A DataFrame with ``expected_numeric_cols + 1`` integer-labelled
        columns. Column 0 keeps the text; every other column is converted
        with ``pd.to_numeric(errors="coerce")``, so padding and unparsable
        cells become NaN. Input without any non-blank line yields an empty
        DataFrame of the same width.
    """
    total_cols = expected_numeric_cols + 1
    # NUL-delimited token: unlike a printable marker it cannot collide with
    # text that legitimately appears in a table cell.
    amp_token = "\x00AMP\x00"

    table = []
    for line in latex_string.split("\n"):
        if not line.strip():
            continue
        # Drop the LaTeX row terminator first, then protect escaped
        # ampersands so they survive the column split below.
        text = line.replace("\\\\", "").strip()
        text = text.replace("\\&", amp_token)
        cells = [cell.replace(amp_token, "&") for cell in text.split(" & ")]

        # Normalise the row width: pad short rows, truncate long ones.
        if len(cells) < total_cols:
            cells.extend(["N/A"] * (total_cols - len(cells)))
        elif len(cells) > total_cols:
            cells = cells[:total_cols]
        table.append(cells)

    if not table:
        # Nothing to parse: there is no last column to clean up, so return
        # an empty frame with the expected layout instead of failing.
        return pd.DataFrame(columns=range(total_cols))

    df = pd.DataFrame(table)

    # A stray single backslash can remain at the end of a row (and therefore
    # in the last cell); strip it so the value can still be parsed as a number.
    last_col = df.columns[-1]
    df[last_col] = df[last_col].str.replace("\\", "", regex=False).str.strip()

    # Everything after the text column is numeric; invalid entries become NaN.
    for col in df.columns[1:]:
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df