def parse_html_to_dataframe()

in src/utils.py [0:0]


def parse_html_to_dataframe(path: str, dataset_type: str) -> pd.DataFrame:
    """
    Extract per-tag features from the html files of one fathom dataset split.

    Every regular file found directly inside ``<path>/<dataset_type>`` is
    parsed with BeautifulSoup ("html.parser") and each tag in it produces one
    row. The fathom label is read from the attribute 'data-fathom="xyz"';
    tags without that attribute are labelled 'other'. Entries that are not
    regular files (e.g. sub-folders) are skipped.

    @param path: folder path that contains the dataset_type sub-folder
    @param dataset_type: fathom dataset (one of 'training', 'validation',
        'testing'); used as the sub-folder name and stored in 'ml_dataset'
    @return: pandas dataframe with one row per tag and one column per
        feature, with missing values replaced by ''
    """
    features = [
        'file_id',
        'language',
        'name',
        'class',
        'id',
        'maxlen',
        'type',
        'labels',  # fathom_types
        'autocomplete_text',
        'placeholder_text',
        'ml_dataset',
        'html_cleaned',
    ]

    features_dict = {feature: [] for feature in features}
    dataset_dir = os.path.join(path, dataset_type)
    for filename in os.listdir(dataset_dir):
        print(filename)
        # The language is the part of the file name before the first '_'.
        language = filename.split('_')[0]
        file_path = os.path.join(dataset_dir, filename)
        if not os.path.isfile(file_path):
            continue
        # Context manager guarantees the handle is closed for every file;
        # undecodable bytes are replaced instead of raising.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as html_file:
            source_code = html_file.read()
        soup = BeautifulSoup(source_code, "html.parser")
        for tag in soup.find_all():
            attrs = tag.attrs
            features_dict['file_id'].append(filename)
            features_dict['language'].append(language)
            features_dict['name'].append(tag.name)
            features_dict['class'].append(attrs.get('class', []))
            features_dict['id'].append(attrs.get('id', ''))
            features_dict['maxlen'].append(attrs.get('maxlength', ''))
            features_dict['type'].append(attrs.get('type', ''))
            features_dict['labels'].append(attrs.get('data-fathom', 'other'))
            features_dict['autocomplete_text'].append(attrs.get('autocomplete', ''))
            features_dict['placeholder_text'].append(attrs.get('placeholder', ''))
            features_dict['ml_dataset'].append(dataset_type)
            # Cleaned markup of the tag, truncated to STR_LIMIT characters.
            features_dict['html_cleaned'].append(clean_html(str(tag))[:STR_LIMIT])
    return pd.DataFrame(features_dict).fillna('')