in src/utils.py [0:0]
def parse_html_to_dataframe(path: str, dataset_type: str) -> pd.DataFrame:
    """Extract per-tag features from a directory of Fathom-labelled HTML files.

    Walks every file directly under ``path/dataset_type``, parses each one with
    BeautifulSoup, and emits one row per HTML tag. Fathom ground-truth labels are
    read from the ``data-fathom="xyz"`` attribute; tags without it are labelled
    ``'other'``. The filename prefix before the first ``'_'`` is taken as the
    document language.

    :param path: folder containing the dataset_type subdirectories
    :param dataset_type: which Fathom split to read ('training', 'validation',
        'testing'); also recorded in the ``ml_dataset`` column of every row
    :return: DataFrame with one row per tag and columns: file_id, language,
        name, class, id, maxlen, type, labels, autocomplete_text,
        placeholder_text, ml_dataset, html_cleaned (NaNs replaced with '')
    """
    features = [
        'file_id',
        'language',
        'name',
        'class',
        'id',
        'maxlen',
        'type',
        'labels',  # fathom_types
        'autocomplete_text',
        'placeholder_text',
        'ml_dataset',
        'html_cleaned'
    ]
    features_dict = {feature: [] for feature in features}
    # Build the directory path once with os.path.join (portable, and keeps it
    # consistent with the per-file join below — the original mixed f-string '/'
    # concatenation with os.path.join).
    dataset_dir = os.path.join(path, dataset_type)
    for filename in os.listdir(dataset_dir):
        print(filename)
        # Filename convention: '<language>_<rest>.html'
        language = filename.split('_')[0]
        file_path = os.path.join(dataset_dir, filename)
        if not os.path.isfile(file_path):
            # Skip subdirectories and other non-file entries.
            continue
        # 'with' guarantees the handle is closed — the original opened the file
        # and never closed it, leaking a file descriptor per document.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as html_file:
            source_code = html_file.read()
        soup = BeautifulSoup(source_code, "html.parser")
        for tag in soup.find_all():
            features_dict['file_id'].append(filename)
            features_dict['language'].append(language)
            features_dict['name'].append(tag.name)
            # 'class' is multi-valued in HTML, hence the list default.
            features_dict['class'].append(tag.attrs.get('class', []))
            features_dict['id'].append(tag.attrs.get('id', ''))
            features_dict['maxlen'].append(tag.attrs.get('maxlength', ''))
            features_dict['type'].append(tag.attrs.get('type', ''))
            # Fathom ground truth; unlabelled tags fall back to 'other'.
            features_dict['labels'].append(tag.attrs.get('data-fathom', 'other'))
            features_dict['autocomplete_text'].append(tag.attrs.get('autocomplete', ''))
            features_dict['placeholder_text'].append(tag.attrs.get('placeholder', ''))
            features_dict['ml_dataset'].append(dataset_type)
            # Truncate cleaned markup to STR_LIMIT chars to bound row size.
            features_dict['html_cleaned'].append(clean_html(str(tag))[:STR_LIMIT])
    return pd.DataFrame(features_dict).fillna('')