src/geo_map/extract_attr_from_biosample_page.py (69 lines of code) (raw):
#!/usr/bin/env python
# encoding: utf-8
'''
*Copyright (c) 2023, Alibaba Group;
*Licensed under the Apache License, Version 2.0 (the "License");
*you may not use this file except in compliance with the License.
*You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*Unless required by applicable law or agreed to in writing, software
*distributed under the License is distributed on an "AS IS" BASIS,
*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*See the License for the specific language governing permissions and
*limitations under the License.
@author: Hey
@email: sanyuan.**@**.com
@tel: 137****6540
@datetime: 2023/3/14 18:52
@project: DeepProtFunc
@file: extract_attr_from_biosample_page.py
@desc: extract attributes from biosample page
'''
import json
import os
from bs4 import BeautifulSoup
def create_doc_from_filename(filename):
    '''
    Load an HTML file from disk and turn it into a BeautifulSoup document.
    :param filename: path of the HTML file; it is read as UTF-8 text
    :return: BeautifulSoup object built with the "html.parser" backend
    '''
    with open(filename, mode="r", encoding='utf-8') as html_file:
        markup = html_file.read()
    return BeautifulSoup(markup, "html.parser")
def parse(soup):
    '''
    Collect the title and the description-list fields of a parsed page into a dict.

    Every <dl> found under <div class="docsum"> contributes one entry keyed by
    the stripped text of its <dt>. If the matching <dd> contains table rows,
    the value is a dict mapping each row's <th> text to its <td> text;
    otherwise the value is the stripped text of the <dd>.

    :param soup: BeautifulSoup document (or tag) to read from
    :return: dict with an optional "title" key plus one key per <dt>
    :raises ValueError: when the page has no <div class="docsum">
    :raises AttributeError: when a <dl> lacks a <dt>/<dd>, or a row lacks a
        <th>/<td> (the lookup returns None and the attribute access fails)
    '''
    obj = {}
    # When several <h2 class="title"> elements exist, the last one is kept.
    for heading in soup.find_all("h2", class_="title"):
        obj["title"] = heading.text

    doc_sum = soup.find("div", class_="docsum")
    if doc_sum is None:
        # Fail with an explicit message instead of an opaque NoneType error.
        raise ValueError('no <div class="docsum"> element found in the page')

    # find_all replaces the legacy findChildren alias; both search recursively.
    for dl in doc_sum.find_all("dl"):
        dt = dl.find("dt").text.strip()
        dd = dl.find("dd")
        tr_list = dd.find_all("tr")
        if tr_list:
            # Tabular value: one key/value pair per table row.
            obj[dt] = {
                tr.find("th").text.strip(): tr.find("td").text.strip()
                for tr in tr_list
            }
        else:
            # Plain value: keep the text of the <dd> as is.
            obj[dt] = dd.text.strip()
    return obj
if __name__ == "__main__":
    # Input pages are expected to be named "<sra>_<biosample>.html".
    html_dir = "./sra_biosample_html"
    html_suffix = ".html"
    # Records are dumped with ensure_ascii=False, so the output encoding is
    # pinned to UTF-8 rather than left to the platform default.
    with open("./data/00all_SRA_run_biosample_info_res.txt", "w", encoding="utf-8") as rfp:
        for filename in os.listdir(html_dir):
            if not filename.endswith(html_suffix):
                continue
            # Strip only the trailing suffix, then split into the id parts.
            sra_biosample = filename[:-len(html_suffix)].split("_")
            if len(sra_biosample) < 2:
                # Skip a misnamed file instead of aborting the run with an IndexError.
                print("unexpected filename (want <sra>_<biosample>.html): %s" % filename)
                continue
            sra = sra_biosample[0]
            biosample = sra_biosample[1]
            try:
                # Reading is inside the try so one unreadable file cannot stop the batch.
                soup = create_doc_from_filename(os.path.join(html_dir, filename))
                obj = parse(soup)
            except Exception as e:
                print(e)
                print("fail to parse html: %s" % filename)
                continue
            obj["sra"] = sra
            obj["biosample"] = biosample
            if "Attributes" in obj:
                # One JSON object per line.
                rfp.write(json.dumps(obj, ensure_ascii=False) + "\n")
            else:
                # Pages without an "Attributes" section are only reported, not written.
                print(filename)
                print(obj)
    print("-"*25 + "Extract Attr Done" + "-"*25)