src/plot/plot_map_pie_fig4_2.py [39:129]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Call `set()` on `sns` with no arguments. Presumably `sns` is seaborn and this
# applies its default plot theme (the import is outside this excerpt -- TODO confirm).
sns.set()

# Base directory for the input files read below: the "ecology/..." and "tbl/..."
# relative paths are joined onto it with os.path.join.
dir_path = "/mnt_nas/****/rdrp/predicts/rdrp_40_extend/protein/binary_class/sefn/20230201140320-bak/checkpoint-100000/all/"

# Map every contig to the species it is listed under.
# A contig_id is the part of a virus_id that precedes the first ':'; the
# coverage table checked further down is matched against contig_id_set that way.
contig_id_2_species_id = {}
contig_id_set = set()
species_id_set = set()
with open(os.path.join(dir_path, "ecology/all_species.txt.edit"), "r") as rfp:
    reader = csv.reader(rfp, delimiter='\t')
    # The first row is not data and is skipped (a no-op if the file is empty).
    next(reader, None)
    for row in reader:
        # Column 1 is the contig id, column 2 the species id.
        contig_id, species_id = row[0].strip(), row[1].strip()
        contig_id_set.add(contig_id)
        species_id_set.add(species_id)
        contig_id_2_species_id.setdefault(contig_id, set()).add(species_id)
print("contig size: %d" % len(contig_id_set))
print("species size: %d" % len(species_id_set))

# Every contig must be listed under exactly one species.
# Raised explicitly rather than with `assert`: assertions are removed when
# Python runs with -O, and they would not say which contig is at fault.
multi_species_contigs = [
    (cid, sorted(sids))
    for cid, sids in contig_id_2_species_id.items()
    if len(sids) != 1
]
if multi_species_contigs:
    raise ValueError(
        "%d contig(s) are listed under more than one species, e.g. %r"
        % (len(multi_species_contigs), multi_species_contigs[:5])
    )

# Verify that every virus in the coverage table is a known contig, i.e. that
# the part of its id before the first ':' is present in contig_id_set.
with open(os.path.join(dir_path, "ecology/all_lib_rpm_cov.txt.cov20"), "r") as rfp:
    reader = csv.reader(rfp, delimiter="\t")
    # The first row is not data and is skipped (a no-op if the file is empty).
    next(reader, None)
    for row in reader:
        # Only the second column (the virus id) is needed for this check.
        virus_id = row[1]
        contig_part, colon = virus_id.partition(':')[:2]
        if not colon:
            # Same exception type str.index() would raise, but with the
            # location and the offending value in the message.
            raise ValueError(
                "line %d: virus id %r contains no ':'" % (reader.line_num, virus_id)
            )
        # Raised explicitly rather than with `assert`, which is removed when
        # Python runs with -O and reports nothing about the failing row.
        if contig_part not in contig_id_set:
            raise ValueError(
                "line %d: virus id %r refers to unknown contig %r"
                % (reader.line_num, virus_id, contig_part)
            )


# Collect, for every superclade, the set of "group2" values it appears with.
df = pd.read_csv(os.path.join(dir_path, "tbl/all_info.tbl"), delimiter='\t')
superclade_group = {}
group_set = set()
# Only the "Superclade" and "group2" columns are used, so walk those two
# columns in lockstep instead of building a Series per row with iterrows().
for superclade, group2 in zip(df["Superclade"], df["group2"]):
    superclade_group.setdefault(superclade, set()).add(group2)
    group_set.add(group2)
print("superclade size: %d" % len(superclade_group))
print("group size: %d" % len(group_set))
print(group_set)

# Every superclade must fall into exactly one group.
# Raised explicitly rather than with `assert`: assertions are removed when
# Python runs with -O, and they would not say which superclade is at fault.
cross_group_superclades = [
    (clade, groups)
    for clade, groups in superclade_group.items()
    if len(groups) != 1
]
if cross_group_superclades:
    raise ValueError(
        "%d superclade(s) belong to more than one group, e.g. %r"
        % (len(cross_group_superclades), cross_group_superclades[:5])
    )

# Load the coordinates of every SRA run from the comma-separated table:
#   sra_lat_lon_info : sra -> [lat, lon]
#   lat_lon_sra      : "lat###lon" key (8 decimals each) -> set of sra ids
#   sra_set          : all sra ids seen
filepath = "/mnt_nas/****/workspace/DeepProtFunc/src/geo/data/all_sra_lat_lon.csv"
sra_lat_lon_info = {}
lat_lon_sra = {}
sra_set = set()
with open(filepath, "r") as sra_fp:
    for line_no, fields in enumerate(csv.reader(sra_fp), start=1):
        # The first row is not data.
        if line_no == 1:
            continue
        # Columns, in order: sra id, name, latitude, longitude, data type.
        sra = fields[0]
        name = fields[1]
        lat = float(fields[2])
        lon = float(fields[3])
        data_type = fields[4]
        sra_lat_lon_info[sra] = [lat, lon]
        # Fixed-precision text key so equal coordinates share one entry.
        lat_lon = "%0.8f###%0.8f" % (lat, lon)
        lat_lon_sra.setdefault(lat_lon, set()).add(sra)
        sra_set.add(sra)
print(
    "sra size: %d, lat_lon_sra size: %d"
    % (len(sra_set), len(lat_lon_sra))
)


# Accumulators that start empty here (nothing in this section fills them).
# found_set:   SRAs that have a latitude/longitude
# unfound_set: SRAs that have no latitude/longitude
found_set, unfound_set = set(), set()

# species_lat_lon: the latitude/longitude values involved in each species
# lat_lon_species: species under each group (label) at each latitude/longitude
species_lat_lon, lat_lon_species = {}, {}
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -


src/plot/plot_map_pie_fig_aff4_2.py [38:126]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Call `set()` on `sns` with no arguments. Presumably `sns` is seaborn and this
# applies its default plot theme (the import is outside this excerpt -- TODO confirm).
sns.set()

# Base directory for the input files read below: the "ecology/..." and "tbl/..."
# relative paths are joined onto it with os.path.join.
dir_path = "/mnt_nas/****/rdrp/predicts/rdrp_40_extend/protein/binary_class/sefn/20230201140320-bak/checkpoint-100000/all/"

# Map every contig to the species it is listed under.
# A contig_id is the part of a virus_id that precedes the first ':'; the
# coverage table checked further down is matched against contig_id_set that way.
contig_id_2_species_id = {}
contig_id_set = set()
species_id_set = set()
with open(os.path.join(dir_path, "ecology/all_species.txt.edit"), "r") as rfp:
    reader = csv.reader(rfp, delimiter='\t')
    # The first row is not data and is skipped (a no-op if the file is empty).
    next(reader, None)
    for row in reader:
        # Column 1 is the contig id, column 2 the species id.
        contig_id, species_id = row[0].strip(), row[1].strip()
        contig_id_set.add(contig_id)
        species_id_set.add(species_id)
        contig_id_2_species_id.setdefault(contig_id, set()).add(species_id)
print("contig size: %d" % len(contig_id_set))
print("species size: %d" % len(species_id_set))

# Every contig must be listed under exactly one species.
# Raised explicitly rather than with `assert`: assertions are removed when
# Python runs with -O, and they would not say which contig is at fault.
multi_species_contigs = [
    (cid, sorted(sids))
    for cid, sids in contig_id_2_species_id.items()
    if len(sids) != 1
]
if multi_species_contigs:
    raise ValueError(
        "%d contig(s) are listed under more than one species, e.g. %r"
        % (len(multi_species_contigs), multi_species_contigs[:5])
    )

# Verify that every virus in the coverage table is a known contig, i.e. that
# the part of its id before the first ':' is present in contig_id_set.
with open(os.path.join(dir_path, "ecology/all_lib_rpm_cov.txt.cov20"), "r") as rfp:
    reader = csv.reader(rfp, delimiter="\t")
    # The first row is not data and is skipped (a no-op if the file is empty).
    next(reader, None)
    for row in reader:
        # Only the second column (the virus id) is needed for this check.
        virus_id = row[1]
        contig_part, colon = virus_id.partition(':')[:2]
        if not colon:
            # Same exception type str.index() would raise, but with the
            # location and the offending value in the message.
            raise ValueError(
                "line %d: virus id %r contains no ':'" % (reader.line_num, virus_id)
            )
        # Raised explicitly rather than with `assert`, which is removed when
        # Python runs with -O and reports nothing about the failing row.
        if contig_part not in contig_id_set:
            raise ValueError(
                "line %d: virus id %r refers to unknown contig %r"
                % (reader.line_num, virus_id, contig_part)
            )

# Collect, for every superclade, the set of "group2" values it appears with.
df = pd.read_csv(os.path.join(dir_path, "tbl/all_info.tbl"), delimiter='\t')
superclade_group = {}
group_set = set()
# Only the "Superclade" and "group2" columns are used, so walk those two
# columns in lockstep instead of building a Series per row with iterrows().
for superclade, group2 in zip(df["Superclade"], df["group2"]):
    superclade_group.setdefault(superclade, set()).add(group2)
    group_set.add(group2)
print("superclade size: %d" % len(superclade_group))
print("group size: %d" % len(group_set))
print(group_set)

# Every superclade must fall into exactly one group.
# Raised explicitly rather than with `assert`: assertions are removed when
# Python runs with -O, and they would not say which superclade is at fault.
cross_group_superclades = [
    (clade, groups)
    for clade, groups in superclade_group.items()
    if len(groups) != 1
]
if cross_group_superclades:
    raise ValueError(
        "%d superclade(s) belong to more than one group, e.g. %r"
        % (len(cross_group_superclades), cross_group_superclades[:5])
    )

# Load the coordinates of every SRA run from the comma-separated table:
#   sra_lat_lon_info : sra -> [lat, lon]
#   lat_lon_sra      : "lat###lon" key (8 decimals each) -> set of sra ids
#   sra_set          : all sra ids seen
filepath = "/mnt_nas/****/workspace/DeepProtFunc/src/geo/data/all_sra_lat_lon.csv"
sra_lat_lon_info = {}
lat_lon_sra = {}
sra_set = set()
with open(filepath, "r") as sra_fp:
    for line_no, fields in enumerate(csv.reader(sra_fp), start=1):
        # The first row is not data.
        if line_no == 1:
            continue
        # Columns, in order: sra id, name, latitude, longitude, data type.
        sra = fields[0]
        name = fields[1]
        lat = float(fields[2])
        lon = float(fields[3])
        data_type = fields[4]
        sra_lat_lon_info[sra] = [lat, lon]
        # Fixed-precision text key so equal coordinates share one entry.
        lat_lon = "%0.8f###%0.8f" % (lat, lon)
        lat_lon_sra.setdefault(lat_lon, set()).add(sra)
        sra_set.add(sra)
print(
    "sra size: %d, lat_lon_sra size: %d"
    % (len(sra_set), len(lat_lon_sra))
)


# Accumulators that start empty here (nothing in this section fills them).
# found_set:   SRAs that have a latitude/longitude
# unfound_set: SRAs that have no latitude/longitude
found_set, unfound_set = set(), set()
# species_lat_lon: the latitude/longitude values involved in each species
# lat_lon_species: species under each group (label) at each latitude/longitude
species_lat_lon, lat_lon_species = {}, {}
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -