codes/data_processing.py (147 lines of code) (raw):
def tokenizer(x, k):
    """K-mer tokenization for DNA sequences.

    Slides a window of width ``k`` over ``x`` one position at a time and
    returns all windows as a single string in which every k-mer is followed
    by one space (so a non-empty result ends with a trailing space).

    Args:
        x: The sequence to tokenize; a string, or any sliceable sequence of
            strings.
        k: Window width; expected to be a positive integer.

    Returns:
        The space-terminated k-mers concatenated in order, or ``''`` when
        ``x`` is shorter than ``k``.
    """
    # One window per start position; range() is empty when len(x) < k.
    # The pieces are joined once instead of growing the result string one
    # character at a time, which is quadratic on long sequences.
    return ''.join(''.join(x[i:i + k]) + ' ' for i in range(len(x) - k + 1))
def pre_processing(x, genome_dict):
    """Transfer the K-mer tokenized DNA sequences into integers.

    Args:
        x: Whitespace-separated tokens in one string (the format produced by
            splitting on any run of whitespace; leading/trailing whitespace
            is ignored).
        genome_dict: Mapping from each token to the value that should
            replace it.

    Returns:
        A list with ``genome_dict[token]`` for every token of ``x``, in order.

    Raises:
        KeyError: If a token is not a key of ``genome_dict``.
    """
    # Direct indexing (not .get) so an unknown token fails loudly.
    return [genome_dict[token] for token in x.split()]
def pre_pro(x, k):
    """Discretize the ATAC signals.

    Averages ``x`` over a sliding window of ``k`` consecutive values
    (stride 1) and replaces each window mean with an integer bin label:

    * ``1`` for a mean of exactly 0,
    * ``2`` for a mean in ``(0, 0.001)``,
    * ``3`` .. ``35`` for the half-open intervals ``[lo, hi)`` between
      consecutive entries of ``bin_edges`` below,
    * ``36`` for a mean of 200 or more.

    A mean that is negative or NaN matches no bin and is stored unchanged.

    Args:
        x: Signal values; must support ``len()``, slicing and ``.copy()``
            (e.g. a list of floats). It is not modified.
        k: Window length; expected to be a positive integer.

    Returns:
        A copy of ``x`` truncated to ``len(x) - k + 1`` entries (empty when
        ``x`` is shorter than ``k``), holding the bin label of each window.
    """
    # Function-scope import keeps this block self-contained.
    from bisect import bisect_right

    # Lower bounds of bins 3..36, ascending. For a positive mean, the number
    # of edges <= mean is exactly how far past bin 2 it falls.
    bin_edges = (
        0.001, 0.01, 0.1, 0.125, 0.15, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9,
        1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0,
        14.0, 15.0, 20.0, 25.0, 35.0, 55.0, 100.0, 200.0,
    )

    # Clamp at 0: a negative slice bound would otherwise keep a prefix of x
    # and emit windows that do not exist when k > len(x) + 1.
    n_windows = max(len(x) - k + 1, 0)

    # Copy before slicing so the caller's data is never written to, even if
    # the container's slicing returns a view rather than a new object.
    out = x.copy()[:n_windows]

    for i in range(len(out)):
        mean = sum(x[i:i + k]) / k
        if mean == 0:
            out[i] = 1
        elif mean > 0:
            out[i] = 2 + bisect_right(bin_edges, mean)
        else:
            # Negative or NaN: outside the table, passed through as-is.
            out[i] = mean
    return out
def narrowPeak_Reader(path):
    """.narrowPeak file reader.

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list with one entry per ``'\\n'``-separated line of the file, each
        entry being that line's whitespace-separated fields as a list of
        strings. Blank lines (including the empty remainder after a final
        newline) yield an empty list.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, encoding="utf-8") as f:
        content = f.read()
    # Split on '\n' (not splitlines / file iteration) to keep the trailing
    # empty entry that callers may already account for.
    return [line.split() for line in content.split('\n')]
def value_fixer(CHR_value):
    """Fix the NaN values in the .bigWig files.

    Replaces every NaN entry of ``CHR_value`` with ``0.0``.

    Args:
        CHR_value: Mutable, index-assignable sequence of numbers. It is
            modified in place.

    Returns:
        The same ``CHR_value`` object, after the replacement.
    """
    # Function-scope import: no module-level ``import math`` is visible in
    # this file, and this keeps the function usable on its own.
    import math

    for idx, value in enumerate(CHR_value):
        if math.isnan(value):
            CHR_value[idx] = 0.0
    return CHR_value
def txt_Reader(path):
    """.txt file reader.

    Args:
        path: Path of the file to read; it is opened as UTF-8 text.

    Returns:
        A list holding the first whitespace-separated field of every
        non-blank line, in file order.
    """
    # Context manager so the handle is closed even if reading fails.
    with open(path, encoding="utf-8") as f:
        content = f.read()

    first_fields = []
    for line in content.split('\n'):
        fields = line.split()
        # Blank lines have no first field; skipping them avoids an
        # IndexError on, e.g., the empty string left after a final newline.
        if fields:
            first_fields.append(fields[0])
    return first_fields