''' 自编TFR dat格式结果文件统计脚本 2023.3.12 ''' from collections import Counter
# 数据过滤 loci_start = [] loci_finish = [] total_line = [] withopen('./genome.baima.fasta.2.7.7.80.10.50.500.dat', 'r') asinput: for i ininput.readlines()[15:]: if i.find('Sequence') != -1or i.find('Parameters') != -1: continue else: lst = i.strip().split(' ') iflen(lst) < 15: continue iflen(loci_start) < 1andlen(loci_finish) < 1: # 处理列表为空的情况 loci_start.append(lst[0]) loci_finish.append(lst[1]) total_line.append(lst) else: if lst[0] != loci_start[-1] and lst[1] != loci_finish[-1]: # 开始结束位点都不同,则记录数据 loci_start.append(lst[0]) loci_finish.append(lst[1]) total_line.append(lst) elif lst[0] == loci_start[-1]: # 开始位点相同,跳过 continue elif lst[1] == loci_finish[-1]: # 结束位点相同,删除列表最后一个元素并加入新元素 del loci_start[-1] del loci_finish[-1] del total_line[-1] loci_start.append(lst[0]) loci_finish.append(lst[1]) total_line.append(lst)
# 提取motif长度和重复序列长度 motif_lst = [] leng_lst = [] for i in total_line: motif_lst.append(i[2]) leng = int(i[1]) - int(i[0]) + 1 leng_lst.append(leng)
# 统计相同motif的总长度和所有重复序列总长度 total_leng = {} motif_sum = 0 for i inrange(len(motif_lst)): item = motif_lst[i] motif_sum += leng_lst[i] if item in total_leng: total_leng[item] += leng_lst[i] else: total_leng[item] = leng_lst[i]
# 统计motif-mer数量,总数,占比 count_motif = Counter(motif_lst) count_lst = list(count_motif.items()) count_lst.sort(key = lambda x : x[1], reverse = True) lst_ = [] hit_num = 0 for i in count_lst: hit_num += i[1] for i in count_lst: ls = i[0] lst1 = list(i) if ls in total_leng: lst1.append(total_leng[ls]) precentage = '%.2f%%'%(100 * i[1] / hit_num) lst1.append(precentage) lst_.append(lst1)