2 回答

TA贡献1824条经验 获得超6个赞
完整可用的解决方案,不使用 pandas:
from itertools import product
def str_or_int(item):
    """Return ``item`` converted to ``int`` when possible, else unchanged.

    Args:
        item: The value to convert, typically one column of a split line.

    Returns:
        ``int(item)`` if the conversion succeeds; otherwise ``item`` itself.
    """
    try:
        return int(item)
    except ValueError:
        # Not an integer literal (e.g. a name such as "chr1"): keep it as is.
        return item
def correspond(big, small, output):
    """Count, for each row of ``big``, the rows of ``small`` lying inside it.

    Both input files hold whitespace-separated columns. For a ``big`` row,
    column 0 is the name and columns 2 and 3 are the lower and upper bound.
    For a ``small`` row, column 0 is the name and columns 1 and 2 are the
    two coordinates. A ``small`` row matches when the names are equal and
    both coordinates lie within the bounds (inclusive).

    For every ``big`` row with at least one match, one tab-separated line is
    written to ``output``: columns 0, 1 and 2 of the row, the match count,
    and column 5.

    Args:
        big: Path of the file with the enclosing intervals.
        small: Path of the file with the candidate intervals.
        output: Path of the result file; it is created or overwritten.

    Raises:
        IndexError: If a ``big`` row that has matches has fewer than six
            columns.
    """
    with open(big, 'r') as bigf, open(small, 'r') as smallf, open(output, 'w') as outputf:
        # The small file is scanned once per big row, so parse it a single
        # time up front instead of re-splitting every line on each pass.
        small_rows = [
            [str_or_int(field) for field in line.split()]
            for line in smallf
            if line != '\n'
        ]
        for raw_line in bigf:
            if raw_line == '\n':
                continue
            fields = raw_line.split()
            b_row = [str_or_int(field) for field in fields]
            count = 0
            for s_row in small_rows:
                try:
                    if (b_row[0] == s_row[0]
                            and b_row[3] >= s_row[1] >= b_row[2]
                            and b_row[3] >= s_row[2] >= b_row[2]):
                        count += 1
                except (IndexError, TypeError):
                    # Too few columns, or a non-numeric field compared with
                    # a number: treat the pair as a non-match.
                    continue
            # Emit the row as soon as its own scan is finished, so the last
            # row of the file is written like every other one.
            if count > 0:
                # NOTE(review): columns 0, 1, 2 and 5 are written, so the
                # upper bound (column 3) is not part of the output --
                # confirm this is the intended layout.
                outputf.write('\t'.join((fields[0], fields[1], fields[2], str(count), fields[5])) + '\n')
如果您有问题,请在评论中提问

TA贡献1963条经验 获得超6个赞
鉴于您的示例输入是这样的:
# Sample input tables, one row per line, columns separated by whitespace.
# Rows of `big` have six columns; rows of `small` have four.
big = '''chr1 transcript 2481359 2483515 - RP3-395M20.8
chr1 transcript 2487078 2492123 + TNFRSF14
chr1 transcript 2497849 2501297 + RP3-395M20.7
chr1 transcript 2512999 2515942 + RP3-395M20.9
chr1 transcript 2517930 2521041 + FAM213B
chr1 transcript 2522078 2524087 - MMEL1'''
small = '''chr1 2487088 2492113 17
chr1 100757323 100757324 19
chr1 2487099 2492023 21
chr1 100758316 100758317 41
chr1 2514000 2515742 14'''
# Replace each text table with a list of rows, each row a list of the
# column strings of one line.
big = list(map(str.split, big.splitlines()))
small = list(map(str.split, small.splitlines()))
您可以将 sum 与生成器表达式结合使用,统计 small 中符合条件的行数,然后使用 str.join 生成所需的输出:
# For every row of `big`, count the rows of `small` that share its name and
# whose two coordinates both fall inside [low, high] (bounds inclusive), then
# print one tab-separated line for each row that has at least one match.
for name_big, _, low, high, _, note in big:
    count = sum(
        1
        for name_small, n1, n2, _ in small
        # Columns are still strings here, so compare them as integers.
        if name_big == name_small
        and all(int(low) <= int(n) <= int(high) for n in (n1, n2))
    )
    if count:
        print('\t'.join((name_big, low, high, str(count), note)))
输出如下:
chr1 2487078 2492123 2 TNFRSF14
chr1 2512999 2515942 1 RP3-395M20.9
添加回答
举报