0

I want to find positions that overlaps with two coordinates and also that both are in the same chromosomes.

The file with the positions looks like this


with open(file_path, 'r') as f:
    lines = [l for l in f if not l.startswith('#')]
print(lines)
['chr1\t36931696\t.\tT\t.\t100\tPASS\tDP=839\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/.:100:830:839:0.0107:24:-100.0000:0.0071\n', 'chr2\t25457280\t.\tA\t.\t100\tPASS\tDP=1410\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:19:1403:1410:0.0050:24:-100.0000:0.0014\n', '\n', '\n']

# I have limited the file to have only two lines. But actually this normally have 100k lines

And the file with the intervals looks like this

print(bedregions)
[('chr1', 36931694, 36931909, 'CSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)'), ('chr2', 25466989, 25467211, 'DNMT3A.CDS.17.line.57.merged--with.DNMT3A.CDS.16.li.probe--coordinates(25466989-25467211)')]

# I have limited this file as well to have two tuples, this has actually 500 tuples

This is what I have been trying

def roi2(file_path,bedregions):
    with open(file_path, 'r') as f:
        lines = [l for l in f if not l.startswith('#')]
        
    chr2position = {}
    for position, line in enumerate(lines):
        # If there is a empty line this will give a empty list
        # Amd the following split will give a out of range error
        if (len(line)) == 1:
            break
        # Take the chr
        chr = line.strip().split()[0]
        
        if chr not in chr2position:
            chr2position[chr] = position
    filtered_lines =[]
    for element  in bedregions:
        ch, start, end, probe_name = element
        
        for lineindex in range(start + chr2position[chr], end + chr2position[chr] ):
            
            filtered_lines.append(lines[lineindex])
# This return a error in the last line. IndexError list index out of range

  • add if condition `lineindex < len(lines)` – deadshot Jul 30 '22 at 13:13
  • don't use `chr` as a variable name it's builtin function name use some other name – deadshot Jul 30 '22 at 13:16
  • please add the link to the other question where you asked that already. or delete one of them to merge them. like this is just not good. https://stackoverflow.com/questions/73170845/find-points-in-intervals-with-a-extra-condition/73173220?noredirect=1#comment129236479_73173220 – Rabinzel Jul 30 '22 at 13:22

1 Answers1

0

This should do what you want considering the data structure you mentioned

f = open(file_path, 'r')
lines = f.readlines()
chr2base2index = dict()
for index,line in enumerate(lines):
    if (len(line)) == 1:
            break
    if line[0] == '#':            
            continue
    handle = line.strip().split()
    chrm, base = handle[0], int(handle[1])
    if chrm not in chr2base2index:
        chr2base2index[chrm] = dict()
    if base not in chr2base2index[chrm]:
        chr2base2index[chrm][base] = index

filtered_lines = []
for chrm, start, end, probe_name in bedregions:
    if chrm not in chr2base2index:
        print(f'Chromosome {chrm} not found')
        continue
    for base in range(start, end):
        index = chr2base2index[chrm].get(base, None)
        if index != None:
            filtered_lines.append('\t'.join(lines[index].strip().split() + [probe_name]))
filtered_lines


['chr1\t36931696\t.\tT\t.\t100\tPASS\tDP=839\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/.:100:830:839:0.0107:24:-100.0000:0.0071\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931697\t.\tT\t.\t100\tPASS\tDP=832\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:15:829:832:0.0036:24:-100.0000:0.0154\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931698\t.\tT\t.\t100\tPASS\tDP=837\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:36:836:837:0.0012:24:-100.0000:0.0095\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931699\t.\tA\t.\t100\tPASS\tDP=836\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:36:835:836:0.0012:24:-100.0000:0.0107\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931700\t.\tC\t.\t100\tPASS\tDP=818\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:14:814:818:0.0049:24:-100.0000:0.0320\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931701\t.\tA\t.\t100\tPASS\tDP=841\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:20:838:841:0.0036:24:-100.0000:0.0047\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931702\t.\tA\t.\t100\tPASS\tDP=825\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:19:822:825:0.0036:24:-100.0000:0.0237\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931703\t.\tT\t.\t100\tPASS\tDP=833\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:26:832:833:0.0012:24:-100.0000:0.0142\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)',
 'chr1\t36931704\t.\tA\t.\t100\tPASS\tDP=833\tGT:GQ:AD:DP:VF:NL:SB:NC\t0/0:11:829:833:0.0048:24:-100.0000:0.0142\tCSF3R.exon.17.line.1.chr1.36931697.36932509--tile--1.probe--coordinates(36931694-36931909)']



Work with as many chromosomes as found in the interval file. If a chromosome is given is the interval file but not in the searched file, this would give you and error so I have put a line that solve that issue and also inform you if that happen.

Results are given in a list, one line per element but this can be change in the last line of the code.