You can use a defaultdict and a regex for this:
import re
from collections import defaultdict
# Group the lines of test.fastq by the "@..." header line that precedes them.
#
# Result: `samples` maps each identifier (the header text without its leading
# "@") to the list of lines that follow it, up to the next header.

# Header lines are the ones that start with "@". Tighten this pattern
# (e.g. r"^@sample\d+") if non-header lines may also start with "@".
header_pattern = re.compile(r"^@")

samples = defaultdict(list)  # identifier -> lines belonging to that section
identifier = ""  # lines seen before the first header are grouped under ""

# Iterate over the file directly instead of read().split("\n"): it streams the
# input and does not produce a phantom empty last line when the file ends with
# a newline.
with open("test.fastq", "r") as f:
    for raw_line in f:
        line = raw_line.rstrip("\n")
        if header_pattern.match(line):
            # Drop only the leading "@"; any later "@" is part of the name.
            identifier = line[1:]
        else:
            # Add the line to the section it belongs to.
            samples[identifier].append(line)
Now all you have to do is save the contents of this default dictionary into multiple files:
# Write every sample to its own file, one collected line per output line.
# The file is named "<sample_name>.fastq" (you might want to change the naming).
# NOTE: sample_name is used verbatim as the file name, so a name containing a
# path separator would point outside the current directory.
for sample_name, sample_items in samples.items():
    text = "\n".join(sample_items)
    # Terminate the last line so every output file ends with a newline;
    # skip this when the text is empty or already ends with one.
    if text and not text.endswith("\n"):
        text += "\n"
    with open(f"{sample_name}.fastq", "w") as f:
        f.write(text)
It might also be helpful to include @sample_name at the beginning of each file (as its first line), but I'm not sure you want that, so I haven't added it.
Note that you can adjust the regex to match only @sample[number] headers instead of every line that starts with @.
If you want that, use the pattern ^@sample\d+ instead, e.g. re.match(r"^@sample\d+", line).