The following sample code demonstrates how you can:
- define the search criteria
- read the data file
- form an array of records
- fill a data structure with the genome data chunks
- output the data matching the search criteria
use strict;
use warnings;
use feature 'say';
# Search criteria: a record is reported only when BOTH fields match exactly.
my ($protein_id, $location) = ('ACL93470.1', '1700..2557');

# read genome data into array @chain
my @chain = read_records(\*DATA);

# build data structure $proteins
my $proteins = build_proteins(@chain);

# output data of search criteria
# Keys are sorted so the report order is deterministic if several records match.
for my $id (sort keys %$proteins) {
    my $record = $proteins->{$id};

    # A header may lack either field; treat a missing field as "no match"
    # instead of triggering an "uninitialized value" warning.
    my $id_matches       = ($record->{protein_id} // '') eq $protein_id;
    my $location_matches = ($record->{location}   // '') eq $location;
    next unless $id_matches and $location_matches;

    say "Protein ID: $protein_id\n"
      . "Location: $location\n"
      . "Protein: $record->{sequence}";
}

# read_records($fh)
#
# Reads lines from the filehandle $fh and groups them into records.
# A new record starts at every line that contains ">lcl"; all following
# lines up to the next such line belong to the same record.
#
# Returns the list of records, each one a single multi-line string
# (header line followed by its sequence lines, newlines preserved).
sub read_records {
    my ($fh) = @_;

    my @records;
    my $current;    # record being accumulated; undef until the first line is seen

    while ( my $line = <$fh> ) {
        if ( $line =~ />lcl/ ) {
            push @records, $current if defined $current;
            $current = '';
        }
        $current .= $line;
    }
    push @records, $current if defined $current;    # flush the last record

    return @records;
}

# build_proteins(@records)
#
# Turns record strings (as produced by read_records) into a hash reference
# keyed by the identifier that follows ">lcl|" in the header line.
#
# Each value is a hash reference holding:
#   - one entry per "[key=value]" field found in the record, and
#   - 'sequence': everything after the last "]" up to the end of the
#     record, newlines included.
#
# The sequence is stored under its own key so that it does not overwrite
# the 'protein' field parsed from the header.
#
# Records that do not start with ">lcl|<id>" are skipped.
sub build_proteins {
    my @records = @_;

    my %protein_for;
    for my $record (@records) {
        my ($id) = $record =~ /^>lcl\|(\S+)/;
        next unless defined $id;

        # LIMIT 2 keeps values that themselves contain '=' intact, and the
        # grep drops bracketed tokens without '=' so the key/value list
        # handed to the hash always has an even number of elements.
        my %field = map  { split /=/, $_, 2 }
                    grep { /=/ }
                    $record =~ /\[(.*?)\]/g;

        ($field{sequence}) = $record =~ /([^]]*)\z/;

        $protein_for{$id} = \%field;
    }

    return \%protein_for;
}
__DATA__
>lcl|CP001340.1_cds_ACL93468.1_1 [locus_tag=CCNA_00001] [protein=pyruvate, phosphate dikinase regulatory protein] [protein_id=ACL93468.1] [location=202..1107] [gbkey=CDS]
GTGGTTAAGCAACCGTTAACGGATGATCCACAGGAGAGTCTGGCGCAGGGCGAGAGCGAAAGGCTGCCGC
CACGCTTCGCCACCTACTTCCATATCCACTTGGTTTCAGACTCCACAGGCGAGACGCTGAACGCGATGGC
GCGGGCGGT
>lcl|CP001340.1_cds_ACL93470.1_3 [locus_tag=CCNA_00003] [protein=shikimate 5-dehydrogenase] [protein_id=ACL93470.1] [location=1700..2557] [gbkey=CDS]
ATGACCAACGCCATCACGGGCGCGGCCATTGTCGGCGGTGTCTGCGGTCAACCGATCAAGCATTCGATGA
GCCCGGTGATCCACAACGCCTGGATCGCAGCGGCCGGCCTTGACGCGGCTTATGTGCCATTCGCCCCGGC
Output
Protein ID: ACL93470.1
Location: 1700..2557
Protein:
ATGACCAACGCCATCACGGGCGCGGCCATTGTCGGCGGTGTCTGCGGTCAACCGATCAAGCATTCGATGA
GCCCGGTGATCCACAACGCCTGGATCGCAGCGGCCGGCCTTGACGCGGCTTATGTGCCATTCGCCCCGGC