In my comment I said that parsing is fairly easy. Here is how it could be done. As the question lacks a proper specification of the file format, I will assume the following:
The file consists of properties, which have values:
document ::= property*
property ::= word "(" value ("," value)* ")"
A value is a double-quoted string containing either numbers separated by commas, or a single word:
value ::= '"' ( word | number ("," number)* ) '"'
Spaces, backslashes, and comments are irrelevant.
Here is a possible implementation; I will not go into the details of explaining how to write a simple parser.
package Parser;
use strict; use warnings;
# Tokenize $data and hand the resulting token list to parse_document().
#
# Arguments:
#   $data - the complete file contents as a single string.
# Returns:
#   whatever parse_document() returns for the token list.
# Dies:
#   when the input contains something that is not a known token; the message
#   shows up to 10 characters starting at the offending position.
#
# Each token is an array ref [ type => value ] where type is one of
# "symbol" (one of " , ( )), "number" or "word".
sub parse {
    my ($data) = @_;

    # perform tokenization
    pos($data) = 0;
    my $length = length $data;
    my @tokens;
    while (pos($data) < $length) {
        # Whitespace, backslashes and /* ... */ comments carry no meaning.
        # The /s flag lets "." match newlines, so comments that span several
        # lines are skipped as well instead of aborting the tokenizer.
        next if $data =~ m{\G\s+}gc
             or $data =~ m{\G\\}gc
             or $data =~ m{\G/[*].*?[*]/}gcs;

        if ($data =~ m/\G([",()])/gc) {
            push @tokens, [symbol => $1];
        }
        elsif ($data =~ m/\G([0-9]+[.][0-9]+)/gc) {
            # 0+ turns the matched text into a real number ("10.00" -> 10)
            push @tokens, [number => 0+$1];
        }
        elsif ($data =~ m/\G([0-9]+)(?!\w)/gc) {
            # Plain integers are numbers too. The lookahead keeps tokens such
            # as "12abc" out of this branch so they are still read as words.
            push @tokens, [number => 0+$1];
        }
        elsif ($data =~ m/\G(\w+)/gc) {
            push @tokens, [word => $1];
        }
        else {
            die "unreckognized token at:\n", substr $data, pos($data), 10;
        }
    }
    return parse_document(\@tokens);
}
# Build the message reported when the parser meets an unexpected token.
#
# Arguments (both array refs, e.g. [ symbol => '(' ]):
#   1. the token that was actually found
#   2. a description of the token that was wanted
# Returns the message text; the caller decides whether to die with it.
sub token_error {
    my ($found, $wanted) = @_;
    my $found_text  = "@{$found}";     # elements joined by $" (a space by default)
    my $wanted_text = "@{$wanted}";
    return "Wrong token [$found_text] when expecting [$wanted_text]";
}
# Parse a complete token stream:  document ::= property*
#
# Arguments:
#   $tokens - array ref of tokens; it is consumed while parsing.
# Returns the list of parsed properties in input order.
sub parse_document {
    my ($queue) = @_;
    my @properties;
    while (@$queue) {
        my @parsed = parse_property($queue);
        push @properties, @parsed;
    }
    return @properties;
}
# Parse a single property:  property ::= word "(" value ("," value)* ")"
#
# Arguments:
#   $tokens - array ref of tokens; the property's tokens are removed from
#             its front.
# Returns an array ref [ name, value, value, ... ].
# Dies with a token_error() message when an unexpected token is met.
sub parse_property {
    my ($tokens) = @_;

    # property name
    die token_error($tokens->[0], ["word"])
        unless $tokens->[0][0] eq "word";
    my $name = (shift @$tokens)->[1];

    # opening parenthesis
    die token_error($tokens->[0], [symbol => '('])
        unless ($tokens->[0][0] eq "symbol" && $tokens->[0][1] eq '(');
    shift @$tokens;

    # one value, then further values for as long as a comma follows
    my @values;
    while (1) {
        push @values, parse_value($tokens);
        last unless ($tokens->[0][0] eq 'symbol' && $tokens->[0][1] eq ',');
        shift @$tokens;
    }

    # closing parenthesis
    die token_error($tokens->[0], [symbol => ')'])
        unless ($tokens->[0][0] eq "symbol" && $tokens->[0][1] eq ')');
    shift @$tokens;

    return [ $name, @values ];
}
# Parse a single value:  value ::= '"' ( word | number ("," number)* ) '"'
#
# Arguments:
#   $tokens - array ref of tokens; the value's tokens are removed from
#             its front.
# Returns the word as a plain string, or an array ref holding the numbers.
# Dies with a token_error() message when an unexpected token is met.
sub parse_value {
    my ($tokens) = @_;

    # opening quote
    die token_error($tokens->[0], [symbol => '"'])
        unless ($tokens->[0][0] eq "symbol" && $tokens->[0][1] eq '"');
    shift @$tokens;

    my $value;
    if ($tokens->[0][0] ne "word") {
        # a comma-separated list of numbers
        my @numbers;
        while (1) {
            die token_error($tokens->[0], ['number'])
                unless $tokens->[0][0] eq 'number';
            push @numbers, (shift @$tokens)->[1];
            last unless ($tokens->[0][0] eq 'symbol' && $tokens->[0][1] eq ',');
            shift @$tokens;
        }
        $value = \@numbers;
    }
    else {
        # a single word
        $value = (shift @$tokens)->[1];
    }

    # closing quote
    die token_error($tokens->[0], [symbol => '"'])
        unless ($tokens->[0][0] eq "symbol" && $tokens->[0][1] eq '"');
    shift @$tokens;

    return $value;
}
Now, we get the following data structure as output from Parser::parse
:
(
["Student_name", "Eric"],
["scoreA", [10, 20, 30, 40]],
["scoreB", [15, 30, 45, 50, 55]],
[
"final",
[12.23, 19, 37.88, 45.98, 60],
[7, 20.11, 24.56, 45.66, 57.88],
[5, 15.78, 22.88, 40.9, 57.99],
[10, 16.87, 26.99, 38.99, 40.66],
],
["Student_name", "Liy"],
["scoreA", [5, 10, 20, 60]],
["scoreB", [25, 30, 40, 55, 60]],
[
"final",
[2.23, 15, 37.88, 45.98, 70],
[10, 28.11, 34.56, 45.66, 57.88],
[8, 19.78, 32.88, 40.9, 57.66],
[10, 27.87, 39.99, 59.99, 78.66],
],
...,
)
As a next step, we want to transform it into nested hashes, so that we have the structure
{
Eric => {
scoreA => [...],
scoreB => [...],
final => [[...], ...],
},
Liy => {...},
...,
}
So we simply run it through this small sub:
# Group a flat list of properties by student.
#
# Arguments:
#   a list of array refs [ name, value, ... ]. Every group has to start with
#   a "Student_name" property; all properties up to the next "Student_name"
#   belong to that student.
# Returns a hash ref { student => { property => value } }. A property with
# exactly one value is stored as that value, a property with several values
# as an array ref of them.
# Dies when a group does not start with a Student_name property.
sub properties_to_hash {
    my @queue = @_;
    my %students;
    while (my $header = shift @queue) {
        die "Expected Student_name property"
            unless $header->[0] eq 'Student_name';
        my $student = $header->[1];

        # collect everything up to the next Student_name (or the end)
        until (!@queue or $queue[0][0] eq 'Student_name') {
            my ($key, @values) = @{ shift @queue };
            $students{$student}{$key} = @values > 1 ? \@values : $values[0];
        }
    }
    return \%students;
}
So we have the main code
# Parse the raw file contents into a list of properties, then group those
# properties into one nested hash per student.
my $data = properties_to_hash(Parser::parse( $file_contents ));
Now we can move on to Part 2 of the problem: calculating your scores. That is, once you make clear what you need done.
Edit: Bilinear interpolation
Let f be the function that returns the value at a certain coordinate. If we have a value at those coordinates, we can return that. Else, we perform bilinear interpolation with the next known values.
The formula for bilinear interpolation[1] is:
f(x, y) = 1/( (x_2 - x_1) · (y_2 - y_1) ) · (
f(x_1, y_1) · (x_2 - x) · (y_2 - y)
+ f(x_2, y_1) · (x - x_1) · (y_2 - y)
+ f(x_1, y_2) · (x_2 - x) · (y - y_1)
+ f(x_2, y_2) · (x - x_1) · (y - y_1)
)
Now, scoreA
denotes the positions of the data points in the final
table on the first axis, scoreB
the positions on the second axis. We have to do the following:
- assert that the requested values
x, y
are inside the bounds
- fetch the next smaller and next larger positions
- perform interpolation
.
# Value of the table at position ($x, $y).
#   $data - hash ref; the fields scoreA and scoreB hold the axis positions
#   $x    - requested position along the scoreA axis
#   $y    - requested position along the scoreB axis
# Dies when ($x, $y) lies outside the range covered by scoreA and scoreB.
sub f {
my ($data, $x, $y) = @_;
# do bounds check:
# The first and last entry of each axis are taken as its minimum and maximum.
# NOTE(review): this assumes scoreA and scoreB are sorted ascending -- confirm.
my ($x_min, $x_max, $y_min, $y_max) = (@{$data->{scoreA}}[0, -1], @{$data->{scoreB}}[0, -1]);
die "indices ($x, $y) out of range ([$x_min, $x_max], [$y_min, $y_max])"
unless $x_min <= $x && $x <= $x_max
&& $y_min <= $y && $y <= $y_max;
To fetch the boxing indices x_1, x_2, y_1, y_2
we need to iterate through all possible scores. We'll also remember the physical indices x_i1, x_i2, y_i1, y_i2
of the underlying arrays.
# Find, for each axis, the physical indices of the two known positions that
# enclose the requested position. Each entry of the outer list holds: the
# axis array, the requested position on that axis, and references to the two
# index variables that receive the result.
my ($x_i1, $x_i2, $y_i1, $y_i2);
for ([$data->{scoreA}, $x, \$x_i1, \$x_i2], [$data->{scoreB}, $y, \$y_i1, \$y_i2]) {
    my ($scores, $pos, $a_i1, $a_i2) = @$_;
    # Walk from the highest index downwards, so that the first hit is the
    # largest known position that is still <= the requested one.
    for my $i (reverse 0 .. $#$scores) {
        if ($scores->[$i] <= $pos) {
            # On the last index there is no next element; step back by one so
            # that the two indices stay distinct and inside the array.
            # NOTE: an axis with a single entry yields (-1, 0), which both
            # refer to the same element.
            ($$a_i1, $$a_i2) = $i == $#$scores ? ($i-1, $i) : ($i, $i+1);
            last;
        }
    }
}
my ($x_1, $x_2) = @{$data->{scoreA}}[$x_i1, $x_i2];
my ($y_1, $y_2) = @{$data->{scoreB}}[$y_i1, $y_i2];
Now, interpolation according to above formula can be performed, but each access at a known index can be changed to an access via physical index, so f(x_1, y_2)
would become
$final->[$x_i1][$y_i2]
Detailed Explanation of sub f
sub f { ... }
declares a sub with name f
, although that is probably a bad name. bilinear_interpolation
might be a better name.
my ($data, $x, $y) = @_
states that our sub takes three arguments:
$data
, a hash reference containing fields scoreA
, scoreB
and final
, which are array references.
$x
, the position along the scoreA
-axis where interpolation is required.
$y
, the position along the scoreB
-axis where interpolation is required.
Next, we want to assert that the positions for $x
and $y
are valid aka inside bounds. The first value in $data->{scoreA}
is the minimal value; the maximal value is in the last position (index -1
). To get both at once, we use an array slice. A slice accesses multiple values at once and returns a list, like @array[1, 2]
. Because we use complex data structures which use references, we have to dereference the array in $data->{scoreA}
. This makes the slice look like @{$data->{scoreA}}[0, -1]
.
Now that we have the $x_min
and $x_max
values, we throw an error unless the requested value $x
is inside the range defined by the min/max values. This is true when
$x_min <= $x && $x <= $x_max
Should either $x
or $y
be out of bounds, we throw an error which shows the actual bounds. So the code
die "indices ($x, $y) out of range ([$x_min, $x_max], [$y_min, $y_max])"
could, for example, throw an error like
indices (10, 500) out of range ([20, 30], [25, 57]) at script.pl line 42
Here we can see that the value for $x
is too small, and $y
is too large.
The next problem is to find neighbouring values. Assuming scoreA
holds [1, 2, 3, 4, 5]
, and $x
is 3.7
, we want to select the values 3
and 4
. But because we can pull some nifty tricks a bit later, we would rather remember the position of the neighbouring values, not the values themselves. So this would give 2
and 3
in the above example (remember that arrays are zero-based).
We can do this by looping over all indices of our array, starting at the last index and walking down. When we find a value that is ≤ $x
, we remember the index. E.g. 3
is the first value that is ≤ $x
, so we remember the index 2
. For the next higher value, we have to be a bit careful: Obviously, we can just take the next index, so 2 + 1 = 3
. But now assume that $x
is 5
. This passes the bounds check. The first value that is ≤ $x
would be value 5
, so we can remember position 4
. However, there is no entry at position 5
, so we could use the current index itself. Because this would lead to division by zero later on, we would be better off remembering positions 3
and 4
(values 4
and 5
).
Expressed as code, that is
my ($x_i1, $x_i2);
my @scores = @{ $data->{scoreA} }; # shortcut to the scoreA entry
for my $i (reverse 0 .. $#scores) { # iterate over all indices, highest first: `$#arr` is the last idx of @arr
    if ($scores[$i] <= $x) { # do this if the current value is ≤ $x; the first hit is the largest such value
        if ($i != $#scores) { # if this isn't the last index
            ($x_i1, $x_i2) = ($i, $i+1);
        } else { # so this is the last index
            ($x_i1, $x_i2) = ($i-1, $i); # step back so that both indices exist and differ
        }
        last; # break out of the loop
    }
}
In my original code I choose a more complex solution to avoid copy-pasting the same code for finding the neighbours of $y
.
Because we also need the values, we obtain them via a slice with the indices:
my ($x_1, $x_2) = @{$data->{scoreA}}[$x_i1, $x_i2];
Now we have all surrounding values $x_1, $x_2, $y_1, $y_2
which define the rectangle in which we want to perform bilinear interpolation. The mathematical formula is easy to translate to Perl: just choose the correct operators (*
, not ·
for multiplication), and the variables need dollar signs before them.
The formula I used is recursive: The definition of f refers to itself. This would imply an infinite loop, unless we do some thinking and break the recursion. f symbolizes the value at a certain position. In most cases, this means interpolating. However, if $x
and $y
are both equal to values in scoreA
and scoreB
respectively, we don't need bilinear interpolation, and can return the final
entry directly.
This can be done by checking if both $x
and $y
are members of their arrays, and doing an early return. Or we can use the fact that $x_1, ..., $y_2
all are members of the arrays. Instead of recursing with values we know don't need interpolating, we just do an array access. This is what we have saved the indices $x_i1, ..., $y_i2
for. So wherever the original formula says f(x_1, y_1)
or similar, we write the equivalent $data->{final}[$x_i1][$y_i1]
.