I'm trying to learn Nim by writing a certain bioinformatics tool that I already implemented in other languages.
I have the following version that compiles and runs apparently correctly:
from strutils import join
from sequtils import zip
type
Nucleotides = distinct string
Qualities = distinct string
#Nucleotides = string
#Qualities = string
Fastq = tuple
name: string
nucls: Nucleotides
quals: Qualities
# proc `==` (ns, ms: Nucleotides): bool =
# string(ns) == string(ms)
# https://nim-by-example.github.io/types/distinct/
proc `==` (ns, ms: Nucleotides): bool {.borrow.}
proc makeFastq(name, nucls, quals: string): Fastq =
result = (name: name, nucls: nucls.Nucleotides, quals: quals.Qualities)
proc bestQuals(quals1, quals2: string): string =
let N = min(quals1.len, quals2.len)
result = newStringOfCap(N)
for pair in zip(quals1, quals2):
result.add(chr(max(ord(pair.a), ord(pair.b))))
proc bestQuals(quals1, quals2: Qualities): Qualities =
result = bestQuals(string(quals1), string(quals2)).Qualities
proc fuseFastq(rec1, rec2: Fastq): Fastq =
result = (name: rec1.name, nucls: rec1.nucls, quals: bestQuals(rec1.quals, rec2.quals))
proc `$` (record: Fastq): string =
result = join([
record.name,
string(record.nucls),
"+",
string(record.quals)], "\n")
iterator parseFastqs(input: File): Fastq =
var
nameLine: string
nucLine: string
quaLine: string
while not input.endOfFile:
nameLine = input.readLine()
nucLine = input.readLine()
discard input.readLine()
quaLine = input.readLine()
yield makeFastq(nameLine, nucLine, quaLine)
proc deduplicate() =
var
record: Fastq
record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
for fastq in parseFastqs(stdin):
if record.nucls != fastq.nucls:
if record.name != "":
echo $record
record = fastq
else:
record = fuseFastq(record, fastq)
continue
if record.name != "":
echo $record
when isMainModule:
deduplicate()
Now, I would like to have deduplicate
take as argument the "thing" (currently an iterator) that generates Fastq
tuples. It would indeed seem much cleaner to have the when isMainModule
part deal with reading from stdin or possibly something else in the future (a file passed as command-line argument, for instance):
proc deduplicate(inputFqs: <some relevant type>) =
var
record: Fastq
record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
for fastq in inputFqs:
if record.nucls != fastq.nucls:
if record.name != "":
echo $record
record = fastq
else:
record = fuseFastq(record, fastq)
continue
if record.name != "":
echo $record
when isMainModule:
let inputFqs = parseFastqs(stdin)
deduplicate(inputFqs)
Is there a simple and efficient way to do this?
I naïvely tried the following:
proc deduplicate(inputFqs: iterator) =
var
record: Fastq
record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
for fastq in inputFqs:
if record.nucls != fastq.nucls:
if record.name != "":
echo $record
record = fastq
else:
record = fuseFastq(record, fastq)
continue
if record.name != "":
echo $record
when isMainModule:
let inputFqs = parseFastqs(stdin)
deduplicate(inputFqs)
And this results in the following compilation error: Error: attempting to call undeclared routine: 'parseFastqs'
.
I searched and understood from the manual that I should make my iterator a closure iterator. So I started simply by just using the {.closure.}
pragma:
iterator parseFastqs(input: File): Fastq {.closure.} =
But I kept having the same error.
So I tried mimicking more closely the example given in the manual:
iterator parseFastqs(input: File): Fastq {.closure.} =
var
nameLine: string
nucLine: string
quaLine: string
while not input.endOfFile:
nameLine = input.readLine()
nucLine = input.readLine()
discard input.readLine()
quaLine = input.readLine()
yield makeFastq(nameLine, nucLine, quaLine)
proc deduplicate(inputFqs: iterator(): Fastq {.closure.}) =
var
record: Fastq
record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
for fastq in inputFqs:
if record.nucls != fastq.nucls:
if record.name != "":
echo $record
record = fastq
else:
record = fuseFastq(record, fastq)
continue
if record.name != "":
echo $record
deduplicate(parseFastqs(stdin))
This resulted in a type error:
Error: type mismatch: got (iterator (): Fastq{.closure.})
but expected one of:
iterator items[T](a: set[T]): T
iterator items(a: cstring): char
iterator items[T](a: openArray[T]): T
iterator items[IX, T](a: array[IX, T]): T
iterator items(a: string): char
iterator items[T](a: seq[T]): T
iterator items(E: typedesc[enum]): E:type
iterator items[T](s: HSlice[T, T]): T
expression: items(inputFqs)
What am I doing wrong?
Edit: Solving the type issue
It seems that the type mismatch can be solved by changing for fastq in inputFqs:
to for fastq in inputFqs():
. The situation is back to Error: attempting to call undeclared routine: 'parseFastqs'
.
Some tinkering with the example from the manual reveals that the type of the iterator parameter does not need to have the parentheses. The following compiles and runs correctly:
iterator count0(): int {.closure.} =
yield 0
iterator count2(): int {.closure.} =
var x = 1
yield x
inc x
yield x
proc invoke(iter: iterator) =
for x in iter(): echo x
invoke(count0)
invoke(count2)
Now I would be interested in the meaning of the parentheses in the original example:
proc invoke(iter: iterator(): int {.closure.}) =