4

I'm trying to learn Nim by writing a certain bioinformatics tool that I already implemented in other languages.

I have the following version that compiles and runs apparently correctly:

from strutils import join
from sequtils import zip

type
  Nucleotides = distinct string
  Qualities = distinct string
  #Nucleotides = string
  #Qualities = string
  Fastq = tuple
    name: string
    nucls: Nucleotides
    quals: Qualities

# proc `==` (ns, ms: Nucleotides): bool =
#   string(ns) == string(ms)
# https://nim-by-example.github.io/types/distinct/
proc `==` (ns, ms: Nucleotides): bool {.borrow.}

proc makeFastq(name, nucls, quals: string): Fastq =
  result = (name: name, nucls: nucls.Nucleotides, quals: quals.Qualities)

proc bestQuals(quals1, quals2: string): string =
  let N = min(quals1.len, quals2.len)
  result = newStringOfCap(N)
  for pair in zip(quals1, quals2):
    result.add(chr(max(ord(pair.a), ord(pair.b))))

proc bestQuals(quals1, quals2: Qualities): Qualities =
  result = bestQuals(string(quals1), string(quals2)).Qualities

proc fuseFastq(rec1, rec2: Fastq): Fastq =
  result = (name: rec1.name, nucls: rec1.nucls, quals: bestQuals(rec1.quals, rec2.quals))

proc `$` (record: Fastq): string =
  result = join([
    record.name,
    string(record.nucls),
    "+",
    string(record.quals)], "\n")

iterator parseFastqs(input: File): Fastq =
  var
    nameLine: string
    nucLine: string
    quaLine: string
  while not input.endOfFile:
    nameLine = input.readLine()
    nucLine = input.readLine()
    discard input.readLine()
    quaLine = input.readLine()
    yield makeFastq(nameLine, nucLine, quaLine)

proc deduplicate() =
  var
    record: Fastq
  record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
  for fastq in parseFastqs(stdin):
    if record.nucls != fastq.nucls:
      if record.name != "":
        echo $record
      record = fastq
    else:
      record = fuseFastq(record, fastq)
      continue
  if record.name != "":
    echo $record

when isMainModule:
  deduplicate()

Now, I would like to have deduplicate take as argument the "thing" (currently an iterator) that generates Fastq tuples. It would indeed seem much cleaner to have the when isMainModule part deal with reading from stdin or possibly something else in the future (a file passed as command-line argument, for instance):

proc deduplicate(inputFqs: <some relevant type>) =
  var
    record: Fastq
  record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
  for fastq in inputFqs:
    if record.nucls != fastq.nucls:
      if record.name != "":
        echo $record
      record = fastq
    else:
      record = fuseFastq(record, fastq)
      continue
  if record.name != "":
    echo $record

when isMainModule:
  let inputFqs = parseFastqs(stdin)
  deduplicate(inputFqs)

Is there a simple and efficient way to do this?

I naïvely tried the following:

proc deduplicate(inputFqs: iterator) =
  var
    record: Fastq
  record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
  for fastq in inputFqs:
    if record.nucls != fastq.nucls:
      if record.name != "":
        echo $record
      record = fastq
    else:
      record = fuseFastq(record, fastq)
      continue
  if record.name != "":
    echo $record

when isMainModule:
  let inputFqs = parseFastqs(stdin)
  deduplicate(inputFqs)

And this results in the following compilation error: Error: attempting to call undeclared routine: 'parseFastqs'.

I searched and understood from the manual that I should make my iterator a closure iterator. So I started simply by just using the {.closure.} pragma:

iterator parseFastqs(input: File): Fastq {.closure.} =

But I kept having the same error.

So I tried mimicking more closely the example given in the manual:

iterator parseFastqs(input: File): Fastq {.closure.} =
  var
    nameLine: string
    nucLine: string
    quaLine: string
  while not input.endOfFile:
    nameLine = input.readLine()
    nucLine = input.readLine()
    discard input.readLine()
    quaLine = input.readLine()
    yield makeFastq(nameLine, nucLine, quaLine)

proc deduplicate(inputFqs: iterator(): Fastq {.closure.}) =
  var
    record: Fastq
  record = (name: "", nucls: "".Nucleotides, quals: "".Qualities)
  for fastq in inputFqs:
    if record.nucls != fastq.nucls:
      if record.name != "":
        echo $record
      record = fastq
    else:
      record = fuseFastq(record, fastq)
      continue
  if record.name != "":
    echo $record

deduplicate(parseFastqs(stdin))

This resulted in a type error:

Error: type mismatch: got (iterator (): Fastq{.closure.})
but expected one of: 
iterator items[T](a: set[T]): T
iterator items(a: cstring): char
iterator items[T](a: openArray[T]): T
iterator items[IX, T](a: array[IX, T]): T
iterator items(a: string): char
iterator items[T](a: seq[T]): T
iterator items(E: typedesc[enum]): E:type
iterator items[T](s: HSlice[T, T]): T

expression: items(inputFqs)

What am I doing wrong?

Edit: Solving the type issue

It seems that the type mismatch can be solved by changing for fastq in inputFqs: to for fastq in inputFqs():. The situation is back to Error: attempting to call undeclared routine: 'parseFastqs'.

Some tinkering with the example from the manual reveals that the type of the iterator parameter does not need to have the parentheses. The following compiles and runs correctly:

iterator count0(): int {.closure.} =
  yield 0

iterator count2(): int {.closure.} =
  var x = 1
  yield x
  inc x
  yield x

proc invoke(iter: iterator) =
  for x in iter(): echo x

invoke(count0)
invoke(count2)

Now I would be interested in the meaning of the parentheses in the original example:

proc invoke(iter: iterator(): int {.closure.}) =
bli
  • 7,549
  • 7
  • 48
  • 94

1 Answers1

2

You must loop over an iterator

for item in myIterator():
  echo repr item

or you could transform it into a sequence

import sequtils
echo toSeq(myIterator())
enthus1ast
  • 2,099
  • 15
  • 22