3

I 'm developing a specialized quad tree for doing some bioinformatics. The types for the qtree are:

type base = A | C | G | T | ROOT  ;;
type quad_tree = Nd of bases * quad_tree  * quad_tree  * quad_tree * quad_tree 
             | Empty
             | Leaf of int ref ;;

let init_quad_tree = Nd(ROOT, Empty,Empty,Empty,Empty);;
let new_node b = Nd(b,Empty,Empty,Empty,Empty);;

Now to do a match on these trees when either constructing or walking you end up with something like:

let rec add_node  base k qtree = 
  let rec aux k' accum qtree' = 
    if k' = k then
  match qtree' with
  | Nd(bse, Empty, cc, gg, tt) -> Nd(bse, (Leaf(ref accum)),cc,gg,tt)
  | Nd(bse, aa, Empty, gg, tt) -> Nd(bse, aa,(Leaf(ref accum)),gg,tt)
  | Nd(bse, aa, cc, Empty, tt) -> Nd(bse, aa,cc,(Leaf(ref accum)),tt)
  | Nd(bse, aa, cc, gg, Empty) -> Nd(bse, aa,cc,gg,(Leaf(ref accum)))
  | Leaf _ -> qtree'
  | Empty -> Leaf(ref accum)
  | _ -> qtree'
else
match qtree' with
| Leaf(iref)  -> iref := !iref + 1; qtree'                        
| Nd(bse, Empty,Empty,Empty,Empty) ->  (*all empty*)
    (
    match base with
    | A -> Nd(bse,(new_node base),Empty,Empty,Empty)
    | C -> Nd(bse,Empty,(new_node base),Empty,Empty)
    | G -> Nd(bse,Empty,Empty,(new_node base),Empty)
    | T -> Nd(bse,Empty,Empty,Empty,(new_node base))
    | _ -> qtree'
    )
...
| Nd(bse, Empty,(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t)) -> 
    (
    match base with
    | A -> Nd(bse,(new_node base),(aux (k'+1) (accum+1) c),(aux (k'+1) (accum+1) g),(aux (k'+1) (accum+1) t))
    | C -> Nd(bse,Empty,(aux (k'+1)(accum+1) c),(aux (k'+1)(accum+1) g),(aux (k'+1)(accum+1) t))
    | G -> Nd(bse,Empty,(aux (k'+1)(accum+1) c),(aux (k'+1)(accum+1) g),(aux (k'+1)(accum+1) t))
    | T -> Nd(bse,Empty,(aux (k'+1)(accum+1) c),(aux (k'+1)(accum+1) g),(aux (k'+1)(accum+1) t))
    | _ -> qtree'
    )
...
| Nd(bse, (Nd(_,_,_,_,_) as a),(Nd(_,_,_,_,_) as c),(Nd(_,_,_,_,_) as g),(Nd(_,_,_,_,_) as t)) ->
...

You get the idea, basically I need to cover all 16 combinations there (4 subtrees which can either be empty or Nd). That's a lot of typing and it's error prone.

However, it's a very regular structure that would lend itself to code generation. I was going to actually generate this code using a Ruby script, but I'm wondering if this would be possible with campl4 or the new -ppx-style "macros" (for lack of a better term)? And if so, how could I get started in either one of those directions?

aneccodeal
  • 8,531
  • 7
  • 45
  • 74
  • What are you trying to represent and achieve here? Why do you have two kinds of leaves (`Empty`, `Leaf`)? Why is there a `ROOT` mixed with the bases? Why is the type called `bases` and not `base`? – Martin Jambon Nov 15 '13 at 18:17
  • Good questions.Just prior to reading your comment I changed bases to base in my code (Just changed it above). ROOT is only for labeling the ROOT node at the top. Empty and Leaf are different things: Empty is an initial condition, Leaf's will contain a count of how many times the string has been seen (that's the intent, the code above does not reflect that, instead the accum just shows the current level of the tree). – aneccodeal Nov 15 '13 at 20:20
  • Basically, it's a tree of fixed-length strings (in this case bases, or k-mers). The number of times the k-mer has been seen already is to be stored in the leaves (I need to change the Leaf type to: Leaf of base * int ref ) – aneccodeal Nov 15 '13 at 20:23

1 Answers1

1

In a functional-idiomatic tree, every node is a root to its sub-tree, even if every other node in that sub-tree is empty. You'll want to collapse out the explicit ROOT definition, and merge in the counter property to the leaf node:

type base = A | C | G | T ;;
type quad_tree = 
  | Node of base * int ref * quad_tree * quad_tree * quad_tree * quad_tree
  | Empty

But then while you're at it you might as well just make that ref an explicit int so that you can use persistent data structures:

type quad_tree = 
  | Node of base * int * quad_tree ...
  | Empty

Walking/constructing shouldn't then have to be that complex based on my understanding of what you want to do (each node representing the strings that match its path exactly) -- just let yourself make a new version of the tree each time. A kinda ugly version:

let shorter str = String.sub 1 ((String.len str) - 1);;

let rec add_str base str = match base with
  | Empty -> 
     let ch = String.get str 0 in
     if ch = 'A' then add_str Node('A', 0, Empty, Empty, Empty, Empty) (shorter str)
     else if ch = 'C' then add_str Node('C', 0, Empty, Empty, Empty, Empty) (shorter str)
     ...
  | Node(b, v, aa, cc, gg, tt) ->
     let len = String.length str in
     if len = 0 then Node(b, v + 1, aa, cc, gg, tt) else
     let ch = String.get str 0 in
     if ch = 'A' then match aa with
       | Empty -> Node(b, v, (add_str Empty str), cc, gg, tt)
       | Node(b', v', ... , tt') -> add_str Node(b', v', ... , tt') (shorter str)
     else if ch = 'C' then match cc with
       | Empty -> Node(b, v, aa, (add_str Empty str), gg, tt)
       | Node(b', v', ... , tt') -> add_str Node(b', v', ... , tt') (shorter str)
     ...