(* MetaOCaml-source to the paper:
   Christoph A. Herrmann: 
   "Functional Meta-Programming in the Construction of Parallel Programs",
   presented at the workshop 
   "Constructive Methods of Parallel Programming (CMPP'04)", Stirling, UK
   Example: Karatsuba polynomial multiplication expressed
   in terms of a special Divide-and-Conquer skeleton

   Copyright: University of Passau, 2004
   may freely be used for non-commercial purposes

   USE IS AT YOUR OWN RISK: NO RESPONSIBILITY IS TAKEN FOR ANY DIRECT OR 
   INDIRECT DISADVANTAGE, DAMAGE ETC. TO ANYBODY OR ANYTHING INCURRED BY USE!

   please report bugs to herrmann(AT)fmi.uni-passau.de  
*)

(* ---------------------------------------------------------------------------*)
open Printf
open List
open Mpi   (* this is a special library (ocamlmpi-1.0.1), available via
                 http://pauillac.inria.fr/~xleroy/software.html,
              which is required IN ADDITION to a C/MPI installation;
              combined installation of ocamlmpi and MetaOCaml requires 
              a considerable amount of manual interaction               *) 

let size   = comm_size comm_world  (* number of processes *)
let myrank = comm_rank comm_world  (* current process identifier *) 

(* ---------------------------------------------------------------------------*)
(* auxiliary functions for Karatsuba's polynomial multiplication *)

let al2 xs = Array.length xs / 2

let lowpart    xs  =  let n = al2 xs in Array.sub xs 0 n
let highpart   xs  =  let n = al2 xs in Array.sub xs n n
let mixedparts xs  =  let n = al2 xs in Array.init n (fun i -> xs.(i)+xs.(n+i))

(* sequential Karatsuba's polynomial multiplication,
   "xs" and "ys" have to be arrays of length the same power of two *)

let rec karat_seq xs ys =
  let n = Array.length xs in
  if n<=16                      
  then                         (* solution for small problem sizes *)
    let zs = Array.make (2*n) 0 in
    for i=0 to n-1 do
      for j=0 to n-1 do
        zs.(i+j) <- zs.(i+j) + xs.(i) * ys.(j)
      done
    done;
    zs
  else                         (* solution for large problem sizes *)
    let low   = karat_seq (lowpart    xs) (lowpart    ys)
    and high  = karat_seq (highpart   xs) (highpart   ys)
    and mixed = karat_seq (mixedparts xs) (mixedparts ys) in
    let zs = Array.append low high in
    for i=0 to n-1 do
      zs.(i+n/2) <- zs.(i+n/2) + mixed.(i) - low.(i)-high.(i)
    done;
    zs
       
(* customizing functions for Karatsuba's polynomial multiplication 
   to be used in combination with the divide-and-conquer skeleton "dc" *)

let karat_basic xys = (* input is the concatenation of both coefficient
                         arrays of the operands to multiply *) 
    karat_seq (lowpart xys) (highpart xys)
 

let karat_divide subproblem xys = (* input is the concatenation of both 
                                     coefficient arrays of the operands 
                                     to multiply *) 
  let xs = lowpart  xys and ys = highpart xys in
  let sel = match subproblem with 0->lowpart | 1->highpart | 2->mixedparts 
  in Array.append (sel xs) (sel ys)


let karat_combine xss =           (* input is an array of length three, of 
                                     which each element is one subproblem 
                                     solution *)
  let low = xss.(0) and high = xss.(1) and mix = xss.(2) in
  let ys = Array.append low high 
  and n = Array.length low in
  for i=0 to n-1 do
     ys.(i+n/2) <- ys.(i+n/2) + mix.(i) - low.(i) - high.(i)
  done;
  ys

(* ---------------------------------------------------------------------------*)
(* auxiliary functions to convert integer values between the types "int"
   and its subranges "tag" and "rank" used by MPI;
   purpose: avoid run-time type errors due to type casts, we hope for
            a simpler solution in future releases of MetaOCaml  *)

let rec totag (i:int) = 
  if i<=0 
  then
    if i=0
    then 0
    else
      begin
	printf "negative tag";
	0
      end
  else 
    if i mod 2 = 0
    then 2*((totag (i/2)):tag)
    else 1+2*(totag (i/2)) 

let rec torank (i:int) = 
 if i<=0 
 then
   if i=0
   then 0
   else
     begin
       printf "negative rank";
       0
     end
 else 
   if i mod 2 = 0
   then 2*((torank (i/2)):rank)
   else 1+2*(torank (i/2)) 

let rec fromrank (i:rank) = 
 if i<=0 
 then
   if i=0
   then 0
   else
     begin
       printf "negative rank";
       0
     end
 else 
   if i mod 2 = 0
   then 2*((fromrank (i/2)):int)
   else 1+2*(fromrank (i/2)) 

(* ---------------------------------------------------------------------------*)
(* definition of the language for describing parallelism *)

type 'a exp = 
    Atom of 'a 
  | Seq of (int * (int -> 'a exp))
  | Par of (int * (int array -> int -> 'a exp))

let cseq xs = Seq (length xs, nth xs)

(* cost model function *)

let rec wduSeqPar isSeq n f =                    
  let w = ref 0 and d = ref 0 and u = ref 0 in 
  for i=0 to n-1 do
    let (w1,d1,u1) = wdu (f i) in
    w := !w + w1;
    d := if isSeq then !d + d1 else max !d d1;
    u := if isSeq then max !u u1 else !u + u1
  done;
  (!w,!d,!u)
and wdu case = match case with
  Atom _    -> (1,1,1)
| Seq (n,f) -> wduSeqPar true n f 
| Par (n,f) -> wduSeqPar false n (f (Array.make n 0))  

let id x = x

(* partial evaluator: takes a parallel specification and generates
   code; precisely: combines code from smaller parts applying
   instantiation in "Par" and emission of pieces passed to "Atom" 
   where appropriate                                                *)       

let rec peval case relpart submasters = match case with 
  Atom addCode -> if myrank=submasters.(relpart) 
                  then addCode   
                  else id
| Seq (0,f)          -> id 
| Seq (n,f) when n>0 -> let pev s = peval s relpart submasters
                        in fun x -> pev (f (n-1)) (pev (Seq (n-1,f)) x) 
| Par (0,f)          -> id
| Par (n,f) when n>0 -> let partadrs = Array.make n 0 
                        and base = ref submasters.(relpart) 
                        and mypart = ref 0 in
                        for i=0 to n-1 do
                           let (_,_,u) = wdu (f partadrs i) in
                           partadrs.(i) <- !base;
                           if !base<=myrank && !base+u>myrank then mypart := i;
                           base := !base + u
                        done;
                        peval (f partadrs !mypart) !mypart partadrs 

(* ---------------------------------------------------------------------------*)
(* implementation of the skeleton "dc"  *)

let send  x d t = Mpi.send  x (torank d) (totag t) Mpi.comm_world             
let receive d t = Mpi.receive (torank d) (totag t) Mpi.comm_world
                      
let rec dc degree basic divide combine depth  = 
   if depth=0 
   then Atom (fun x -> (.< let y = .~x in basic y >.))
   else 
   let master partners =       
    cseq [ Atom (fun x -> .< let y = .~x in
                             for i=1 to degree-1 do
                                send y (partners.(i)) depth
                             done;
                             divide 0 y 
                          >.);
           dc degree basic divide combine (depth-1);
           Atom (fun x -> .< let y = .~x in
                             let tmpdata = Array.make degree [||] in
                             tmpdata.(0) <- y;
                             for i=1 to degree-1 do
                                tmpdata.(i) <- receive (partners.(i)) depth
                             done;
                             combine tmpdata 
                          >.)
         ]
   and worker mypart partner =  
    cseq [ Atom (fun x -> .< begin 
                             .~x; 
                             let y = receive partner depth in 
                             divide mypart y
                             end 
                          >.);             
           dc degree basic divide combine (depth-1);
           Atom (fun x -> .< let y = .~x in 
                             send y partner depth; 
                             y 
                          >.) 
         ]     
   in Par (degree,       
           fun partners mypart -> if mypart=0
                                  then master partners
                                  else worker mypart partners.(0))

(* ---------------------------------------------------------------------------*)
(* a listing of all program designs to be used with the "dc" skeleton  *)

let design name psize = match name with 
| "karat" -> dc 3 karat_basic  
                  karat_divide  
                  karat_combine  
                  psize
                     
(* auxiliary power function *)
let rec pow k n = if n=0 then 1 else k * pow k (n-1)
 
(* main program, to be used with the following modes:

   (a) calculation of required number of processes,
       to be run on a single process,
       args: karat <recursion_depth> procs

   (b) complete cost information,
       to be run on a single process,
       args: karat <recursion_depth> info

   (c) test of sequential execution times,
       to be run on a single process,
       args: karat <recursion_depth> seqk

   (d) parallel execution, to be run on as many
       processes as (a) tells
       args: karat <recursion_depth> run <no_of_runs>
*)

let main = 
  let name = Sys.argv.(1) in 
  let psize = Scanf.sscanf (Sys.argv.(2)) "%d" (fun i -> i) in
  let mode = Sys.argv.(3)
  in 
  match mode with
    "procs" -> 
      let (w,d,u) = wdu (design name psize) in
      printf "%d\n" u
  | "info" -> 
      let (w,d,u) = wdu (design name psize) in
      printf "design %s: work=%d, depth=%d, processes=%d\n" name w d u
  | "seqk" ->
      let base=15 in
      let q = Array.make 1 [||] in
      for i=0 to 5 do
        let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in
        let startTime = wtime () in
        q.(0) <- karat_basic indata;
	printf "i=%d, time=%f\n" (i+base) (wtime()-.startTime) 
      done
  | "run" -> 
      let code _ = peval (design name psize) 0 [| 0 |] in
      let runs = Scanf.sscanf (Sys.argv.(4)) "%d" (fun i -> i) in
      let runtimes = Array.make runs 0.0 in
      let pet = ref 0.0 in
      let base = 15 in
      let q = Array.make 1 [||] in
	  Mpi.barrier Mpi.comm_world;
	for i=0 to runs-1 do
          let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in
          let startTime = wtime () in
	  let fct = code () .< indata >. in
          pet := wtime () -. startTime;
   	  let startTime = wtime () in
          q.(0) <- .!fct;
	  Mpi.barrier Mpi.comm_world;
	  runtimes.(i) <- wtime () -. startTime
	done;
	printf "pid %d finished \n" (fromrank myrank);
	flush stdout;
	Mpi.barrier Mpi.comm_world;
	if myrank=0
	then begin
              printf "\n\npevaltimes\n"
	     end;     
	for i=0 to size-1 do
	Mpi.barrier Mpi.comm_world;
	if myrank=i 
	then 
	  begin
	    printf "proc %d: %f\n" i !pet;
	    flush stdout
	  end;
	Mpi.barrier Mpi.comm_world;
      done;
      Mpi.barrier Mpi.comm_world;
      if myrank=0
      then
	begin
	  printf "runtimes\n";
	  for i=0 to runs-1 do
	    printf "run %d: %f\n" (i+base) runtimes.(i)
	  done;
	end