(* MetaOCaml-source to the paper: Christoph A. Herrmann: "Functional Meta-Programming in the Construction of Parallel Programs", presented at the workshop "Constructive Methods of Parallel Programming (CMPP'04)", Stirling, UK Example: Karatsuba polynomial multiplication expressed in terms of a special Divide-and-Conquer skeleton Copyright: University of Passau, 2004 may freely be used for non-commercial purposes USE IS AT YOUR OWN RISK: NO RESPONSIBILITY IS TAKEN FOR ANY DIRECT OR INDIRECT DISADVANTAGE, DAMAGE ETC. TO ANYBODY OR ANYTHING INCURRED BY USE! please report bugs to herrmann(AT)fmi.uni-passau.de *) (* ---------------------------------------------------------------------------*) open Printf open List open Mpi (* this is a special library (ocamlmpi-1.0.1), available via http://pauillac.inria.fr/~xleroy/software.html, which is required IN ADDITION to a C/MPI installation; combined installation of ocamlmpi and MetaOCaml requires a considerable amount of manual interaction *) let size = comm_size comm_world (* number of processes *) let myrank = comm_rank comm_world (* current process identifier *) (* ---------------------------------------------------------------------------*) (* auxiliary functions for Karatsuba's polynomial multiplication *) let al2 xs = Array.length xs / 2 let lowpart xs = let n = al2 xs in Array.sub xs 0 n let highpart xs = let n = al2 xs in Array.sub xs n n let mixedparts xs = let n = al2 xs in Array.init n (fun i -> xs.(i)+xs.(n+i)) (* sequential Karatsuba's polynomial multiplication, "xs" and "ys" have to be arrays of length the same power of two *) let rec karat_seq xs ys = let n = Array.length xs in if n<=16 then (* solution for small problem sizes *) let zs = Array.make (2*n) 0 in for i=0 to n-1 do for j=0 to n-1 do zs.(i+j) <- zs.(i+j) + xs.(i) * ys.(j) done done; zs else (* solution for large problem sizes *) let low = karat_seq (lowpart xs) (lowpart ys) and high = karat_seq (highpart xs) (highpart ys) and mixed = karat_seq (mixedparts xs) (mixedparts ys) in let zs = Array.append low high in for i=0 to n-1 do zs.(i+n/2) <- zs.(i+n/2) + mixed.(i) - low.(i)-high.(i) done; zs (* customizing functions for Karatsuba's polynomial multiplication to be used in combination with the divide-and-conquer skeleton "dc" *) let karat_basic xys = (* input is the concatenation of both coefficient arrays of the operands to multiply *) karat_seq (lowpart xys) (highpart xys) let karat_divide subproblem xys = (* input is the concatenation of both coefficient arrays of the operands to multiply *) let xs = lowpart xys and ys = highpart xys in let sel = match subproblem with 0->lowpart | 1->highpart | 2->mixedparts in Array.append (sel xs) (sel ys) let karat_combine xss = (* input is an array of length three, of which each element is one subproblem solution *) let low = xss.(0) and high = xss.(1) and mix = xss.(2) in let ys = Array.append low high and n = Array.length low in for i=0 to n-1 do ys.(i+n/2) <- ys.(i+n/2) + mix.(i) - low.(i) - high.(i) done; ys (* ---------------------------------------------------------------------------*) (* auxiliary functions to convert integer values between the types "int" and its subranges "tag" and "rank" used by MPI; purpose: avoid run-time type errors due to type casts, we hope for a simpler solution in future releases of MetaOCaml *) let rec totag (i:int) = if i<=0 then if i=0 then 0 else begin printf "negative tag"; 0 end else if i mod 2 = 0 then 2*((totag (i/2)):tag) else 1+2*(totag (i/2)) let rec torank (i:int) = if i<=0 then if i=0 then 0 else begin printf "negative rank"; 0 end else if i mod 2 = 0 then 2*((torank (i/2)):rank) else 1+2*(torank (i/2)) let rec fromrank (i:rank) = if i<=0 then if i=0 then 0 else begin printf "negative rank"; 0 end else if i mod 2 = 0 then 2*((fromrank (i/2)):int) else 1+2*(fromrank (i/2)) (* ---------------------------------------------------------------------------*) (* definition of the language for describing parallelism *) type 'a exp = Atom of 'a | Seq of (int * (int -> 'a exp)) | Par of (int * (int array -> int -> 'a exp)) let cseq xs = Seq (length xs, nth xs) (* cost model function *) let rec wduSeqPar isSeq n f = let w = ref 0 and d = ref 0 and u = ref 0 in for i=0 to n-1 do let (w1,d1,u1) = wdu (f i) in w := !w + w1; d := if isSeq then !d + d1 else max !d d1; u := if isSeq then max !u u1 else !u + u1 done; (!w,!d,!u) and wdu case = match case with Atom _ -> (1,1,1) | Seq (n,f) -> wduSeqPar true n f | Par (n,f) -> wduSeqPar false n (f (Array.make n 0)) let id x = x (* partial evaluator: takes a parallel specification and generates code; precisely: combines code from smaller parts applying instantiation in "Par" and emission of pieces passed to "Atom" where appropriate *) let rec peval case relpart submasters = match case with Atom addCode -> if myrank=submasters.(relpart) then addCode else id | Seq (0,f) -> id | Seq (n,f) when n>0 -> let pev s = peval s relpart submasters in fun x -> pev (f (n-1)) (pev (Seq (n-1,f)) x) | Par (0,f) -> id | Par (n,f) when n>0 -> let partadrs = Array.make n 0 and base = ref submasters.(relpart) and mypart = ref 0 in for i=0 to n-1 do let (_,_,u) = wdu (f partadrs i) in partadrs.(i) <- !base; if !base<=myrank && !base+u>myrank then mypart := i; base := !base + u done; peval (f partadrs !mypart) !mypart partadrs (* ---------------------------------------------------------------------------*) (* implementation of the skeleton "dc" *) let send x d t = Mpi.send x (torank d) (totag t) Mpi.comm_world let receive d t = Mpi.receive (torank d) (totag t) Mpi.comm_world let rec dc degree basic divide combine depth = if depth=0 then Atom (fun x -> (.< let y = .~x in basic y >.)) else let master partners = cseq [ Atom (fun x -> .< let y = .~x in for i=1 to degree-1 do send y (partners.(i)) depth done; divide 0 y >.); dc degree basic divide combine (depth-1); Atom (fun x -> .< let y = .~x in let tmpdata = Array.make degree [||] in tmpdata.(0) <- y; for i=1 to degree-1 do tmpdata.(i) <- receive (partners.(i)) depth done; combine tmpdata >.) ] and worker mypart partner = cseq [ Atom (fun x -> .< begin .~x; let y = receive partner depth in divide mypart y end >.); dc degree basic divide combine (depth-1); Atom (fun x -> .< let y = .~x in send y partner depth; y >.) ] in Par (degree, fun partners mypart -> if mypart=0 then master partners else worker mypart partners.(0)) (* ---------------------------------------------------------------------------*) (* a listing of all program designs to be used with the "dc" skeleton *) let design name psize = match name with | "karat" -> dc 3 karat_basic karat_divide karat_combine psize (* auxiliary power function *) let rec pow k n = if n=0 then 1 else k * pow k (n-1) (* main program, to be used with the following modes: (a) calculation of required number of processes, to be run on a single process, args: karat procs (b) complete cost information, to be run on a single process, args: karat info (c) test of sequential execution times, to be run on a single process, args: karat seqk (d) parallel execution, to be run on as many processes as (a) tells args: karat run *) let main = let name = Sys.argv.(1) in let psize = Scanf.sscanf (Sys.argv.(2)) "%d" (fun i -> i) in let mode = Sys.argv.(3) in match mode with "procs" -> let (w,d,u) = wdu (design name psize) in printf "%d\n" u | "info" -> let (w,d,u) = wdu (design name psize) in printf "design %s: work=%d, depth=%d, processes=%d\n" name w d u | "seqk" -> let base=15 in let q = Array.make 1 [||] in for i=0 to 5 do let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in let startTime = wtime () in q.(0) <- karat_basic indata; printf "i=%d, time=%f\n" (i+base) (wtime()-.startTime) done | "run" -> let code _ = peval (design name psize) 0 [| 0 |] in let runs = Scanf.sscanf (Sys.argv.(4)) "%d" (fun i -> i) in let runtimes = Array.make runs 0.0 in let pet = ref 0.0 in let base = 15 in let q = Array.make 1 [||] in Mpi.barrier Mpi.comm_world; for i=0 to runs-1 do let indata = Array.init (pow 2 (i+base+1)) (fun i -> 1) in let startTime = wtime () in let fct = code () .< indata >. in pet := wtime () -. startTime; let startTime = wtime () in q.(0) <- .!fct; Mpi.barrier Mpi.comm_world; runtimes.(i) <- wtime () -. startTime done; printf "pid %d finished \n" (fromrank myrank); flush stdout; Mpi.barrier Mpi.comm_world; if myrank=0 then begin printf "\n\npevaltimes\n" end; for i=0 to size-1 do Mpi.barrier Mpi.comm_world; if myrank=i then begin printf "proc %d: %f\n" i !pet; flush stdout end; Mpi.barrier Mpi.comm_world; done; Mpi.barrier Mpi.comm_world; if myrank=0 then begin printf "runtimes\n"; for i=0 to runs-1 do printf "run %d: %f\n" (i+base) runtimes.(i) done; end