Lecture 11.2: Compiling to StackLang

Last class, we discussed the notion of a compiler, which translates programs in one language to another one. To better understand and reason about compilers, we defined StackLang, a target language inspired by stack-based virtual machines like the OCaml VM and the JVM.

Thus, we can now compare calculator programs to their equivalent StackLang targets; for instance, 2 < 4 would be represented as [Push 2; Push 4; AppInstr LT], and we then wrote an evaluator to understand the semantics of a stack program.

Today, we would like to define a compile function that turns CalcLang expressions into a StackLang instructions, and have it follow a specification that ensures that any CalcLang expression that produces a CalcLang value can be compiled to a StackLang program that produces a semantically equivalent StackLang value.

∀ (e : expr) (v : calc_value),
                 eval StrMap.empty e = v ⇒ execute (compile e) = (as_stack_value v)

In this class, we will compile our calculator programs to StackLang. Along the way, we will see some nuances in how we represented our binders, which we will cover before getting to ITE-expressions and correctness.

Here is the code we wrote last class.

Since we are manipulating programs in two different languages, we will use modules for namespacing.

type op = | Add | Mul | And | Or | LT | EQ
module StrMap = Map.Make(String)

module StackLang = struct 
  type value = int
  type instr =
    | Push of value
    | AppInstr of op
    | Set of string
    | Get of string
    | Jump of int
    | JumpIfNotOne of int

  type program = instr list

  let eval_op (o : op) (vs : value list) : value =
    match o, vs with
    | Add, [n1; n2] -> (n1 + n2)
    | Mul, [n1; n2] -> (n1 * n2)
    | LT,  [n1; n2] -> (if n1 < n2 then 1 else 0)
    | EQ,  [n1; n2] -> (if n1 = n2 then 1 else 0)
    | And, [n1; n2] -> (if n1 = 1 && n2 = 1 then 1 else 0)
    | Or,  [n1; n2] -> (if n1 = 1 || n2 = 1 then 1 else 0)
    | _ -> failwith "eval_op: incorrect arguments"

  type config = (instr list * value list * value StrMap.t)

  let step (cfg : config) : config =
    match cfg with
    | ([],  vs, env) -> cfg
    | (hd::tl, vs, env) ->
      (match hd with
      | Push v -> (tl, v::vs, env)
      | Set s ->
        (match vs with
        | [] -> failwith "saw a malformed program"
        | v :: vs -> (tl, vs, StrMap.add s v env))
      | Get s ->
        (match StrMap.find_opt s env with
        | None -> failwith ("unbound identifier " ^ s)
        | Some v -> (tl, v::vs, env))
      | AppInstr op ->
        (match vs with
        | [] | [_] -> failwith "stack underflow"
        | v1::v0::vs ->
          let v = eval_op op [v0; v1] in
          (tl, v::vs, env))
      | Jump n -> failwith "TODO"
      | JumpIfNotOne n -> failwith "TODO")


  let rec loop (c : config) : config =
    match step c with
    | ([], vs, e) -> ([], vs, e)
    | cfg -> loop cfg

  let execute (p : program) : value =
    match loop (p, [], StrMap.empty) with
    | (_,  [], _) -> failwith "program did not return a value."
    | (_, v :: _, _) -> v
end

Compiling Calc expressions to StackLang

Before we define how to compile our calculator language, let's briefly go over its interpreter.

Our interpreter for Calc

module CalcLang = struct

  type value =
    | Nat  of int
    | Bool of bool

  type expr =
    | Value of value
    | Var   of string
    | Let   of string * expr * expr
    | App   of op * expr list
    | ITE   of expr * expr * expr

  let eval_calc_op (op : op) (args : value list) : value =
    match (op, args) with
    | (Add, [Nat n1; Nat n2]) -> Nat (n1 + n2)
    | (Mul, [Nat n1; Nat n2]) -> Nat (n1 * n2)
    | (And, [Bool b1; Bool b2]) -> Bool (b1 && b2)
    | (Or , [Bool b1; Bool b2]) -> Bool (b1 || b2)
    | (LT , [Nat n1; Nat n2]) -> Bool (n1 < n2)
    | (EQ , [Nat e1; Nat e2]) -> Bool (e1 = e2)
    | (EQ , [Bool e1; Bool e2]) -> Bool (e1 = e2)
    | _ -> failwith "runtime error: arity mismatch or unexpected argument"

  let rec eval (env : value StrMap.t) (e : expr) : value =
    match e with
    | Value v -> v
    | Var x ->
      (match StrMap.find_opt x env with
      | Some v -> v
      | None   -> failwith ("Unbound Variable: " ^ x))
    | Let (x, e1, e2) ->
      let v1 = eval env e1 in
      let env = StrMap.add x v1 env in
      eval env e2
    | ITE (g, t, f) ->
      (match eval env g with
      | Bool true  -> eval env t
      | Bool false -> eval env f
      | _ -> failwith "guard is not a bool")
    | App (op, args) -> eval_calc_op op (List.map (eval env) args)
end

To perform compilation, we are going to write some pure function called lower that performs a source-to-source translation. Often in real-world compilers, passes perform translations between intermediate representations that are defined identically to the ASTs we are working with, looking very much like a source-to-source translation. Compiler passes are also intended to perform a single function and, ideally, are as simple as possible. Using a pure function is a good mental model for how we define these programs.

We will define lower with the following signature and base cases:

let rec lower (e : CalcLang.expr) : StackLang.program =
  match e with
  | Value (Nat v) -> [Push v]
  | Value (Bool true) -> [Push 1]
  | Value (Bool false) -> [Push 0]
  | ...

lower takes CalcLang's tree-shaped expr and flattens it to a list of instructions. Pattern-matching on e, our base cases are single instructions that push values onto the stack.

Variable binding is where we will need to make our first recursive call to lower. While Var x can be directly translated into a Get x, we need to keep in mind the order in which we translate let-bindings. Can you recall how we should lower this CalcLang expression from last class?

CalcLang: let x = 6 in x

StackLang: [Push 6; Set "x"; Get "x"]

Let's do two more exercises where we perform a little more computation in the bound expression and body:

CalcLang: let x = 1 * 2 in x

StackLang: [Push 1; Push 2; AppInstr Mul; Set "x"; Get "x"]

CalcLang: let x = 1 * 2 in 3 + x

StackLang: [Push 1; Push 2; AppInstr Mul; Set "x"; Push 3; Get "x"; AppInstr Add]

Let-expressions first evaluate the code in the bound expression, then bind it to an identifier with Set, and finally evaluate the code in the body. Because we are concatenating lists, we can use OCaml's list append operation @ to directly encode this intuition:

  ...
  | Var x -> [Get x]
  | Let (x, e1, e2) -> lower e1 @ [StackLang.Set x] @ lower e2
  ...

Building on the examples from before, to apply an operation we first lower each argument and post-append the AppInstr instruction to the end of the list. We can simply map over our list of arguments as we have in prior lectures.

The last branch, containing if-then-else expressions, will remain a todo-item for the time being. Before we address this branch, we'll first lower some programs and observe that there is a bug in our system.

  ...
  | App (op, args) -> List.flatten (List.map lower args) @ [AppInstr op]
  | ITE (g, t, f) -> failwith "TODO"

In its entirety, lower will be defined as:

lower

let rec lower (e : CalcLang.expr) : StackLang.program =
  match e with
  | Value (Nat v) -> [Push v]
  | Value (Bool true) -> [Push 1]
  | Value (Bool false) -> [Push 0]
  | Var x -> [Get x]
  | Let (x, e1, e2) -> lower e1 @ [StackLang.Set x] @ lower e2
  | App (op, args) ->
      List.flatten (List.map lower args) @ [AppInstr op]
  | ITE (g, t, f) -> failwith "TODO"

The "Uniquify" Compiler Pass

With our new lower function, we can already convert many of the CalcLang examples we've already seen. This function will let us write complex StackLang programs using the much more ergonomic CalcLang.

We'll start with some programs we've seen before:

let x = 1 in x, a simple expression that evalutes to 1
let x = 1 in 2 * x, an expression that performs a little more work and evaluates to 2
let x = 1 in (let x = 3 in x) * x, an nested let-expression that evalutates to 3

─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # lower(Let("x", Value (Nat 1), Var "x"));;
- : program = [Push 1; Set "x"; Get "x"]
─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # execute(lower(Let("x", Value (Nat 1), Var "x")));;
- : value = 1

─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # lower(Let("x", Value (Nat 1), App(Mul, [Value (Nat 2); Var "x"])));;
- : program = [Push 1; Set "x"; Push 2; Get "x"; AppInstr Mul]
─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # execute(lower(Let("x", Value (Nat 1), App(Mul, [Value (Nat 2); Var "x"]))));;
- : value = 2

─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # lower(Let("x", Value (Nat 1), App(Mul, [Let("x", Value (Nat 3), Var "x"); Var "x"])));;
- : program = [Push 1; Set "x"; Push 3; Set "x"; Get "x"; Get "x"; AppInstr Mul]
─( 00:00:00 )─< command 0 >─────────────────────────────────
utop # execute(lower(Let("x", Value (Nat 1), App(Mul, [Let("x", Value (Nat 3), Var "x"); Var "x"]))));;
- : value = 9

Ah, but a bug has appeared in that last example! Our lower function cannot correctly handle nested let-binders. The key assumption that we would like to make is that every program should not care about binders[1]. For instance, all of the following programs should return 3:

let x = 1 in (let x = 3 in x) * x,
let x = 1 in (let y = 3 in y) * x, and
let moo = 1 in (let neigh = 3 in neigh) * moo

By design, this was not an issue in CalcLang. That said, this problem occurs in our StackLang.

To handle this we will an additional pass to our compiler called uniquify[2]. The uniquify phase ensures that each variable corresponds to a fresh, unique string and that variable shadowing is not present in the program. For this compiler pass, we will write another function that transforms Calc programs to equivalent programs with new binders.

As we map over a program, uniquify will maintain a map of how many times a let expression has introduced each variable. For any bound variable we encounter, we will re-bind it to a fresh name composed of the original name and the number of counts we have seen.

Under this "uniquify" scheme, we will expect the following program translations:

let x = 6 in x
--->>> let x#0 = 6 in x#0

let x = 1 in let y = 2 in x + y
--->>> let x#0 = 1 in let y#0 = 2 in x#0 + y#0

let x = 1 in (let x = 2 in x) + x
--->>> let x#0 = 1 in (let x#1 = 2 in x1) + x#0

Let's take another look at that last program:

After evaluating the outer let x = 1 in ... we will add an entry to a map called bound with {"x" ↦ 1}.
We'll next evaluate the inner (let x = 2 in ... our bound map will increment x again, so that it will be {"x" ↦ 2}.
We'll encounter the variable x and rewrite it to "x" concatenated with the counter in bound, minus one, giving x#1. We use the tag # to distinguish surface-level variables from our unique variables.
Rewriting the let-binder, we will return (let x#1 = 2 in x#1) and discarding the updated bound map and use {"x" ↦ 1} once more.
Once more encountering the free variable x, we will peform the same operation and rewrite x to x#0.
The top-level let-binder will also be rewritten to x#0 and we will return the final expression.

With this in mind, the OCaml encoding is straightforward:

let make_name (var : string) (ctr : int) : string
  = var ^ "#" ^ string_of_int (ctr - 1)

let next_counter (var : string) (bound : int StrMap.t) : int =
  match StrMap.find_opt var bound with
  | None -> 1
  | Some ctr -> ctr + 1

let rec uniquify (bound : int StrMap.t) (e : CalcLang.expr) : CalcLang.expr =
  match e with
  | Value v -> e
  | Var x ->
     (match StrMap.find_opt x bound with
     | None -> Var x (* leave free variables untouched *)
     | Some ctr -> Var (make_name x ctr))
  | Let (x, e1, e2) ->
     let e1' = uniquify bound e1 in
     let ctr = next_counter x bound in
     let e2' = uniquify (StrMap.add x ctr bound) e2 in
     Let(make_name x ctr, e1', e2')
  | App (o, args) -> App(o, List.map (fun a -> uniquify bound a) args)
  | ITE(g, t, f) ->
     ITE(uniquify bound g,
         uniquify bound t,
         uniquify bound f)

We can now check that our programs are doing the right thing:

utop # open CalcLang;;

utop # uniquify StrMap.empty (Let("x", Value (Nat 1), Var "x"));;
- : expr = Let ("x#0", Value (Nat 1), Var "x#0")

utop # uniquify StrMap.empty (Let("x", Value (Nat 1), Let("y", Value (Nat 1), Var "y")));;
- : expr = Let ("x#0", Value (Nat 1), Let ("y#0", Value (Nat 1), Var "y#0"))

utop # uniquify StrMap.empty (Let("x", Value (Nat 1), App(Mul, [Let("x", Value (Nat 3), Var "x"); Var "x"])));;
- : expr = Let ("x#0", Value (Nat 1), App (Mul, [Let ("x#1", Value (Nat 3), Var "x#1"); Var "x#0"]))

With our second compiler pass in place, we can finally define our top-level compile function:

let compile (e : CalcLang.expr) : StackLang.program =
  let e' = uniquify StrMap.empty e in
  let prg = lower e' in
  prg

Our let-binding programs will now work as expected.

utop # open CalcLang;;
utop # open StackLang;;
utop # compile(Let("x", Value (Nat 1), Var "x"));;
- : program = [Push 1; Set "x#0"; Get "x#0"]

utop # execute(compile(Let("x", Value (Nat 1), Var "x")));;
- : int = 1

utop # compile(Let("x", Value (Nat 1), App(Mul, [Let("x", Value (Nat 3), Var "x"); Var "x"])));;
[Push 1; Set "x#0"; Push 3; Set "x#1"; Get "x#1"; Get "x#0"; AppInstr Mul]

utop # execute(compile(Let("x", Value (Nat 1), App(Mul, [Let("x", Value (Nat 3), Var "x"); Var "x"]))));;
- : int = 3

Keep in mind, however that our compile function is still partial and will crash when we lower if-then-else expressions into our StackLang. We'll need to fix this before we can reason about our compiler's correctness. It is time to switch gears back to StackLang and discuss how we should encode control flow.