Stuff I forgot to commit.

parent b8ae0092c1
commit 9f47433501
14 changed files with 517 additions and 249 deletions

Images/BranchPredictor.png (new binary file)
Binary file not shown (new image, 43 KiB).
TODO.org (57 lines deleted)

@@ -1,57 +0,0 @@
* Tasks
** DONE File IO and test
** DONE Stop exploding the heap with logs :DDD
** DONE Fix DONE instruction for VM termination
*** DONE Add setting instructions
** DONE Add assembler
** DONE Chisel tester
** DONE Add LF
** DONE Redo colors in fansi. ANSI fucks up string formatting
** DONE Columnize log events
** DONE Chisel test log evaluator
** DONE Create giftWrapper script
** DONE Better sourceinfo stuff
Good enough

** DONE Test options
*** DONE How much NOP pad?
*** DONE Verbosity?
*** DONE Which tests?
** DONE ish Step counter, pretty print VM log, including final memory state
** TODO More programs
*** DONE Real programs
*** TODO Basic programs
Needs more
** DONE Merge in LF changes
** TODO Breakpoints
*** TODO VM breakpoints
**** TODO Record breakpoints in chisel tester
*** TODO Chisel breakpoints
**** TODO Freeze processor to record state
**** TODO Record breakpoints in chisel tester
*** TODO Draw breakpoints in the printer
** TODO Calculate steps needed
** TODO Unmangle derailed traces
With incorrect designs the trace printer ends up printing a lot of divergent,
unsynchronizable blocks.
** DONE Fix DONE instruction
*** DONE Parse error
*** DONE Use DONE address
** DONE Hazard generator
Good enough
** TODO Semantic logging
Currently logging is quite awkward, a combination of fansi and regular strings.
Ideally a markup format such as HTML should be used. There are already plenty of
good Scala libraries for this, such as lihaoyi's stuff (big shoutout!)

** TODO Interactive stepping
This one is a pretty big undertaking, but it could be very useful to run the circuit in an interactive
environment.
https://venus.cs61c.org/ is a good example of how useful this can be for a virtual machine.
This task requires a pretty good understanding of Chisel.
* Maybe
** DONE Move instruction recording to IMEM rather than IF?
Only care about what IF gets, won't have to deal with whatever logic is in IF.
** DONE Figure out why loading instructions backwards made shit werk
Not as funny as you'd think. The issue was overwriting the last written instruction with 0.
branchProfiler.scala (0 lines, new file)

src/main/scala/main.scala (7 lines, new file)

@@ -0,0 +1,7 @@
package FiveStage

object main {
  def main(args: Array[String]): Unit = {
    println("helo")
  }
}
src/test/resources/tests/programs/constants.s (15 lines, new file)

@@ -0,0 +1,15 @@
main:
    li x0, 0x0
    nop
    li x1, 0xABCDEF0
    nop
    li x1, 32
    li x1, 0x800
    li x1, 0x7FF
    nop
    nop
    done

#regset t0,10
#regset t1,23
#regset t2,43
#regset t3,-11
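Aside: the immediates in this test straddle the 12-bit boundary — 0x7FF still fits in a single addi, while 0x800 and 0xABCDEF0 force li to be expanded into a lui/addi pair. A minimal sketch of the usual RV32 split (splitLi is a hypothetical helper, not necessarily what this assembler actually does):

#+begin_src scala
// Illustrative sketch: split a 32-bit immediate into lui (upper 20 bits) + addi (low 12 bits).
// Because addi sign-extends its 12-bit immediate, the upper part must absorb the correction.
def splitLi(imm: Int): (Int, Int) = {
  val lo = (imm << 20) >> 20          // sign-extended low 12 bits
  val hi = (imm - lo) >>> 12          // upper 20 bits, corrected for the sign extension
  (hi, lo)                            // lui rd, hi ; addi rd, rd, lo
}

splitLi(0x7FF)      // (0x0, 0x7FF)     -> a plain addi is enough
splitLi(0x800)      // (0x1, -0x800)    -> lui 1; addi -2048
splitLi(0xABCDEF0)  // (0xABCE, -0x110) -> lui 0xABCE; addi -0x110
#+end_src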
src/test/resources/tests/programs/halfwords.s (18 lines, new file)

@@ -0,0 +1,18 @@
main:
    li x1, 0x11223344
    li x2, 0x55667788
    li x10, 0x100
    sw x1, 0(x10)
    sw x2, 4(x10)
    lw x3, 2(x10)
    lh x4, 3(x10)
    lb x5, 3(x10)
    lhu x6, 3(x10)
    lbu x7, 3(x10)
    sw x1, 8(x10)
    sw x1, 9(x10)
    sh x2, 9(x10)
    sb x2, 11(x10)
    lw x12, 8(x10)
    lw x13, 12(x10)
    done
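Aside: the point of the narrow loads above is the difference between sign extension (lh, lb) and zero extension (lhu, lbu) once a byte has been selected from a stored little-endian word. A small illustrative sketch of just that extension step (hypothetical helpers; it deliberately ignores how the core handles the unaligned offsets used above):

#+begin_src scala
// Extract byte `off` of a little-endian word and extend it to 32 bits.
def byteAt(word: Int, off: Int): Int = (word >>> (off * 8)) & 0xFF

def lb (word: Int, off: Int): Int = byteAt(word, off).toByte.toInt   // sign-extend bit 7
def lbu(word: Int, off: Int): Int = byteAt(word, off)                // zero-extend

lb (0x55667788, 3)   // 0x00000055  (bit 7 clear, so lb and lbu agree here)
lb (0x11223344, 1)   // 0x00000033
lbu(0x8899AABB, 3)   // 0x00000088
lb (0x8899AABB, 3)   // 0xFFFFFF88  (sign bit set, so lb and lbu differ)
#+end_src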
@@ -1,4 +1,5 @@
package FiveStage

import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._

@@ -20,7 +21,7 @@ object Manifest {

  val singleTest = "forward2.s"

  val nopPadded = true
  val nopPadded = false

  val singleTestOptions = TestOptions(
    printIfSuccessful = true,

@@ -53,18 +54,15 @@ object Manifest {

class ProfileBranching extends FlatSpec with Matchers {
  it should "profile some branches" in {
    TestRunner.profileBranching(
      Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 50000)
    BranchProfiler.profileBranching(
      Manifest.singleTestOptions.copy(testName = "branchProfiling.s", maxSteps = 150000)
    ) should be(true)
  }
}

class ProfileCache extends FlatSpec with Matchers {
  it should "profile a cache" in {
    say("Warning, this test takes forever to run! 2 minutes on my machine at least.")
    say("This happens due to the less than optimal way of storing the update log. Sorry I guess")
    say("You probably want to debug this with a smaller program")
    TestRunner.profileCache(
    CacheProfiler.profileCache(
      Manifest.singleTestOptions.copy(testName = "convolution.s", maxSteps = 150000)
    ) should be(true)
  }
@@ -176,8 +176,16 @@ object Data {
    val bitsRight = 32 - slots.log2
    val leftShifted = i << bitsLeft
    val rightShifted = leftShifted >>> bitsRight
    // say(i)
    // say(rightShifted)
    rightShifted
  }

  // To get the entire word call with from = 31, to = 0
  def bits(from: Int, to: Int): Int = {
    val bitsLeft = 31 - from
    val bitsRight = bitsLeft + to
    val leftShifted = i << bitsLeft
    val rightShifted = leftShifted >>> bitsRight

    rightShifted
  }
}
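Aside: to make the shift arithmetic above concrete, here is a small self-contained sketch. bits and getTag are written as plain functions rather than extension methods on Int, slots is assumed to be a power of two, and the expected getTag outputs are the ones quoted in theory2.org further down.

#+begin_src scala
object BitSliceDemo {
  // Extract bits [from..to] of a 32-bit word: shift out the high bits, then the low bits.
  def bits(i: Int, from: Int, to: Int): Int = {
    val bitsLeft  = 31 - from
    val bitsRight = bitsLeft + to
    (i << bitsLeft) >>> bitsRight
  }

  // A word-aligned address indexes a predictor with `slots` entries using bits [2 + log2(slots) - 1 .. 2].
  def getTag(addr: Int, slots: Int): Int = {
    val slotBits = 31 - Integer.numberOfLeadingZeros(slots)   // log2 for powers of two
    bits(addr, 2 + slotBits - 1, 2)
  }

  def main(args: Array[String]): Unit = {
    println(bits(0xABCD, 15, 8).toHexString) // ab  (byte 1 of 0xABCD)
    println(getTag(0x1C40, 8))               // 0
    println(getTag(0x1C5C, 8))               // 7
    println(getTag(0x1C60, 8))               // 0  (wraps around, conflicts with 0x1C40)
  }
}
#+end_src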
src/test/scala/RISCV/branchProfiler.scala (147 lines, new file)

@@ -0,0 +1,147 @@
package FiveStage

import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._
import fileUtils._

import chisel3.iotesters._
import scala.collection.mutable.LinkedHashMap

import fansi.Str

import Ops._
import Data._
import VM._

import PrintUtils._
import LogParser._

object BranchProfiler {

  def profileBranching(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait BranchEvent
      case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" }
      case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" }

      val events: List[BranchEvent] = trace.flatMap(_.event).collect{
        case PcUpdateBranch(from, to) => Taken(from.value, to.value)
        case PcUpdateNoBranch(at)     => NotTaken(at.value)
      }


      /**
        * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
        * of slots
        */
      def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

        // Uncomment to take a look at the event log
        // say(events.mkString("\n","\n","\n"))

        // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated
        // to reflect this.
        // As long as there are remaining events the helper calls itself recursively on the remainder
        def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
          events match {

            // Scala syntax for matching a list with a head element of some type and a tail
            // `case h :: t =>`
            // means we want to match a list with at least a head and a tail (tail can be Nil, so we
            // essentially want to match a list with at least one element)
            // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

            // `case Constructor(arg1, arg2) :: t =>`
            // means we want to match a list whose first element is of type Constructor, giving us access to its internal
            // values.

            // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
            // means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
            // called an if guard.
            case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
            case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
            case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
            case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
            case Nil => 0
          }
        }

        // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken
        def initState = events.map{
          case Taken(from, addr) => (from, false)
          case NotTaken(addr)    => (addr, false)
        }.toMap

        helper(events, initState)
      }


      def twoBitPredictor(events: List[BranchEvent], slots: Int): Int = {

        case class nBitPredictor(
          values          : List[Int],
          predictionRules : List[Boolean],
          transitionRules : Int => Boolean => Int
        ){
          val slots = values.size

          def predict(pc: Int): Boolean = predictionRules(values(pc.getTag(slots)))

          def update(pc: Int, taken: Boolean): nBitPredictor = {
            val current = values(pc.getTag(slots))
            val next = copy(values = values.updated(pc.getTag(slots), transitionRules(current)(taken)))
            next
          }

          override def toString = values.map(x => x.binary(2)).mkString("[","][","]")
        }

        val initPredictor = nBitPredictor(
          List.fill(slots)(0),
          List(
            false,
            false,
            true,
            true
          ),
          r => r match {
            case 0 => taken => if(taken) 1 else 0
            case 1 => taken => if(taken) 2 else 0
            case 2 => taken => if(taken) 3 else 1
            case 3 => taken => if(taken) 3 else 2
          }
        )

        events.foldLeft((0, initPredictor)){ case ((acc, bp), event) =>
          println()
          say(s"total misses: $acc")
          say(event)
          event match {
            case Taken(pc, _) => say(s"taken at tag: ${pc.getTag(slots)}")
            case NotTaken(pc) => say(s"not taken at tag: ${pc.getTag(slots)}")
          }
          say(bp)
          event match {
            case Taken(pc, _) if  bp.predict(pc) => { say("HIT!");  (acc,     bp.update(pc, true))  }
            case Taken(pc, _)                    => { say("MISS!"); (acc + 1, bp.update(pc, true))  }
            case NotTaken(pc) if !bp.predict(pc) => { say("HIT!");  (acc,     bp.update(pc, false)) }
            case NotTaken(pc)                    => { say("MISS!"); (acc + 1, bp.update(pc, false)) }
          }
        }._1
      }

      say(events.mkString("\n","\n","\n"))
      say(twoBitPredictor(events, 8))
    }


    true
  }
}
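Aside: as a sanity check on the prediction and transition rules encoded in twoBitPredictor above, here is a tiny standalone hand-run of the same four-state counter on a made-up outcome sequence. It is illustrative only; slot indexing is ignored by pretending every branch shares one slot.

#+begin_src scala
object TwoBitCounterDemo {
  val predictTaken = List(false, false, true, true)          // same prediction rule as above

  def step(state: Int, taken: Boolean): Int = state match {  // same transition rule as above
    case 0 => if (taken) 1 else 0
    case 1 => if (taken) 2 else 0
    case 2 => if (taken) 3 else 1
    case 3 => if (taken) 3 else 2
  }

  def main(args: Array[String]): Unit = {
    val outcomes = List(true, true, true, false)             // T, T, T, NT
    val (misses, _) = outcomes.foldLeft((0, 0)) { case ((miss, st), taken) =>
      val hit = predictTaken(st) == taken
      (if (hit) miss else miss + 1, step(st, taken))
    }
    println(misses)   // 3: the counter predicts NT, NT, T, T against T, T, T, NT
  }
}
#+end_src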
src/test/scala/RISCV/cacheProfiler.scala (204 lines, new file)

@@ -0,0 +1,204 @@
package FiveStage

import org.scalatest.{Matchers, FlatSpec}
import cats._
import cats.implicits._
import fileUtils._

import chisel3.iotesters._
import scala.collection.mutable.LinkedHashMap

import fansi.Str

import Ops._
import Data._
import VM._

import PrintUtils._
import LogParser._

object CacheProfiler {

  def profileCache(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      import TestUtils._

      sealed trait MemoryEvent
      case class Write(addr: Int) extends MemoryEvent
      case class Read(addr: Int) extends MemoryEvent

      val events = trace.flatMap(_.event).collect{
        case MemWrite(addr, _) => Write(addr.value)
        case MemRead(addr, _)  => Read(addr.value)
      }


      class CacheProfiler(setCount: Int, setSize: Int, blockSize: Int){

        // If we set counter to 0 we risk evicting the first allocated block.
        var counter    = 1
        var misses     = 0
        var mostRecent = 0
        var wasMiss    = false

        implicit class AddrOps(i: Int){
          val blockOffsetBits = blockSize.log2
          val lineBits        = setSize.log2

          def lineIdx: Int = {
            i.bits(2 + blockOffsetBits + lineBits - 1, 2 + blockOffsetBits)
          }
        }


        case class CacheLine(tag: Int, lastUsed: Int){
          def matches(addr: Int): Boolean = List.fill(blockSize)(tag)
            .zipWithIndex
            .map{ case(btag, idx) => btag + idx*4 }
            .map(_ == addr)
            .foldLeft(false)(_ || _)

          def renderContent(addr: Int): String = (addr == mostRecent, wasMiss) match {
            case (true, true)  => Console.RED   + addr.hs + Console.RESET
            case (true, false) => Console.GREEN + addr.hs + Console.RESET
            case _             => addr.hs
          }

          def render: String = {
            val blockContents = List.fill(blockSize)(tag)
              .zipWithIndex
              .map{ case(btag, idx) => renderContent(btag + idx*4) }
              .mkString("Contents: || ", " | ", " |")

            s"Base: ${tag.hs} LRU: $lastUsed\t" + blockContents
          }
        }
        object CacheLine {
          def truncateTag(addr: Int) = addr - (addr % (blockSize*4))
        }


        case class CacheSet(blocks: Array[CacheLine]){
          def lineIdx(addr: Int): Int = addr.lineIdx
          def contains(addr: Int): Boolean = blocks.map(_.matches(addr)).foldLeft(false)(_ || _)

          def updateLRU(addr: Int): Unit = {
            val idx = lineIdx(addr)
            val next = blocks(idx).copy(lastUsed = counter)
            blocks(idx) = next
          }

          def render: String = {
            blocks.map(_.render).mkString("\n", "\n", "\n")
          }
        }


        case class Cache(sets: Array[CacheSet]){

          /** returns the index of set if hit */
          def checkHit(addr: Int): Option[Int] = sets
            .zipWithIndex
            .map{ case(set, idx) => Option.when(set.contains(addr))(idx) }
            .flatten.headOption


          /** Updates the LRU counter */
          def updateLRU(addr: Int, setIdx: Int): Unit = sets(setIdx).updateLRU(addr)


          /** Gets set with least recently used */
          def getLRU(addr: Int): Int = sets
            .map(set => set.blocks(set.lineIdx(addr)).lastUsed)
            .zipWithIndex
            .sortBy(_._1)
            .map(_._2)
            .head


          /** Entry point */
          def handleAccess(addr: Int): Unit = {
            mostRecent = addr
            counter += 1

            checkHit(addr) match {

              case Some(setIdx) => {
                wasMiss = false
                updateLRU(addr, setIdx)
                // say(s"${addr.hs} HIT")
              }

              case None => {
                val set = sets(getLRU(addr))
                val nextTag = CacheLine.truncateTag(addr)
                set.blocks(set.lineIdx(addr)) = set.blocks(set.lineIdx(addr)).copy(
                  tag = nextTag,
                  lastUsed = counter
                )
                misses += 1

                wasMiss = true
                // say(s"${addr.hs} MISS")
                // say(s"BLOCK ${addr.lineIdx} IN SET ${getLRU(addr)} EVICTED. BYE BYE")
              }
            }
          }

          /** Pretty pictures! */
          def render: String = {
            sets.map(_.render).mkString("\n", "\n", "\n")
          }
        }

        object Cache {
          def init: Cache = Cache(Array.fill(setCount)(
            CacheSet(Array.fill(setSize)(CacheLine(57005, 0))))
          )
        }
      }

      for{
        sets      <- List(2, 4, 8)
        blockSize <- List(4, 8)
        lines     <- List(2, 4, 8)
      } yield {

        val myTest  = new CacheProfiler(sets, lines, blockSize)
        val myCache = myTest.Cache.init
        events.foreach{
          case Write(addr) => myCache.handleAccess(addr)
          case Read(addr)  => myCache.handleAccess(addr)
        }

        say(s"sets: $sets, lines: $lines, blockSize: $blockSize yields ${myTest.misses} misses")
      }

      // val myTest = new CacheProfiler(2, 4, 4)
      // val myCache = myTest.Cache.init
      // events.foreach{
      //   case Write(addr) => {
      //     say(addr.hs)
      //     myCache.handleAccess(addr)
      //     say(myCache.render)
      //   }
      //   case Read(addr) => {
      //     say(addr.hs)
      //     myCache.handleAccess(addr)
      //     say(myCache.render)
      //   }
      // }

      // say(myTest.misses)

    }

    true
  }
}
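Aside: one subtle point in the profiler above is that recency is tracked with a single global counter; getLRU simply picks the set whose candidate line carries the smallest lastUsed stamp. A stripped-down sketch of that policy, with hypothetical names (Line, pickVictim) and separate from the class above:

#+begin_src scala
// Global-counter LRU as used above: lower stamp = touched longer ago = evicted first.
case class Line(tag: Int, lastUsed: Int)

def pickVictim(candidates: Vector[Line]): Int =
  candidates.zipWithIndex.minBy(_._1.lastUsed)._2

pickVictim(Vector(Line(0x100, lastUsed = 7), Line(0x140, lastUsed = 3)))  // 1 (stamp 3 is older)
#+end_src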
@@ -101,98 +101,4 @@ object TestRunner {
      successful
    }.toOption.getOrElse(false)
  }


  def profileBranching(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait BranchEvent
      case class Taken(from: Int, to: Int) extends BranchEvent { override def toString = s"Taken ${from.hs}\t${to.hs}" }
      case class NotTaken(addr: Int) extends BranchEvent { override def toString = s"Not Taken ${addr.hs}" }

      val events: List[BranchEvent] = trace.flatMap(_.event).collect{
        case PcUpdateBranch(from, to) => Taken(from.value, to.value)
        case PcUpdateNoBranch(at)     => NotTaken(at.value)
      }


      /**
        * This is a sample profiler for a rather unrealistic branch predictor which has an unlimited amount
        * of slots
        */
      def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

        // Uncomment to take a look at the event log
        // say(events.mkString("\n","\n","\n"))

        // Helper inspects the next element of the event list. If the event is a mispredict the prediction table is updated
        // to reflect this.
        // As long as there are remaining events the helper calls itself recursively on the remainder
        def helper(events: List[BranchEvent], predictionTable: Map[Int, Boolean]): Int = {
          events match {

            // Scala syntax for matching a list with a head element of some type and a tail
            // `case h :: t =>`
            // means we want to match a list with at least a head and a tail (tail can be Nil, so we
            // essentially want to match a list with at least one element)
            // h is the first element of the list, t is the remainder (which can be Nil, aka empty)

            // `case Constructor(arg1, arg2) :: t =>`
            // means we want to match a list whose first element is of type Constructor, giving us access to its internal
            // values.

            // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
            // means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
            // called an if guard.
            case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
            case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
            case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
            case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
            case Nil => 0
          }
        }

        // Initially every possible branch is set to false since the initial state of the predictor is to assume branch not taken
        def initState = events.map{
          case Taken(from, addr) => (from, false)
          case NotTaken(addr)    => (addr, false)
        }.toMap

        helper(events, initState)
      }

      say(OneBitInfiniteSlots(events))
    }


    true
  }


  def profileCache(testOptions: TestOptions): Boolean = {

    val testResults = for {
      lines   <- fileUtils.readTest(testOptions)
      program <- FiveStage.Parser.parseProgram(lines, testOptions)
      (binary, (trace, finalVM)) <- program.validate(testOptions.maxSteps).map(x => (x._1, x._2.run))
    } yield {

      sealed trait MemoryEvent
      case class Write(addr: Int) extends MemoryEvent
      case class Read(addr: Int) extends MemoryEvent

      val events: List[MemoryEvent] = trace.flatMap(_.event).collect{
        case MemWrite(x,_) => Write(x.value)
        case MemRead(x,_)  => Read(x.value)
      }

      // Your cache here

    }
    true
  }
}
@@ -6,6 +6,10 @@ import PrintUtils._

object TestUtils {

  implicit class OptionBackport(t: Option.type){
    def when[T](b: Boolean)(t: => T) = if(b) Some(t) else None
  }

  /**
    * Generate and serialize BTrees for the test runner
    */
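Aside: this backport exists because Option.when only appeared in the standard library in Scala 2.13, while call sites such as checkHit in cacheProfiler.scala use the 2.13-style API; presumably the project is on an older Scala version. With the implicit class in scope it behaves like this:

#+begin_src scala
import TestUtils._   // brings the backported Option.when into scope

val hit  = Option.when(2 + 2 == 4)(42)   // Some(42)
val miss = Option.when(false)(42)        // None
#+end_src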
@@ -142,7 +142,7 @@ private class ChiselTestRunner (

  // After finishing, let the circuit run until all updates can be committed.
  private def flush: List[CircuitTrace] =
    (0 to 3).map(_ => stepOne).reverse.toList
    (0 to 4).map(_ => stepOne).reverse.toList

  /**
    * Run the entire shebang
theory2.org

@@ -1,36 +1,55 @@
* Question 1 - Hazards
For the following programs describe each hazard with type (data or control), line number and a
small (max one sentence) description
* Question 0 - Testing hazards
This question is mandatory, but awards no points (not directly at least).

** program 1
#+begin_src asm
    addi t0, zero, 10
    addi t1, zero, 20
L2:
    sub t1, t1, t0
    beq t1, zero, .L2
    jr ra
#+end_src
The tests found in the testing framework are useful for testing a fully working processor, however they
leave much to be desired when you actually want to design one from whole cloth.

To rectify this, you should write some tests of your own that serve as minimal cases for the various
hazards you will encounter. You do not need to deliver anything here, but I expect you to have
these tests if you ask me for help debugging your design during lab hours.
(You can of course come to lab hours if you're having trouble writing these tests.)


** program 2
#+begin_src asm
    addi t0, zero, 10
    lw t0, 10(t0)
    beq t0, zero, .L3
    jr ra
#+end_src
** Forwarding
The tests in forward1.s and forward2.s are automatically generated, long, and non-specific,
and thus not very well suited for debugging.

You should write one (or more) test(s) that systematically expose your processor to dependency
hazards, including instructions that:
+ Need forwarding from MEM and WB (i.e. dependencies with NOPs between them).
+ Expose results that should *not* be forwarded because regWrite is false.
+ Write and read to/from the zero register.


** program 3
#+begin_src asm
    lw t0, 0(t0)
    lw t1, 4(t0)
    sw t0, 8(t1)
    lw t1, 12(t0)
    beq t0, t1, .L3
    jr ra
#+end_src
** Load freezes
Load freezes are tricky since they interact with the forwarding unit, often causing
bugs that appear with low frequency in the supplied test programs.

You should write tests (I suggest one test per case) that systematically expose your processor to
dependency hazards where one or more of the dependencies are memory accesses, including instructions that:
+ Need forwarding from MEM and WB where MEM, WB or both are load instructions.
+ Expose false dependencies from MEM and WB where one or more are loads.
  For instance, consider ~addi x1, x1, 10~ in machine code with the rs2 field highlighted
  (see the decoding sketch just after this list):
  0x00a08093 = 0b0000000 | 01010 | 00001000000010010011
  In this case there is a false dependency on x10, since the rs2 field is just an artefact of the
  immediate value, which could cause an unnecessary freeze.
+ Write and read to/from the zero register, which could trigger an unnecessary freeze.
+ Cause multiple freezes in a row.
+ Cause multiple freezes in a row followed by an instruction with multiple dependencies.
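Aside: to make the false-dependency bullet above concrete, here is a small sketch (plain Scala, hypothetical helper names) of pulling the register fields out of a 32-bit RISC-V instruction word. A naive hazard unit that always compares these fields will see rs2 = 10 in 0x00a08093 even though addi has no rs2 operand.

#+begin_src scala
// RV32 register fields sit at fixed bit positions regardless of instruction format.
def rd (instr: Int): Int = (instr >>>  7) & 0x1F
def rs1(instr: Int): Int = (instr >>> 15) & 0x1F
def rs2(instr: Int): Int = (instr >>> 20) & 0x1F   // for I-type this overlaps imm[4:0]

val addi = 0x00a08093              // addi x1, x1, 10
(rd(addi), rs1(addi), rs2(addi))   // (1, 1, 10) -- the "10" is just the low imm bits, not a real source
#+end_src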
** Control hazards
There are a lot of possible interactions when jumping and branching; you need to write tests
that ensure instructions are properly bubbled if they shouldn't have been fetched.
You should also test the interactions between forwarding and freezing here, i.e. what happens
when the address calculation relies on forwarded values? What happens if the forwarded value
comes from a load instruction necessitating a freeze?


* TODO Question 1 - Hazards
Write programs here that are less of a crapshoot. Clarify dependency vs hazard etc. and
*enforce* a format that is easy to grade.


* Question 2 - Handling hazards
@@ -39,7 +58,7 @@

** Data hazards 1
At some cycle the following instructions can be found in a 5 stage design:

#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------

@@ -52,13 +71,17 @@
branch = false       || branch = true           || branch = false
jump = false         || jump = false            || jump = false
#+end_src

For the operation currently in EX, where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2?

Answer should be of the form:

rs1: Narnia
rs2: Wikipedia

** Data hazards 2

At some cycle the following instructions can be found in a 5 stage design:

#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------

@@ -73,11 +96,15 @@
#+end_src

For the operation currently in EX, where (ID, MEM or WB) should the forwarder get data from for rs1 and rs2?
Answer should be of the form:

rs1: Random noise
rs2: WB (MEM if it's a Tuesday)

** Data hazards 3

At some cycle the following instructions can be found in a 5 stage design:

#+begin_src text
EX:                  || MEM:                    || WB:
---------------------||-------------------------||--------------------------

@@ -89,24 +116,26 @@
memWrite = true      || memWrite = false        || memWrite = false
branch = false       || branch = false          || branch = false
jump = false         || jump = false            || jump = false

Should the forwarding unit issue a load hazard signal?
(Hint: what are the semantics of the instruction currently in EX stage?)
#+end_src

Should the forwarding unit issue a load hazard signal? *This is a yes/no question*
(Hint: what are the semantics of the instruction currently in EX stage?)

* Question 3 - Branch prediction
Consider a 2 bit branch predictor with only 4 slots for a 32 bit architecture (without BTB), where the decision to
take a branch or not is made in accordance with the following table:
#+begin_src text
 state || predict taken || next state if taken || next state if not taken ||
=======||===============||=====================||=========================||
 00    || NO            || 01                  || 00                      ||
 01    || NO            || 10                  || 00                      ||
 10    || YES           || 11                  || 01                      ||
 01    || NO            || 11                  || 00                      ||
 10    || YES           || 11                  || 00                      ||
 11    || YES           || 11                  || 10                      ||
#+end_src

(This is known as a saturating 2bit counter, it is *not* the same scheme as in the lecture slides)

Which corresponds to this figure:
#+CAPTION: FSM of a 2 bit branch predictor. Note that it is not a 2bit saturating counter.
[[./Images/BranchPredictor.png]]

At some point during execution the program counter is ~0xc~ and the branch predictor table looks like this:
#+begin_src text

@@ -114,21 +143,34 @@
======||========
 00   || 01
 01   || 00
 10   || 11
 11   || 01
 10   || 01
 11   || 10
#+end_src

For the following program:
#+begin_src asm
0xc  addi x1, x3, 10
0x10 add x2, x1, x1
0x14 beq x1, x2, .L1
.L1:
0x0C addi x1, x1, 1
0x10 add x2, x2, x1
0x14 bge x2, x3, .L1
0x18 j .L2
.L3:
0x1C addi x2, x2, 0x10
0x20 slli x2, 0x4
0x24 jr ra
#+end_src

Will the predictor predict taken or not taken for the beq instruction?

* Question 4 - Benchmarking
At cycle 0 the state of the machine is as follows:
#+begin_src text
PC = 0x0C
x1 = 0x0
x2 = 0x0
x3 = 0x7
#+end_src

At which cycle will the PC be 0x24, given a 2 cycle delay for mispredicts?

* Question 4 - Benchmarking a branch profiler
In order to gauge the performance increase from adding branch predictors it is necessary to do some testing.
Rather than writing a test from scratch it is better to use the tester already in use in the test harness.
When running a program the VM outputs a log of all events, including which branches have been taken and which

@@ -148,7 +190,7 @@

To help you get started, I have provided you with much of the necessary code.
In order to get an idea of how you should profile branch misses, consider the following profiler, which calculates
misses for a processor with a 1 bit branch predictor with infinite memory:
misses for a processor with a 1 bit branch predictor with infinite slots:

#+BEGIN_SRC scala
def OneBitInfiniteSlots(events: List[BranchEvent]): Int = {

@@ -172,11 +214,11 @@
      // `case Constructor(arg1, arg2) :: t if(p(arg1, arg2)) =>`
      // means we want to match a list whose first element is of type Constructor while satisfying some predicate p,
      // called an if guard.
      case Taken(from, to) :: t if( predictionTable(from)) => helper(t, predictionTable)
      case Taken(from, to) :: t if(!predictionTable(from)) => 1 + helper(t, predictionTable.updated(from, true))
      case NotTaken(addr)  :: t if( predictionTable(addr)) => 1 + helper(t, predictionTable.updated(addr, false))
      case NotTaken(addr)  :: t if(!predictionTable(addr)) => helper(t, predictionTable)
      case _ => 0
    }
  }

@@ -207,7 +249,7 @@

With a 2 bit 8 slot scheme, how many mispredicts will happen?
Answer with a number.

Hint: Use the getTag method defined on Int (in DataTypes.scala) to get the tag for an address.
#+BEGIN_SRC scala
val slots = 8

@@ -221,7 +263,7 @@
say(0x1C5C.getTag(slots)) // prints 7
say(0x1C60.getTag(slots)) // prints 0 (thus conflicts with 0x1C40)
#+END_SRC


* Question 5 - Cache profiling
Unlike our design, which has a very limited memory pool, real designs have access to vast amounts of memory, offset

@@ -231,11 +273,6 @@
In order to investigate how caches can alter performance it is therefore necessary to make some rather
unrealistic assumptions to see how different cache schemes impact performance.

We will therefore assume the following:
+ Reads from main memory take 5 cycles
+ The cache has a total storage of 8 words (256 bits)
+ Cache reads work as they do now (i.e. no additional latency)

For this exercise you will write a program that parses a log of memory events, similar to the previous task:
#+BEGIN_SRC scala
sealed trait MemoryEvent

@@ -246,32 +283,13 @@
def profile(events: List[MemoryEvent]): Int = ???
#+END_SRC
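Aside: before tackling the task below, it can help to see the shape of such a profiler on a deliberately trivial configuration. This is an illustrative warm-up sketch, not the deliverable: a single one-word block, write-through no-allocate, using the 5-cycle miss latency from the assumptions quoted above and the Read/Write event types defined in the profiler code elsewhere in this commit.

#+BEGIN_SRC scala
// Warm-up sketch (not the required cache): one single-word block,
// write-through no-allocate, so only loads touch the cache.
def profileSingleBlock(events: List[MemoryEvent]): Int =
  events.collect { case Read(addr) => addr }
    .foldLeft((Option.empty[Int], 0)) { case ((cached, cycles), addr) =>
      if (cached.contains(addr)) (cached, cycles)   // hit: no added latency
      else (Some(addr), cycles + 5)                 // miss: refill the block, pay 5 cycles
    }._2
#+END_SRC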
** Your task
Your job is to implement a model that tests how many delay cycles will occur for a cache which:
+ Follows a 2-way associative scheme
+ Set size is 4 words (128 bits) (total cache size: a whopping 256 bits)
+ Block size is 1 word (32 bits), meaning that we *do not need a block offset*.
+ Is write-through write no-allocate (this means that you can ignore stores, only loads will affect the cache)
** TODO Your task
Your job is to implement a *parameterised* model that tests how many delay cycles will occur for a cache with
the following configuration:
+ Follows an n-way associative scheme (parameter)
+ Is write-through write allocate.
+ Eviction policy is LRU (least recently used)

In the typical cache each block has more than 32 bits, requiring an offset, however the
simulated cache does not.
This means that the simulated cache has two sets of 4 words, greatly reducing the complexity
of your implementation.

Additionally, assume that writes do not change the LRU counter.
This means that your cache will only consider which value was most recently loaded,
not written.
It's not realistic, but it allows you to completely disregard write events (you can
just filter them out if you want.)

Your answer should be the number of cache miss latency cycles when using this cache.

*** Further study
If you have the time I strongly encourage you to experiment with a larger cache with bigger
block sizes, forcing you to implement the additional complexity of block offsets.
Likewise, by trying a different scheme than write-through no-allocate you will get a much
better grasp on how exactly the cache works.
This is *not* a deliverable, just something I encourage you to tinker with to get a better
understanding.
To make this task easier a data structure with stub methods has been implemented for you.

Answer by pasting the output from running the branchProfiler test.